source: trunk/src/helpers/textv_html.c@ 19

Last change on this file since 19 was 14, checked in by umoeller, 25 years ago

Major updates; timers, LVM, miscellaneous.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 61.5 KB
Line 
1
2/*
3 *@@sourcefile textv_html.c:
4 * this code converts HTML code to escape sequences for the
5 * XTextView control (textview.c).
6 *
7 * This code is in part ugly spaghetti, but this is intentional to
8 * make this HTML parser FAST. In general, you get about double or
9 * triple the speed compared to Netscape 4.6 on OS/2. This code
10 * doesn't understand all of HTML though, but you get most of HTML 2.
11 * There's no tables or frames at this point.
12 *
13 * The entry point into this mess is txvConvertFromHTML, which
14 * is easy to use.
15 *
16 * Note: Version numbering in this file relates to XWorkplace version
17 * numbering.
18 *
19 *@@header "helpers\textv_html.h"
20 *
21 *@@added V0.9.3 (2000-05-10) [umoeller]
22 */
23
24/*
25 * Copyright (C) 2000 Ulrich Möller.
26 * This program is part of the XWorkplace package.
27 * This program is free software; you can redistribute it and/or modify
28 * it under the terms of the GNU General Public License as published by
29 * the Free Software Foundation, in version 2 as it comes in the COPYING
30 * file of the XWorkplace main distribution.
31 * This program is distributed in the hope that it will be useful,
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 * GNU General Public License for more details.
35 */
36
37#define OS2EMX_PLAIN_CHAR
38 // this is needed for "os2emx.h"; if this is defined,
39 // emx will define PSZ as _signed_ char, otherwise
40 // as unsigned char
41
42#include <os2.h>
43
44#include <stdlib.h>
45#include <stdio.h>
46#include <string.h>
47
48#include "setup.h" // code generation and debugging options
49
50#include "helpers\linklist.h"
51#include "helpers\stringh.h"
52#include "helpers\textview.h"
53
54#include "helpers\textv_html.h"
55
56/*
57 *@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58 */
59
60/* ******************************************************************
61 *
62 * Declarations
63 *
64 ********************************************************************/
65
66/*
67 *@@ LISTDESC:
68 * structure stored in COPYTARGET to
69 * hold list information (UL, OL, ... tags).
70 *
71 *@@added V0.9.3 (2000-05-07) [umoeller]
72 */
73
74typedef struct _LISTDESC
75{
76 ULONG ulListType; // 0: unordered (UL)
77 // 1: ordered (OL)
78 // 2: definition lists (DL)
79 ULONG ulItem; // list enumeration; 1 on first item,
80 // 2 on next, ...
81} LISTDESC, *PLISTDESC;
82
83/*
84 *@@ COPYTARGET:
85 * monster structure which holds the current
86 * status of the HTML converter while conversion
87 * is taking place. This stores input/output pointers
88 * and various flags to avoid duplicate line breaks
89 * and such.
90 *
91 * One instance of this is created in txvConvertFromHTML
92 * on the stack and then passed to all the sub-function
93 * calls.
94 *
95 *@@added V0.9.3 (2000-05-06) [umoeller]
96 */
97
98typedef struct _COPYTARGET
99{
100 PSZ pSource; // ptr into source string;
101 // valid ONLY while we're in a tag handler
102 PSZ pNewSource; // can be set by tag handler to skip characters;
103 // this is set to NULL before calling a tag
104 // handler; if this is still NULL, default
105 // processing occurs
106
107 // new string:
108 PSZ pszNew; // memory buffer
109 ULONG cbNew; // size of buffer (reallocated)
110 PSZ pTarget; // current char ptr into pszNew
111
112 // saved character while tag handler is being called
113 CHAR cSaved;
114
115 PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
116
117 // formatting flags while going through the text
118 BOOL fSkipNextSpace;
119 // if TRUE, subsequent spaces are skipped
120 BOOL fNeedsLinebreak;
121 // if TRUE, \n is inserted before any other character
122 BOOL fSkipNextLinebreak;
123 // if TRUE, subsequent linebreaks are skipped
124 BOOL fPRE;
125 // are we currently in a PRE tag?
126 BOOL fInLink;
127 // are we currently in a A HREF= tag?
128
129 // arguments (attributes) for tag handlers
130 PSZ pszAttributes; // != NULL while a tag handler is being called
131 // and attributes exist for the tag
132
133 // anchors count
134 USHORT usAnchorIndex; // start with 1
135
136 // list maintenance
137 ULONG ulListLevel; // if > 0, we're in a UL or OL block;
138 // raised for each block
139 ULONG ulUnorderedListLevel; // raised with each UL block to keep track
140 // of bullets
141 ULONG ulOrderedListLevel; // raised with each UL block to keep track
142 // of 1), 2), a), b)... numbering
143 ULONG ulCurrentListType; // current list type (from highest LISTDESC)
144 BOOL fInDT; // TRUE if we're currently in a DT tag
145 LINKLIST llLists; // stack of LISTDESC items
146} COPYTARGET, *PCOPYTARGET;
147
148typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
149typedef FNPROCESSTAG *PFNPROCESSTAG;
150
151/* ******************************************************************
152 *
153 * Global variables
154 *
155 ********************************************************************/
156
157/* ******************************************************************
158 *
159 * Append-char helpers
160 *
161 ********************************************************************/
162
163#define COPYTARGETALLOC 100000
164
165/*
166 *@@ AppendChar:
167 * helper for txvConvertFromHTML to
168 * append a char to the target string
169 * in COPYTARGET.
170 * This performs a few additional checks
171 * and manages memory.
172 *
173 *@@added V0.9.3 (2000-05-06) [umoeller]
174 */
175
176VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
177 unsigned char c)
178{
179 // calculate ofs where to store next char
180 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
181 if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
182 {
183 // more mem needed:
184 pct->cbNew += COPYTARGETALLOC;
185 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
186 // if first call, pszNew is NULL, and realloc
187 // behaves just like malloc
188 // adjust target, because ptr might have changed
189 pct->pTarget = pct->pszNew + cbOfsNext;
190 }
191
192 // append character
193 *pct->pTarget++ = c;
194}
195
196/*
197 *@@ AppendString:
198 * appends the characters in *ach,
199 * which must be null-terminated.
200 * Does NOT append a null character though.
201 *
202 *@@added V0.9.3 (2000-05-06) [umoeller]
203 */
204
205VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
206 char *ach)
207{
208 ULONG cbAppend = strlen(ach);
209 ULONG ul;
210 PSZ pSource;
211
212 // calculate ofs where to store next char
213 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
214 while (cbOfsNext + cbAppend >= pct->cbNew)
215 {
216 // more mem needed:
217 pct->cbNew += COPYTARGETALLOC;
218 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
219 // if first call, pszNew is NULL, and realloc
220 // behaves just like malloc
221 // adjust target, because ptr might have changed
222 pct->pTarget = pct->pszNew + cbOfsNext;
223 }
224
225 // append characters
226 pSource = ach;
227 for (ul = 0;
228 ul < cbAppend;
229 ul++)
230 *pct->pTarget++ = *pSource++;
231}
232
233/*
234 *@@ AppendLinebreakCheck:
235 * checks if a linebreak is needed and
236 * inserts one if so.
237 *
238 *@@added V0.9.3 (2000-05-17) [umoeller]
239 */
240
241VOID AppendLinebreakCheck(PCOPYTARGET pct)
242{
243 if ((!pct->fPRE) && (pct->fNeedsLinebreak))
244 {
245 // yes: insert linebreak; this resets pct->fNeedsLinebreak
246 if (!pct->fSkipNextLinebreak)
247 {
248 AppendChar(pct, '\n');
249
250 if ((pct->ulListLevel) && (!pct->fInDT))
251 // if we're in a list, add a tab also,
252 // because we'll have a negative first-line margin
253 AppendString(pct, TXVESC_TAB);
254 }
255 pct->fNeedsLinebreak = FALSE;
256 }
257}
258
259/*
260 *@@ AppendEscapeWithDecimal:
261 * appends the specified escape code
262 * with a three-digit decimal parameter.
263 * Calls AppendString in turn.
264 *
265 *@@added V0.9.3 (2000-05-07) [umoeller]
266 */
267
268VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
269 char *ach,
270 USHORT us)
271{
272 CHAR szDecimal[10];
273 if (us > 999)
274 us = 999;
275 sprintf(szDecimal, "%03d", us);
276 // append escape
277 AppendString(pct, ach);
278 AppendString(pct, szDecimal);
279}
280
281/*
282 *@@ AppendEscapeWith4Decimals:
283 *
284 *@@added V0.9.3 (2000-05-07) [umoeller]
285 */
286
287VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
288 char *ach,
289 USHORT us)
290{
291 CHAR szDecimal[10];
292 if (us > 9999)
293 us = 9999;
294 sprintf(szDecimal, "%04d", us);
295 // append escape
296 AppendString(pct, ach);
297 AppendString(pct, szDecimal);
298}
299
300/* ******************************************************************
301 *
302 * Tag converter functions
303 *
304 ********************************************************************/
305
306/*
307 *@@ StartList:
308 * starts a list (UL or OL).
309 * This uses a linked list in COPYTARGET
310 * to keep a pseudo-stack for nested lists.
311 *
312 *@@added V0.9.3 (2000-05-08) [umoeller]
313 */
314
315VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
316 ULONG ulListType) // list type:
317 // 0: unordered (UL)
318 // 1: ordered (OL)
319 // 2: definition lists (DL)
320{
321 PLISTDESC pListDesc;
322
323 // raise list level
324 pct->ulListLevel++;
325
326 if (ulListType == 0)
327 // unordered:
328 pct->ulUnorderedListLevel++;
329 else if (ulListType == 1)
330 // ordered:
331 pct->ulOrderedListLevel++;
332
333 // create LISTDESC and store it on stack
334 pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
335 pListDesc->ulListType
336 = pct->ulCurrentListType
337 = ulListType;
338 pListDesc->ulItem = 1;
339
340 lstAppendItem(&pct->llLists,
341 pListDesc);
342
343 AppendEscapeWith4Decimals(pct,
344 TXVESC_LEFTMARGIN,
345 pct->ulListLevel * 5);
346 AppendEscapeWith3Decimals(pct,
347 TXVESC_FIRSTLINEMARGIN_LEFT,
348 (ulListType == 2)
349 ? 5 // for definition lists
350 : 3); // negative!
351 // add \n before any other character
352 pct->fNeedsLinebreak = TRUE;
353}
354
355/*
356 *@@ StopList:
357 * stops a list (UL or OL).
358 *
359 *@@added V0.9.3 (2000-05-07) [umoeller]
360 */
361
362VOID StopList(PCOPYTARGET pct)
363{
364 if (pct->ulListLevel)
365 {
366 PLISTNODE pNode;
367
368 // lower list level
369 pct->ulListLevel--;
370 AppendEscapeWith4Decimals(pct,
371 TXVESC_LEFTMARGIN,
372 pct->ulListLevel * 5);
373 AppendEscapeWith3Decimals(pct,
374 TXVESC_FIRSTLINEMARGIN_LEFT,
375 (pct->ulListLevel)
376 ? 3 // we still have a list level (nested)
377 : 0);
378 pct->fNeedsLinebreak = TRUE;
379
380 // remove the LISTDESC from the stack
381 pNode = lstNodeFromIndex(&pct->llLists,
382 pct->ulListLevel); // this has been lowered already
383 if (pNode)
384 {
385 PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
386 if (pListDesc->ulListType == 0)
387 // was unordered:
388 pct->ulUnorderedListLevel--;
389 else if (pListDesc->ulListType == 1)
390 // was ordered:
391 pct->ulOrderedListLevel--;
392
393 lstRemoveNode(&pct->llLists, pNode);
394
395 // update COPYTARGET with previous list level
396 if (pct->ulListLevel)
397 {
398 // we're still in a list (nested lists):
399 PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
400 pct->ulListLevel - 1);
401 if (pListDesc2)
402 pct->ulCurrentListType = pListDesc2->ulListType;
403 }
404 }
405 }
406 // else: buggy HTML code, ignore
407}
408
409/*
410 *@@ TagTITLE:
411 *
412 *@@added V0.9.3 (2000-05-19) [umoeller]
413 */
414
415VOID TagTITLE(PCOPYTARGET pct)
416{
417 // pSource currently points to <TITLE tag
418 PSZ pSource = pct->pSource + strlen(pct->pSource);
419 // points to temporary null byte in main buffer now
420 *pSource = pct->cSaved;
421
422 pSource = strchr(pct->pSource, '>');
423 if (pSource)
424 {
425 PSZ pNextOpen = strchr(pSource, '<');
426 if (pNextOpen)
427 {
428 // extract title
429 pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
430
431 if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
432 {
433 // closing /TITLE tag found:
434 // search on after that
435 pct->pNewSource = strchr(pNextOpen, '>');
436 if (pct->pNewSource)
437 pct->pNewSource++;
438 }
439 }
440 }
441}
442
443/*
444 *@@ TagP:
445 *
446 */
447
448VOID TagP(PCOPYTARGET pct)
449{
450 // append newline:
451 // add \n before any other character
452 pct->fNeedsLinebreak = TRUE;
453
454 /* if (pct->ulListLevel)
455 {
456 // if we are currently in a list, we must also
457 // add a tab escape, because we have set
458 // the first line margin to the left of the
459 // left margin
460 AppendString(pct,
461 TXVESC_TAB);
462 } */
463}
464
465VOID TagBR(PCOPYTARGET pct)
466{
467 AppendChar(pct,
468 '\r');
469
470 if (pct->ulListLevel)
471 {
472 // if we are currently in a list, we must also
473 // add a tab escape, because we have set
474 // the first line margin to the left of the
475 // left margin
476 AppendString(pct,
477 TXVESC_TAB);
478 }
479 if (!pct->fPRE)
480 pct->fSkipNextSpace = TRUE;
481}
482
483VOID TagPRE(PCOPYTARGET pct)
484{
485 // start of PRE tag:
486 // add \n before any other character
487 // pct->fNeedsLinebreak = TRUE;
488 AppendChar(pct, '\n');
489 pct->fNeedsLinebreak = FALSE;
490 /* AppendString(pct,
491 TXVESC_PRE_BEGIN); */
492 AppendEscapeWith3Decimals(pct,
493 TXVESC_SET_FONT,
494 1); // monospaced font
495 AppendEscapeWith4Decimals(pct,
496 TXVESC_SPACEBEFORE,
497 0); // no spacing before
498 AppendEscapeWith4Decimals(pct,
499 TXVESC_SPACEAFTER,
500 0); // no spacing after
501 // disable word-wrapping
502 AppendString(pct,
503 TXVESC_WORDWRAP "0");
504 pct->fPRE = TRUE;
505 pct->fSkipNextSpace = FALSE;
506}
507
508VOID TagXPRE(PCOPYTARGET pct)
509{
510 pct->fPRE = FALSE;
511 AppendEscapeWith3Decimals(pct,
512 TXVESC_SET_FONT,
513 0); // standard font
514 AppendString(pct, TXVESC_SPACEBEFORE);
515 AppendString(pct, "####"); // reset to default
516 AppendString(pct, TXVESC_SPACEAFTER);
517 AppendString(pct, "####"); // reset to default
518 // re-enable word-wrapping
519 AppendString(pct,
520 TXVESC_WORDWRAP "1"
521 "\n"); // force line break
522 pct->fNeedsLinebreak = FALSE;
523 // refuse to add \n even if we have another "p" coming up
524 pct->fSkipNextLinebreak = TRUE;
525 pct->fSkipNextSpace = TRUE;
526}
527
528VOID TagH1(PCOPYTARGET pct)
529{
530 pct->fNeedsLinebreak = TRUE;
531 AppendEscapeWith3Decimals(pct,
532 TXVESC_POINTSIZE_REL,
533 200); // double size
534 AppendString(pct,
535 TXVESC_BOLD_BEGIN);
536}
537
538VOID TagXH1(PCOPYTARGET pct)
539{
540 AppendString(pct,
541 TXVESC_BOLD_END);
542 AppendEscapeWith3Decimals(pct,
543 TXVESC_POINTSIZE_REL,
544 100); // regular size
545 // add \n before any other character
546 pct->fNeedsLinebreak = TRUE;
547}
548
549VOID TagH2(PCOPYTARGET pct)
550{
551 pct->fNeedsLinebreak = TRUE;
552 AppendEscapeWith3Decimals(pct,
553 TXVESC_POINTSIZE_REL,
554 175); // size in percent of regular point size
555 AppendString(pct,
556 TXVESC_BOLD_BEGIN);
557}
558
559VOID TagXH2(PCOPYTARGET pct)
560{
561 AppendString(pct,
562 TXVESC_BOLD_END);
563 AppendEscapeWith3Decimals(pct,
564 TXVESC_POINTSIZE_REL,
565 100); // regular size
566 // add \n before any other character
567 pct->fNeedsLinebreak = TRUE;
568}
569
570VOID TagH3(PCOPYTARGET pct)
571{
572 pct->fNeedsLinebreak = TRUE;
573 AppendEscapeWith3Decimals(pct,
574 TXVESC_POINTSIZE_REL,
575 150); // size in percent of regular point size
576 AppendString(pct,
577 TXVESC_BOLD_BEGIN);
578}
579
580VOID TagXH3(PCOPYTARGET pct)
581{
582 AppendString(pct,
583 TXVESC_BOLD_END);
584 AppendEscapeWith3Decimals(pct,
585 TXVESC_POINTSIZE_REL,
586 100); // size in percent of regular point size
587 // add \n before any other character
588 pct->fNeedsLinebreak = TRUE;
589}
590
591VOID TagH4(PCOPYTARGET pct)
592{
593 pct->fNeedsLinebreak = TRUE;
594 AppendEscapeWith3Decimals(pct,
595 TXVESC_POINTSIZE_REL,
596 125); // size in percent of regular point size
597 AppendString(pct,
598 TXVESC_BOLD_BEGIN);
599}
600
601VOID TagXH4(PCOPYTARGET pct)
602{
603 AppendString(pct,
604 TXVESC_BOLD_END);
605 AppendEscapeWith3Decimals(pct,
606 TXVESC_POINTSIZE_REL,
607 100); // regular size
608 // add \n before any other character
609 pct->fNeedsLinebreak = TRUE;
610}
611
612VOID TagH5(PCOPYTARGET pct)
613{
614 pct->fNeedsLinebreak = TRUE;
615 AppendEscapeWith3Decimals(pct,
616 TXVESC_POINTSIZE_REL,
617 100); // size in percent of regular point size
618 AppendString(pct,
619 TXVESC_BOLD_BEGIN);
620}
621
622VOID TagXH5(PCOPYTARGET pct)
623{
624 AppendString(pct,
625 TXVESC_BOLD_END);
626 AppendEscapeWith3Decimals(pct,
627 TXVESC_POINTSIZE_REL,
628 100); // regular size
629 // add \n before any other character
630 pct->fNeedsLinebreak = TRUE;
631}
632
633VOID TagH6(PCOPYTARGET pct)
634{
635 pct->fNeedsLinebreak = TRUE;
636 AppendEscapeWith3Decimals(pct,
637 TXVESC_POINTSIZE_REL,
638 80 ); // size in percent of regular point size
639 AppendString(pct,
640 TXVESC_BOLD_BEGIN);
641}
642
643VOID TagXH6(PCOPYTARGET pct)
644{
645 AppendString(pct,
646 TXVESC_BOLD_END);
647 AppendEscapeWith3Decimals(pct,
648 TXVESC_POINTSIZE_REL,
649 100); // regular size
650 // add \n before any other character
651 pct->fNeedsLinebreak = TRUE;
652}
653
654VOID TagUL(PCOPYTARGET pct)
655{
656 StartList(pct,
657 0); // unordered
658}
659
660VOID TagXUL(PCOPYTARGET pct)
661{
662 StopList(pct);
663}
664
665VOID TagOL(PCOPYTARGET pct)
666{
667 StartList(pct,
668 1); // ordered
669}
670
671VOID TagXOL(PCOPYTARGET pct)
672{
673 StopList(pct);
674}
675
676VOID TagLI(PCOPYTARGET pct)
677{
678 PLISTDESC pListDesc;
679 CHAR szMarker[20] = TXVESC_MARKER "\x01";
680
681 if (pct->ulListLevel)
682 {
683 // we're in a list:
684 pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
685 pct->ulListLevel - 1);
686 if (pListDesc)
687 {
688 if (pListDesc->ulListType == 1)
689 // is ordered list:
690 sprintf(szMarker, "%lu.", (pListDesc->ulItem)++);
691 else if (pListDesc->ulListType == 0)
692 // is unordered list:
693 // set bullet type according to unordered nesting
694 szMarker[2] = pct->ulUnorderedListLevel;
695 }
696 }
697
698 // add \n before any other character
699 // pct->fNeedsLinebreak = TRUE;
700 // if (pct->fNeedsLinebreak)
701 {
702 AppendChar(pct, '\n');
703 pct->fNeedsLinebreak = FALSE;
704 }
705
706 AppendString(pct, szMarker);
707 AppendString(pct, TXVESC_TAB);
708}
709
710VOID TagDL(PCOPYTARGET pct)
711{
712 StartList(pct,
713 2); // definition list
714}
715
716VOID TagXDL(PCOPYTARGET pct)
717{
718 StopList(pct);
719 pct->fInDT = FALSE;
720}
721
722VOID TagDT(PCOPYTARGET pct)
723{
724 pct->fNeedsLinebreak = TRUE;
725 pct->fInDT = TRUE;
726}
727
728VOID TagDD(PCOPYTARGET pct)
729{
730 pct->fNeedsLinebreak = TRUE;
731 AppendString(pct, TXVESC_TAB);
732 if (!pct->fPRE)
733 pct->fSkipNextSpace = TRUE;
734 pct->fInDT = FALSE;
735}
736
737VOID TagTR(PCOPYTARGET pct)
738{
739 pct->fNeedsLinebreak = TRUE;
740}
741
742VOID TagB(PCOPYTARGET pct)
743{
744 AppendString(pct,
745 TXVESC_BOLD_BEGIN);
746}
747
748VOID TagXB(PCOPYTARGET pct)
749{
750 AppendString(pct,
751 TXVESC_BOLD_END);
752}
753
754VOID TagI(PCOPYTARGET pct)
755{
756 AppendString(pct,
757 TXVESC_ITALICS_BEGIN);
758}
759
760VOID TagXI(PCOPYTARGET pct)
761{
762 AppendString(pct,
763 TXVESC_ITALICS_END);
764}
765
766VOID TagU(PCOPYTARGET pct)
767{
768 AppendString(pct,
769 TXVESC_UNDERLINE_BEGIN);
770}
771
772VOID TagXU(PCOPYTARGET pct)
773{
774 AppendString(pct,
775 TXVESC_UNDERLINE_END);
776}
777
778VOID TagSTRIKE(PCOPYTARGET pct)
779{
780 AppendString(pct,
781 TXVESC_STRIKE_BEGIN);
782}
783
784VOID TagXSTRIKE(PCOPYTARGET pct)
785{
786 AppendString(pct,
787 TXVESC_STRIKE_END);
788}
789
790VOID TagCODE(PCOPYTARGET pct)
791{
792 AppendEscapeWith3Decimals(pct,
793 TXVESC_SET_FONT,
794 1); // monospaced font
795}
796
797VOID TagXCODE(PCOPYTARGET pct)
798{
799 AppendEscapeWith3Decimals(pct,
800 TXVESC_SET_FONT,
801 0); // regular font
802}
803
804VOID TagA(PCOPYTARGET pct)
805{
806 CHAR szAnchor[10];
807
808 pct->fInLink = FALSE;
809
810 if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
811 {
812 // we have attributes:
813 PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
814 if (pszClosingTag)
815 {
816 ULONG ulOfs = 0;
817
818 /*
819 * HREF attribute:
820 *
821 */
822
823 PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
824 pNAME = 0;
825
826 // replace '>' with null char to mark end of search
827 *pszClosingTag = 0;
828
829 if (pHREF)
830 {
831 // OK, we got a link target:
832 // create a link item and append it to the output list
833 PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
834 memset(pNewLink, 0, sizeof(XHTMLLINK));
835
836 pct->fInLink = TRUE;
837
838 // this starts with anchor 1
839 pNewLink->usLinkIndex = ++pct->usAnchorIndex;
840 pNewLink->pszTargetFile = pHREF;
841 // do not free
842 lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
843 }
844
845 /*
846 * NAME attribute:
847 *
848 */
849
850 pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
851 if (pNAME)
852 {
853 AppendString(pct,
854 TXVESC_ANCHORNAME);
855 AppendString(pct,
856 pNAME);
857 // must be terminated with 0xFF
858 AppendChar(pct, 0xFF);
859 free(pNAME);
860 }
861 // restore '>'
862 *pszClosingTag = '>';
863 }
864 }
865
866 if (pct->fInLink)
867 {
868 sprintf(szAnchor, "%04hX", pct->usAnchorIndex);
869 AppendString(pct,
870 TXVESC_LINK);
871 AppendString(pct,
872 szAnchor);
873 }
874}
875
876VOID TagXA(PCOPYTARGET pct)
877{
878 if (pct->fInLink)
879 {
880 AppendString(pct,
881 TXVESC_LINK "####");
882 pct->fInLink = FALSE;
883 }
884}
885
886/* ******************************************************************
887 *
888 * Tag helpers
889 *
890 ********************************************************************/
891
892/*
893 *@@ FindTagProcessor:
894 * returns the Tag* function which handles the
895 * given tag or NULL if there's none.
896 *
897 *@@added V0.9.4 (2000-06-10) [umoeller]
898 */
899
900PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
901{
902 PFNPROCESSTAG pProcessor = NULL;
903
904 CHAR c0,
905 c1;
906
907 BOOL fEndOfTag = FALSE;
908
909 PSZ pCheck = pszTag,
910 p2;
911 if (*pCheck == '/')
912 {
913 // end of tag:
914 fEndOfTag = TRUE;
915 pCheck++;
916 }
917
918 c0 = *pCheck;
919 c1 = *(pCheck + 1);
920
921 p2 = pCheck + 2;
922
923 switch (c0)
924 {
925 case 'A':
926 case 'a':
927 switch (c1)
928 {
929 case 0: // A
930 if (!fEndOfTag)
931 return TagA;
932 else
933 return TagXA;
934 case 'D': // ADDRESS
935 case 'd': // ADDRESS
936 if (stricmp(p2, "DRESS") == 0)
937 {
938 if (!fEndOfTag)
939 return TagI;
940 else
941 return TagXI;
942 }
943 }
944 break;
945
946 case 'B':
947 case 'b':
948 switch (c1)
949 {
950 case 0:
951 if (!fEndOfTag)
952 return TagB;
953 else
954 return TagXB;
955
956 case 'R': // BR
957 case 'r': // BR
958 if (*p2 == 0)
959 if (!fEndOfTag)
960 return TagBR;
961 }
962 break;
963
964 case 'C':
965 case 'c':
966 switch (c1)
967 {
968 case 'I': // CITE
969 case 'i': // CITE
970 if (stricmp(p2, "TE") == 0)
971 {
972 if (!fEndOfTag)
973 return TagI;
974 else
975 return TagXI;
976 }
977 break;
978
979 case 'O':
980 case 'o':
981 if (stricmp(p2, "DE") == 0)
982 {
983 if (!fEndOfTag)
984 return TagCODE;
985 else
986 return TagXCODE;
987 }
988 break;
989 }
990 break;
991
992 case 'D':
993 case 'd':
994 switch (c1)
995 {
996 case 'D': // DD
997 case 'd': // DD
998 if ((*p2 == 0) && (!fEndOfTag))
999 return (TagDD);
1000 break;
1001
1002 case 'I': // DIR
1003 case 'i': // DIR
1004 if (*p2 == 'R')
1005 if (*(pCheck + 3) == 0)
1006 {
1007 if (!fEndOfTag)
1008 return TagUL;
1009 else
1010 return TagXUL;
1011 }
1012 break;
1013
1014 case 'L': // DL
1015 case 'l': // DL
1016 if (*p2 == 0)
1017 {
1018 if (!fEndOfTag)
1019 return TagDL;
1020 else
1021 return TagXDL;
1022 }
1023 break;
1024
1025 case 'T': // DT
1026 case 't': // DT
1027 if ((*p2 == 0) && (!fEndOfTag))
1028 return TagDT;
1029 break;
1030 }
1031 break;
1032
1033 case 'E':
1034 case 'e':
1035 if ( (c1 == 'M') || (c1 == 'm') ) // EM
1036 if (*p2 == 0)
1037 {
1038 if (!fEndOfTag)
1039 return TagI;
1040 else
1041 return TagXI;
1042 }
1043 break;
1044
1045 case 'H':
1046 case 'h':
1047 if (c1)
1048 if (*p2 == 0)
1049 switch (c1)
1050 {
1051 case '1':
1052 if (!fEndOfTag)
1053 return TagH1;
1054 else
1055 return TagXH1;
1056 case '2':
1057 if (!fEndOfTag)
1058 return TagH2;
1059 else
1060 return TagXH2;
1061 case '3':
1062 if (!fEndOfTag)
1063 return TagH3;
1064 else
1065 return TagXH3;
1066 case '4':
1067 if (!fEndOfTag)
1068 return TagH4;
1069 else
1070 return TagXH4;
1071 case '5':
1072 if (!fEndOfTag)
1073 return TagH5;
1074 else
1075 return TagXH5;
1076 case '6':
1077 if (!fEndOfTag)
1078 return TagH6;
1079 else
1080 return TagXH6;
1081 }
1082 break;
1083
1084 case 'I':
1085 case 'i':
1086 if (c1 == 0)
1087 {
1088 if (!fEndOfTag)
1089 return TagI;
1090 else
1091 return TagXI;
1092 }
1093 break;
1094
1095 case 'L':
1096 case 'l':
1097 if ((c1 == 'I') || (c1 == 'i'))
1098 if (*p2 == 0)
1099 return TagLI;
1100 break;
1101
1102 case 'M':
1103 case 'm':
1104 if (stricmp(p2, "NU") == 0)
1105 {
1106 if (!fEndOfTag)
1107 return TagUL;
1108 else
1109 return TagXUL;
1110 }
1111 break;
1112
1113 case 'O':
1114 case 'o':
1115 if ((c1 == 'L') || (c1 == 'l'))
1116 if (*p2 == 0)
1117 {
1118 if (!fEndOfTag)
1119 return TagOL;
1120 else
1121 return TagXOL;
1122 }
1123 break;
1124
1125 case 'P':
1126 case 'p':
1127 switch (c1)
1128 {
1129 case 0:
1130 if (!fEndOfTag)
1131 return TagP;
1132 break;
1133
1134 case 'R': // PRE
1135 case 'r': // PRE
1136 if ((*p2 == 'E') || (*p2 == 'e'))
1137 if (*(pCheck + 3) == 0)
1138 {
1139 if (!fEndOfTag)
1140 return TagPRE;
1141 else
1142 return TagXPRE;
1143 }
1144 break;
1145 }
1146 break;
1147
1148 case 'S':
1149 case 's':
1150 switch (c1)
1151 {
1152 case 'T': // STRONG
1153 case 't': // STRONG
1154 if (stricmp(p2, "RONG") == 0)
1155 {
1156 if (!fEndOfTag)
1157 return TagB;
1158 else
1159 return TagXB;
1160 }
1161 else if (stricmp(p2, "RIKE") == 0)
1162 {
1163 if (!fEndOfTag)
1164 return TagSTRIKE;
1165 else
1166 return TagXSTRIKE;
1167 }
1168 break;
1169
1170 case 'A':
1171 case 'a':
1172 if (stricmp(p2, "MP") == 0)
1173 {
1174 if (!fEndOfTag)
1175 return TagCODE;
1176 else
1177 return TagXCODE;
1178 }
1179 break;
1180 }
1181 break;
1182
1183 case 'T':
1184 case 't':
1185 switch (c1)
1186 {
1187 case 'R':
1188 case 'r':
1189 if (*p2 == 0)
1190 return TagTR;
1191 break;
1192
1193 case 'I':
1194 case 'i':
1195 if (stricmp(p2, "TLE") == 0)
1196 return TagTITLE;
1197 break;
1198
1199 case 'T': // TT
1200 case 't':
1201 if (*p2 == 0)
1202 {
1203 if (!fEndOfTag)
1204 return TagCODE;
1205 else
1206 return TagXCODE;
1207 }
1208 break;
1209 }
1210 break;
1211
1212 case 'U':
1213 case 'u':
1214 switch (c1)
1215 {
1216 case 0:
1217 if (!fEndOfTag)
1218 return TagU;
1219 else
1220 return TagXU;
1221
1222 case 'L':
1223 case 'l':
1224 if (*p2 == 0)
1225 {
1226 if (!fEndOfTag)
1227 return TagUL;
1228 else
1229 return TagXUL;
1230 }
1231 break;
1232 }
1233 break;
1234
1235 case 'V':
1236 case 'v':
1237 if (stricmp(p2, "R") == 0)
1238 {
1239 if (!fEndOfTag)
1240 return TagI;
1241 else
1242 return TagXI;
1243 }
1244 break;
1245
1246 case 'X':
1247 case 'x':
1248 if (stricmp(p2, "MP") == 0) // XMP
1249 {
1250 if (!fEndOfTag)
1251 return TagPRE;
1252 else
1253 return TagXPRE;
1254 }
1255 break;
1256 }
1257
1258 return (pProcessor);
1259}
1260
1261/*
1262 *@@ HandleTag:
1263 * called by txvConvertFromHTML when a "<" character
1264 * is found in the source buffer. This calls
1265 * FindTagProcessor in turn to find the Tag*
1266 * function which handles the tag.
1267 *
1268 *@@added V0.9.3 (2000-05-18) [umoeller]
1269 */
1270
1271VOID HandleTag(PCOPYTARGET pct)
1272{
1273 PSZ pStartOfTag = pct->pSource;
1274 // '<' == begin of tag:
1275
1276 // is it a comment? <!-- ... -->
1277 if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1278 {
1279 // start of comment:
1280 // find end of comment
1281 PSZ pEnd = strstr(pStartOfTag, "-->");
1282 if (pEnd)
1283 // found:
1284 // search on after end of comment
1285 pct->pSource = pEnd + 3;
1286 else
1287 {
1288 // end of comment not found:
1289 // stop formatting...
1290 pct->pSource++;
1291 return;
1292 }
1293 }
1294 else
1295 {
1296 // no comment:
1297 // find end of tag
1298 PSZ p2 = pStartOfTag + 1,
1299 pNextClose = 0, // receives first '>' after '<'
1300 pNextSpace = 0; // receives first ' ' after '<'
1301 BOOL fCont = TRUE;
1302 while (fCont)
1303 {
1304 switch (*p2)
1305 {
1306 case ' ':
1307 case '\r':
1308 case '\n':
1309 // store first space after '<'
1310 if (!pNextSpace)
1311 pNextSpace = p2;
1312 // overwrite line breaks with spaces;
1313 // otherwise we cannot handle tags which go across
1314 // several lines, which is valid HTML
1315 *p2 = ' ';
1316 break;
1317
1318 case '>': // end of tag found:
1319 pNextClose = p2;
1320 fCont = FALSE;
1321 break;
1322
1323 case '<':
1324 // another opening tag:
1325 // that's an HTML error
1326 AppendChar(pct,
1327 *pct->pSource++);
1328 fCont = FALSE;
1329 break;
1330
1331 case 0:
1332 fCont = FALSE;
1333 break;
1334 }
1335 p2++;
1336 }
1337
1338 if (pNextClose)
1339 {
1340 // end of tag found:
1341 ULONG cbTag;
1342 PSZ pStartOfAttrs = 0;
1343
1344 if ((pNextSpace) && (pNextSpace < pNextClose))
1345 {
1346 // we have attributes:
1347 cbTag = pNextSpace - (pStartOfTag + 1);
1348 pStartOfAttrs = pNextSpace;
1349 }
1350 else
1351 cbTag = pNextClose - (pStartOfTag + 1);
1352
1353 if (!cbTag)
1354 {
1355 // happens if we have a "<>" in the text:
1356 // just insert the '<>' and go on, we have no tag here
1357 AppendChar(pct,
1358 *pct->pSource++);
1359 AppendChar(pct,
1360 *pct->pSource++);
1361 }
1362 else
1363 {
1364 PFNPROCESSTAG pTagProcessor;
1365
1366 pct->cSaved = *(pStartOfTag + cbTag + 1);
1367 // add a null terminator
1368 *(pStartOfTag + cbTag + 1) = 0;
1369
1370 // find corresponding tag converter function
1371 // from G_TagProcessors map
1372 pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1373
1374 // restore char under null terminator
1375 *(pStartOfTag + cbTag + 1) = pct->cSaved;
1376
1377 // reset new source ptr; the tag handler
1378 // can modify this
1379 pct->pNewSource = NULL;
1380
1381 if (pTagProcessor)
1382 {
1383 // tag understood:
1384
1385 // terminate string after closing tag
1386 pct->cSaved = *(pNextClose + 1); // can be null byte!
1387 *(pNextClose + 1) = 0;
1388
1389 // did we have attributes?
1390 if (pNextSpace)
1391 pct->pszAttributes = pNextSpace;
1392
1393 // finally, call the tag handler
1394 (pTagProcessor) // function
1395 (pct); // argument
1396
1397 *(pNextClose + 1) = pct->cSaved;
1398 }
1399
1400 if (pct->pNewSource == NULL)
1401 // tag handler needs no special processing:
1402 // skip '>' too
1403 pct->pSource = pNextClose + 1;
1404 else
1405 // tag handler has skipped something:
1406 pct->pSource = pct->pNewSource;
1407 }
1408 }
1409 }
1410}
1411
1412/*
1413 *@@ ConvertEscape:
1414 * called by HandleEscape to find the ANSI (CP 1004)
1415 * character for the given escape sequence (pszTag).
1416 *
1417 * pszTag contains the stuff between "&" and ";".
1418 *
1419 * This is really ugly spaghetti, but it's the fastest
1420 * way to do it.
1421 *
1422 *@@added V0.9.4 (2000-06-10) [umoeller]
1423 */
1424
1425unsigned char ConvertEscape(PSZ pszTag)
1426{
1427 CHAR c0, c1;
1428 CHAR crc = 0;
1429
1430 PSZ p2 = pszTag + 2;
1431
1432 c0 = *pszTag;
1433 c1 = *(pszTag + 1);
1434
1435 switch (c0)
1436 {
1437 case 'a':
1438 switch (c1)
1439 {
1440 case 'a':
1441 if (strcmp(p2, "cute") == 0)
1442 return 225;
1443 break;
1444
1445 case 'c':
1446 if (strcmp(p2, "irc") == 0)
1447 return 226;
1448 else if (strcmp(p2, "ute") == 0)
1449 return 180;
1450 break;
1451
1452 case 'e':
1453 if (strcmp(p2, "lig") == 0)
1454 return 230;
1455 break;
1456
1457 case 'g':
1458 if (strcmp(p2, "rave") == 0)
1459 return 224;
1460 break;
1461
1462 case 'm':
1463 if (strcmp(p2, "p") == 0)
1464 return '&';
1465 break;
1466
1467 case 'r':
1468 if (strcmp(p2, "ing") == 0)
1469 return 229;
1470 break;
1471
1472 case 't':
1473 if (strcmp(p2, "ilde") == 0)
1474 return 227;
1475 break;
1476
1477 case 'u':
1478 if (strcmp(p2, "ml") == 0)
1479 return 228;
1480 break;
1481 }
1482 break;
1483
1484 case 'b':
1485 if (strcmp(pszTag + 1, "rvbar") == 0)
1486 return 166;
1487 break;
1488
1489 case 'c':
1490 switch (c1)
1491 {
1492 case 'c':
1493 if (strcmp(p2, "edil") == 0)
1494 return 231;
1495 break;
1496
1497 case 'e':
1498 if (strcmp(p2, "dil") == 0)
1499 return 184;
1500 else if (strcmp(p2, "nt") == 0)
1501 return 162;
1502 break;
1503
1504 case 'o':
1505 if (strcmp(p2, "py") == 0)
1506 return 169;
1507 break;
1508
1509 case 'u':
1510 if (strcmp(p2, "rren") == 0)
1511 return 164;
1512 }
1513 break;
1514
1515 case 'd':
1516 switch (c1)
1517 {
1518 case 'e':
1519 if (strcmp(p2, "g") == 0) return 176;
1520 break;
1521
1522 case 'i':
1523 if (strcmp(p2, "vide") == 0) return 247;
1524 break;
1525 }
1526 break;
1527
1528 case 'e':
1529 switch (c1)
1530 {
1531 case 'a':
1532 if (strcmp(p2, "cute") == 0) return 233;
1533 break;
1534
1535 case 'c':
1536 if (strcmp(p2, "irc") == 0) return 234;
1537 break;
1538
1539 case 'g':
1540 if (strcmp(p2, "rave") == 0) return 232;
1541 break;
1542
1543 case 't':
1544 if (strcmp(p2, "h") == 0) return 240;
1545 break;
1546
1547 case 'u':
1548 if (strcmp(p2, "ml") == 0) return 235;
1549 break;
1550 }
1551 break;
1552
1553 case 'f':
1554 switch (c1)
1555 {
1556 case 'r':
1557 if (strcmp(p2, "ac14") == 0) return 188;
1558 if (strcmp(p2, "ac12") == 0) return 189;
1559 if (strcmp(p2, "ac34") == 0) return 190;
1560 break;
1561 }
1562 break;
1563
1564 case 'g':
1565 switch (c1)
1566 {
1567 case 't':
1568 if (*p2 == 0) return '>';
1569 }
1570 break;
1571
1572 case 'i':
1573 switch (c1)
1574 {
1575 case 'a':
1576 if (strcmp(p2, "cute") == 0) return 237;
1577 break;
1578
1579 case 'c':
1580 if (strcmp(p2, "irc") == 0) return 238;
1581 break;
1582
1583 case 'g':
1584 if (strcmp(p2, "rave") == 0) return 236;
1585 break;
1586
1587 case 'e':
1588 if (strcmp(p2, "xcl") == 0) return 161;
1589 break;
1590
1591 case 'q':
1592 if (strcmp(p2, "uest") == 0) return 191;
1593 break;
1594
1595 case 'u':
1596 if (strcmp(p2, "ml") == 0) return 239;
1597 }
1598 break;
1599
1600 case 'l':
1601 switch (c1)
1602 {
1603 case 't':
1604 if (*p2 == 0)
1605 return '<';
1606 break;
1607
1608 case 'a':
1609 if (strcmp(p2, "quo") == 0) return 171;
1610 }
1611 break;
1612
1613 case 'm':
1614 switch (c1)
1615 {
1616 case 'a':
1617 if (strcmp(p2, "cr") == 0) return 175;
1618 break;
1619
1620 case 'i':
1621 if (strcmp(p2, "cro") == 0) return 181;
1622 if (strcmp(p2, "ddot") == 0) return 183;
1623 break;
1624 }
1625 break;
1626
1627 case 'n':
1628 switch (c1)
1629 {
1630 case 'b':
1631 if (strcmp(p2, "sp") == 0) return 160;
1632 break;
1633
1634 case 'o':
1635 if (strcmp(p2, "t") == 0) return 172;
1636 break;
1637
1638 case 't':
1639 if (strcmp(p2, "ilde") == 0) return 241;
1640 }
1641 break;
1642
1643 case 'o':
1644 switch (c1)
1645 {
1646 case 'a':
1647 if (strcmp(p2, "cute") == 0) return 243;
1648 break;
1649
1650 case 'c':
1651 if (strcmp(p2, "irc") == 0) return 244;
1652 break;
1653
1654 case 'g':
1655 if (strcmp(p2, "rave") == 0) return 242;
1656 break;
1657
1658 case 'r':
1659 if (strcmp(p2, "df") == 0) return 170;
1660 if (strcmp(p2, "dm") == 0) return 186;
1661 break;
1662
1663 case 's':
1664 if (strcmp(p2, "lash") == 0) return 248;
1665 break;
1666
1667 case 't':
1668 if (strcmp(p2, "ilde") == 0) return 245;
1669 break;
1670
1671 case 'u':
1672 if (strcmp(p2, "ml") == 0) return 246;
1673 }
1674 break;
1675
1676 case 'p':
1677 switch (c1)
1678 {
1679 case 'a':
1680 if (strcmp(p2, "ra") == 0) return 182;
1681 break;
1682
1683 case 'l':
1684 if (strcmp(p2, "usmn") == 0) return 177;
1685 break;
1686
1687 case 'o':
1688 if (strcmp(p2, "und") == 0) return 163;
1689 }
1690 break;
1691
1692 case 'q':
1693 if (strcmp(pszTag, "quot") == 0) return '"';
1694 break;
1695
1696 case 'r':
1697 if (strcmp(pszTag, "raquo") == 0) return 187;
1698 if (strcmp(pszTag, "reg") == 0) return 174;
1699 break;
1700
1701 case 's':
1702 switch (c1)
1703 {
1704 case 'z':
1705 if (strcmp(p2, "lig") == 0) return 223;
1706 break;
1707
1708 case 'e':
1709 if (strcmp(p2, "ct") == 0) return 167;
1710 break;
1711
1712 case 'h':
1713 if (strcmp(p2, "y") == 0) return 173;
1714 break;
1715
1716 case 'u':
1717 if (strcmp(p2, "p1") == 0) return 185;
1718 if (strcmp(p2, "p2") == 0) return 178;
1719 if (strcmp(p2, "p3") == 0) return 179;
1720 }
1721 break;
1722
1723 case 't':
1724 if (strcmp(pszTag, "thorn") == 0) return 254;
1725 if (strcmp(pszTag, "times") == 0) return 215;
1726 break;
1727
1728 case 'u':
1729 switch (c1)
1730 {
1731 case 'a':
1732 if (strcmp(p2, "cute") == 0) return 250;
1733 break;
1734
1735 case 'c':
1736 if (strcmp(p2, "irc") == 0) return 251;
1737 break;
1738
1739 case 'g':
1740 if (strcmp(p2, "rave") == 0) return 249;
1741 break;
1742
1743 case 'm':
1744 if (strcmp(p2, "l") == 0) return 168;
1745 break;
1746
1747 case 'u':
1748 if (strcmp(p2, "ml") == 0) return 252;
1749 }
1750 break;
1751
1752 case 'y':
1753 if (strcmp(pszTag, "yacute") == 0) return 253;
1754 if (strcmp(pszTag, "yen") == 0) return 165;
1755 if (strcmp(pszTag, "yuml") == 0) return 255;
1756 break;
1757
1758 case 'A':
1759 switch (c1)
1760 {
1761 case 'u':
1762 if (strcmp(p2, "ml") == 0) return 196;
1763 break;
1764
1765 case 'a':
1766 if (strcmp(p2, "cute") == 0) return 193;
1767 break;
1768
1769 case 'c':
1770 if (strcmp(p2, "irc") == 0) return 194;
1771 break;
1772
1773 case 'E':
1774 if (strcmp(p2, "lig") == 0) return 198;
1775 break;
1776
1777 case 'g':
1778 if (strcmp(p2, "rave") == 0) return 192;
1779 break;
1780
1781 case 'r':
1782 if (strcmp(p2, "ing") == 0) return 197;
1783 break;
1784
1785 case 't':
1786 if (strcmp(p2, "ilde") == 0) return 195;
1787 }
1788 break;
1789
1790 case 'C':
1791 if (strcmp(pszTag, "Ccedil") == 0) return 199;
1792 break;
1793
1794 case 'E':
1795 if (strcmp(pszTag, "Ecirc") == 0) return 202;
1796 if (strcmp(pszTag, "Eacute") == 0) return 201;
1797 if (strcmp(pszTag, "Egrave") == 0) return 200;
1798 if (strcmp(pszTag, "ETH") == 0) return 208;
1799 if (strcmp(pszTag, "Euml") == 0) return 203;
1800 break;
1801
1802 case 'I':
1803 if (strcmp(pszTag, "Icirc") == 0) return 206;
1804 if (strcmp(pszTag, "Iacute") == 0) return 205;
1805 if (strcmp(pszTag, "Igrave") == 0) return 204;
1806 if (strcmp(pszTag, "Iuml") == 0) return 207;
1807 break;
1808
1809 case 'N':
1810 if (strcmp(pszTag, "Ntilde") == 0) return 209;
1811 break;
1812
1813 case 'O':
1814 switch (c1)
1815 {
1816 case 'u':
1817 if (strcmp(p2, "ml") == 0) return 214;
1818 break;
1819
1820 case 'a':
1821 if (strcmp(p2, "cute") == 0) return 211;
1822 break;
1823
1824 case 'c':
1825 if (strcmp(p2, "irc") == 0) return 212;
1826 break;
1827
1828 case 'g':
1829 if (strcmp(p2, "rave") == 0) return 210;
1830 break;
1831
1832 case 't':
1833 if (strcmp(p2, "ilde") == 0) return 213;
1834 break;
1835
1836 case 's':
1837 if (strcmp(p2, "lash") == 0) return 216;
1838 }
1839 break;
1840
1841 case 'U':
1842 switch (c1)
1843 {
1844 case 'a':
1845 if (strcmp(p2, "cute") == 0) return 218;
1846 break;
1847
1848 case 'c':
1849 if (strcmp(p2, "irc") == 0) return 219;
1850 break;
1851
1852 case 'g':
1853 if (strcmp(p2, "rave") == 0) return 217;
1854 break;
1855
1856 case 'u':
1857 if (strcmp(p2, "ml") == 0) return 220;
1858 }
1859 break;
1860
1861 case 'T':
1862 if (strcmp(pszTag, "THORN") == 0) return 222;
1863 break;
1864
1865 case 'Y':
1866 if (strcmp(pszTag, "Yacute") == 0) return 221;
1867 break;
1868 }
1869
1870 return (crc);
1871}
1872
1873/*
1874 *@@ HandleEscape:
1875 * called by txvConvertFromHTML when a "&" character
1876 * is found in the source buffer. This calls
1877 * ConvertEscape in turn.
1878 *
1879 *@@added V0.9.3 (2000-05-18) [umoeller]
1880 */
1881
1882VOID HandleEscape(PCOPYTARGET pct)
1883{
1884 // ampersand:
1885 // replace special characters
1886 PSZ pStartOfTag = pct->pSource;
1887 // find end of tag
1888 PSZ p2 = pStartOfTag,
1889 pNextClose = 0,
1890 pNextSpace = 0;
1891 BOOL fCont = TRUE;
1892 while (fCont)
1893 {
1894 switch (*p2)
1895 {
1896 case 0:
1897 fCont = FALSE;
1898 break;
1899
1900 case ';':
1901 pNextClose = p2;
1902 fCont = FALSE;
1903 break;
1904
1905 case ' ':
1906 if (!pNextSpace)
1907 pNextSpace = p2;
1908 break;
1909 }
1910 p2++;
1911 }
1912
1913 if (!pNextClose)
1914 // no closing tag found:
1915 // just insert the '&' and go on, we have no tag here
1916 AppendChar(pct,
1917 *pct->pSource++);
1918 else
1919 {
1920 if ((pNextSpace) && (pNextSpace < pNextClose))
1921 // space before ';':
1922 // just insert the '&' and go on, we have no tag here
1923 AppendChar(pct,
1924 *pct->pSource++);
1925 else if ((!pNextClose) || (pNextClose <= pStartOfTag + 1))
1926 AppendChar(pct,
1927 *pct->pSource++);
1928 else
1929 {
1930 ULONG ulCode = 0;
1931
1932 // create substring with tag
1933 PSZ pszTag = pStartOfTag + 1;
1934 *pNextClose = 0;
1935
1936 if (*pszTag == '#')
1937 {
1938 // latin-1 or Unicode encoding (&#000;)
1939 ulCode = atoi(pszTag + 1);
1940
1941 // next input: char after ';'
1942 pct->pSource = pNextClose + 1;
1943 }
1944 else
1945 {
1946 // named entity:
1947 // find char code corresponding to escape
1948 // from G_EscapeProcessors map
1949 ulCode = ConvertEscape(pszTag);
1950 if (ulCode)
1951 // tag supported:
1952 pct->pSource = pNextClose + 1;
1953 else
1954 // tag not supported:
1955 ulCode = *pct->pSource++;
1956 }
1957
1958 // restore closing tag which we overwrote
1959 *pNextClose = ';';
1960
1961 if (ulCode)
1962 {
1963 AppendLinebreakCheck(pct);
1964
1965 AppendChar(pct,
1966 (CHAR)ulCode);
1967 pct->fSkipNextSpace = FALSE;
1968 }
1969 }
1970 }
1971}
1972
1973/* ******************************************************************
1974 *
1975 * Entry points
1976 *
1977 ********************************************************************/
1978
1979/*
1980 *@@ txvConvertFromHTML:
1981 * this modifies the given text string (which should
1982 * be the complete BODY block of any HTML file) so
1983 * that all HTML tags are removed and replaced with
1984 * escape sequences that the XTextView control understands.
1985 *
1986 * The buffer gets reallocated by this function, so it
1987 * must be free()'able.
1988 *
1989 * So, to have the XTextView control display an HTML file,
1990 * do this:
1991 *
1992 * 1) Load an HTML file into a buffer allocated by malloc().
1993 *
1994 * 2) Call txvConvertFromHTML.
1995 *
1996 * 3) Call WinSetWindowText on the XTextView control with
1997 * the modified buffer.
1998 *
1999 * This understands the following limited subset of HTML:
2000 *
2001 * Paragraph tags:
2002 *
2003 * -- P, BR
2004 * -- PRE, /PRE
2005 * -- UL, /UL, OL, /OL, LI
2006 * -- DL, /DL, DT, DD
2007 * -- H1, /H1 thru H6, /H6
2008 * -- Comments (<!-- .... -->)
2009 *
2010 * Character tags:
2011 *
2012 * -- B, /B, STRONG, /STRONG
2013 * -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
2014 * -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
2015 * -- U, /U
2016 * -- STRIKE, /STRIKE
2017 * -- CODE, /CODE
2018 *
2019 * The most obvious limitation is that neither tables
2020 * nor frames are supported. Also forget about CSS
2021 * and JavaScript, of course.
2022 *
2023 * All the ampersand (&amp; something) sequences defined
2024 * in HTML 3 are properly translated.
2025 *
2026 * Note: Those are translated to the ANSI (MS-Windows,
2027 * OS/2 codepage 1004) character set. This has the
2028 * following characteristics:
2029 *
2030 * -- Codes 0-127 are identical to ASCII and thus
2031 * ISO 8559-1 ("Latin 1") also.
2032 *
2033 * -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2034 *
2035 * -- Codes 128-159 are NOT defined in ISO 8559-1, but
2036 * Netscape treats those as ANSI as well, so we do too.
2037 *
2038 * As a result, consider the output to be in OS/2 codepage
2039 * 1004. Either set your codepage to that (WinSetCp)
2040 * or translate the output (WinCpTranslateString).
2041 *
2042 * &#xxx; tags (with xxx being a decimal) are considered
2043 * ANSI codes as well. Even though HTML 4.0 allows Unicode
2044 * characters > 255 to be inserted this way, we ignore
2045 * those. Unicode chars from 0 to 255 are identical to
2046 * ANSI, so for &#000; to &#255;, we are HTML-compliant.
2047 *
2048 * All other tags are completely thrown out.
2049 *
2050 *@@added V0.9.3 (2000-05-06) [umoeller]
2051 */
2052
2053BOOL txvConvertFromHTML(char **ppszText,
2054 PVOID pxhtml, // out: various config data (PXHTMLDATA)
2055 PULONG pulProgress, // out: progress (ptr can be NULL)
2056 PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2057{
2058 BOOL brc = TRUE;
2059
2060 ULONG cbSource = strlen(*ppszText);
2061
2062 COPYTARGET ct = {0};
2063
2064 lstInit(&ct.llLists,
2065 TRUE); // free items
2066
2067 ct.pSource = *ppszText;
2068 // skip leading spaces
2069 ct.fSkipNextSpace = TRUE;
2070 ct.pxhtml = (PXHTMLDATA)pxhtml;
2071
2072 // step 2:
2073 // actual tags formatting
2074
2075 while (TRUE)
2076 {
2077 CHAR c = *ct.pSource;
2078
2079 if (pfCancel)
2080 if (*pfCancel)
2081 {
2082 brc = FALSE;
2083 break;
2084 }
2085
2086 if (!c)
2087 // null terminator reached:
2088 break;
2089
2090 // calculate progress
2091 if (pulProgress)
2092 *pulProgress = ((ct.pSource - *ppszText) // characters done
2093 * 100
2094 / cbSource); // characters total
2095
2096 switch (c)
2097 {
2098 case '<':
2099 HandleTag(&ct);
2100 break;
2101
2102 case '&':
2103 HandleEscape(&ct);
2104 break;
2105
2106 case '\r':
2107 // skip
2108 if (!ct.fSkipNextSpace)
2109 {
2110 AppendChar(&ct,
2111 ' ');
2112 // ct.fNeedsLinebreak = FALSE;
2113 // but skip leading spaces which might follow
2114 if (!ct.fPRE)
2115 ct.fSkipNextSpace = TRUE;
2116 }
2117 ct.pSource++;
2118 break;
2119
2120 case '\t':
2121 {
2122 if (ct.fPRE)
2123 {
2124 ULONG ul;
2125 for (ul = 0;
2126 ul < 8;
2127 ul++)
2128 AppendChar(&ct,
2129 ' ');
2130 }
2131 else
2132 {
2133 // not in PRE block:
2134 if ( (!ct.fSkipNextSpace)
2135 // && (!ct.fNeedsLinebreak)
2136 )
2137 // last was not space: copy
2138 AppendChar(&ct,
2139 ' ');
2140
2141 ct.fSkipNextSpace = TRUE;
2142 }
2143
2144 // skip the tab
2145 ct.pSource++;
2146 break; }
2147
2148 case '\n':
2149 {
2150 // newline char:
2151 if (!ct.fPRE)
2152 {
2153 // if not in PRE mode, replace with space
2154 if (!ct.fSkipNextSpace)
2155 {
2156 AppendChar(&ct,
2157 ' ');
2158 // ct.fNeedsLinebreak = FALSE;
2159 // but skip leading spaces which might follow
2160 ct.fSkipNextSpace = TRUE;
2161 }
2162 }
2163 else
2164 // in PRE mode, preserve line breaks
2165 AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2166
2167 ct.pSource++;
2168 break; }
2169
2170 case '\xFF':
2171 {
2172 AppendChar(&ct,
2173 ' ');
2174 ct.pSource++;
2175 break; }
2176
2177 case ' ':
2178 if (!ct.fPRE)
2179 {
2180 // is space, and not in PRE block:
2181 if ( (!ct.fSkipNextSpace)
2182 // && (!ct.fNeedsLinebreak)
2183 )
2184 // last was not space: copy
2185 AppendChar(&ct,
2186 ' ');
2187
2188 ct.fSkipNextSpace = TRUE;
2189 }
2190 else
2191 // in PRE, always add all spaces
2192 AppendChar(&ct,
2193 ' ');
2194 ct.pSource++;
2195 break;
2196
2197 default:
2198 // if we're not inserting escapes or anything,
2199 // check if a linebreak is needed
2200 AppendLinebreakCheck(&ct);
2201
2202 AppendChar(&ct,
2203 *ct.pSource++);
2204 ct.fSkipNextSpace = FALSE;
2205 ct.fSkipNextLinebreak = FALSE;
2206
2207 } // end switch (*pSource);
2208 } // end while (*pSource)
2209 AppendChar(&ct,
2210 '\n');
2211 // append null-terminator
2212 AppendChar(&ct,
2213 0);
2214
2215 free(*ppszText);
2216 *ppszText = ct.pszNew;
2217
2218 lstClear(&ct.llLists);
2219
2220 return (brc);
2221}
2222
2223
Note: See TracBrowser for help on using the repository browser.