source: trunk/src/helpers/textv_html.c@ 9

Last change on this file since 9 was 8, checked in by umoeller, 25 years ago

Initial checkin of helpers code which used to be in WarpIN.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 62.0 KB
Line 
1
2/*
3 *@@sourcefile textv_html.c:
4 * this code converts HTML code to escape sequences for the
5 * XTextView control (textview.c).
6 *
7 * This code is in part ugly spaghetti, but this is intentional to
8 * make this HTML parser FAST. In general, you get about double or
9 * triple the speed compared to Netscape 4.6 on OS/2. This code
10 * doesn't understand all of HTML though, but you get most of HTML 2.
11 * There's no tables or frames at this point.
12 *
13 * The entry point into this mess is txvConvertFromHTML, which
14 * is easy to use.
15 *
16 * Note: Version numbering in this file relates to XWorkplace version
17 * numbering.
18 *
19 *@@header "helpers\textv_html.h"
20 *
21 *@@added V0.9.3 (2000-05-10) [umoeller]
22 */
23
24/*
25 * Copyright (C) 2000 Ulrich Möller.
26 * This program is part of the XWorkplace package.
27 * This program is free software; you can redistribute it and/or modify
28 * it under the terms of the GNU General Public License as published by
29 * the Free Software Foundation, in version 2 as it comes in the COPYING
30 * file of the XWorkplace main distribution.
31 * This program is distributed in the hope that it will be useful,
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 * GNU General Public License for more details.
35 */
36
37#define OS2EMX_PLAIN_CHAR
38 // this is needed for "os2emx.h"; if this is defined,
39 // emx will define PSZ as _signed_ char, otherwise
40 // as unsigned char
41
42#include <os2.h>
43
44#include <stdlib.h>
45#include <stdio.h>
46#include <string.h>
47
48#include "setup.h" // code generation and debugging options
49
50#include "helpers\linklist.h"
51#include "helpers\stringh.h"
52#include "helpers\textview.h"
53
54#include "helpers\textv_html.h"
55
56/*
57 *@@category: Helpers\PM helpers\Window classes\XTextView control\HTML conversion
58 */
59
60/* ******************************************************************
61 * *
62 * Declarations *
63 * *
64 ********************************************************************/
65
66/*
67 *@@ LISTDESC:
68 * structure stored in COPYTARGET to
69 * hold list information (UL, OL, ... tags).
70 *
71 *@@added V0.9.3 (2000-05-07) [umoeller]
72 */
73
74typedef struct _LISTDESC
75{
76 ULONG ulListType; // 0: unordered (UL)
77 // 1: ordered (OL)
78 // 2: definition lists (DL)
79 ULONG ulItem; // list enumeration; 1 on first item,
80 // 2 on next, ...
81} LISTDESC, *PLISTDESC;
82
83/*
84 *@@ COPYTARGET:
85 * monster structure which holds the current
86 * status of the HTML converter while conversion
87 * is taking place. This stores input/output pointers
88 * and various flags to avoid duplicate line breaks
89 * and such.
90 *
91 * One instance of this is created in txvConvertFromHTML
92 * on the stack and then passed to all the sub-function
93 * calls.
94 *
95 *@@added V0.9.3 (2000-05-06) [umoeller]
96 */
97
98typedef struct _COPYTARGET
99{
100 PSZ pSource; // ptr into source string;
101 // valid ONLY while we're in a tag handler
102 PSZ pNewSource; // can be set by tag handler to skip characters;
103 // this is set to NULL before calling a tag
104 // handler; if this is still NULL, default
105 // processing occurs
106
107 // new string:
108 PSZ pszNew; // memory buffer
109 ULONG cbNew; // size of buffer (reallocated)
110 PSZ pTarget; // current char ptr into pszNew
111
112 // saved character while tag handler is being called
113 CHAR cSaved;
114
115 PXHTMLDATA pxhtml; // ptr to XHTMLDATA passed to txvConvertFromHTML
116
117 // formatting flags while going through the text
118 BOOL fSkipNextSpace;
119 // if TRUE, subsequent spaces are skipped
120 BOOL fNeedsLinebreak;
121 // if TRUE, \n is inserted before any other character
122 BOOL fSkipNextLinebreak;
123 // if TRUE, subsequent linebreaks are skipped
124 BOOL fPRE;
125 // are we currently in a PRE tag?
126 BOOL fInLink;
127 // are we currently in a A HREF= tag?
128
129 // arguments (attributes) for tag handlers
130 PSZ pszAttributes; // != NULL while a tag handler is being called
131 // and attributes exist for the tag
132
133 // anchors count
134 USHORT usAnchorIndex; // start with 1
135
136 // list maintenance
137 ULONG ulListLevel; // if > 0, we're in a UL or OL block;
138 // raised for each block
139 ULONG ulUnorderedListLevel; // raised with each UL block to keep track
140 // of bullets
141 ULONG ulOrderedListLevel; // raised with each UL block to keep track
142 // of 1), 2), a), b)... numbering
143 ULONG ulCurrentListType; // current list type (from highest LISTDESC)
144 BOOL fInDT; // TRUE if we're currently in a DT tag
145 LINKLIST llLists; // stack of LISTDESC items
146} COPYTARGET, *PCOPYTARGET;
147
148typedef VOID FNPROCESSTAG(PCOPYTARGET pct);
149typedef FNPROCESSTAG *PFNPROCESSTAG;
150
151/* ******************************************************************
152 * *
153 * Global variables *
154 * *
155 ********************************************************************/
156
157/* ******************************************************************
158 * *
159 * Append-char helpers *
160 * *
161 ********************************************************************/
162
163#define COPYTARGETALLOC 100000
164
165/*
166 *@@ AppendChar:
167 * helper for txvConvertFromHTML to
168 * append a char to the target string
169 * in COPYTARGET.
170 * This performs a few additional checks
171 * and manages memory.
172 *
173 *@@added V0.9.3 (2000-05-06) [umoeller]
174 */
175
176VOID AppendChar(PCOPYTARGET pct, // in/out: formatting buffer
177 CHAR c)
178{
179 // calculate ofs where to store next char
180 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
181 if (cbOfsNext >= pct->cbNew) // have we reached the buffer size yet?
182 {
183 // more mem needed:
184 pct->cbNew += COPYTARGETALLOC;
185 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
186 // if first call, pszNew is NULL, and realloc
187 // behaves just like malloc
188 // adjust target, because ptr might have changed
189 pct->pTarget = pct->pszNew + cbOfsNext;
190 }
191
192 // append character
193 *pct->pTarget++ = c;
194}
195
196/*
197 *@@ AppendString:
198 * appends the characters in *ach,
199 * which must be null-terminated.
200 * Does NOT append a null character though.
201 *
202 *@@added V0.9.3 (2000-05-06) [umoeller]
203 */
204
205VOID AppendString(PCOPYTARGET pct, // in/out: formatting buffer
206 char *ach)
207{
208 ULONG cbAppend = strlen(ach);
209 ULONG ul;
210 PSZ pSource;
211
212 // calculate ofs where to store next char
213 ULONG cbOfsNext = pct->pTarget - pct->pszNew;
214 while (cbOfsNext + cbAppend >= pct->cbNew)
215 {
216 // more mem needed:
217 pct->cbNew += COPYTARGETALLOC;
218 pct->pszNew = (PSZ)realloc(pct->pszNew, pct->cbNew);
219 // if first call, pszNew is NULL, and realloc
220 // behaves just like malloc
221 // adjust target, because ptr might have changed
222 pct->pTarget = pct->pszNew + cbOfsNext;
223 }
224
225 // append characters
226 pSource = ach;
227 for (ul = 0;
228 ul < cbAppend;
229 ul++)
230 *pct->pTarget++ = *pSource++;
231}
232
233/*
234 *@@ AppendLinebreakCheck:
235 * checks if a linebreak is needed and
236 * inserts one if so.
237 *
238 *@@added V0.9.3 (2000-05-17) [umoeller]
239 */
240
241VOID AppendLinebreakCheck(PCOPYTARGET pct)
242{
243 if ((!pct->fPRE) && (pct->fNeedsLinebreak))
244 {
245 // yes: insert linebreak; this resets pct->fNeedsLinebreak
246 if (!pct->fSkipNextLinebreak)
247 {
248 AppendChar(pct, '\n');
249
250 if ((pct->ulListLevel) && (!pct->fInDT))
251 // if we're in a list, add a tab also,
252 // because we'll have a negative first-line margin
253 AppendString(pct, TXVESC_TAB);
254 }
255 pct->fNeedsLinebreak = FALSE;
256 }
257}
258
259/*
260 *@@ AppendEscapeWithDecimal:
261 * appends the specified escape code
262 * with a three-digit decimal parameter.
263 * Calls AppendString in turn.
264 *
265 *@@added V0.9.3 (2000-05-07) [umoeller]
266 */
267
268VOID AppendEscapeWith3Decimals(PCOPYTARGET pct, // in/out: formatting buffer
269 char *ach,
270 USHORT us)
271{
272 CHAR szDecimal[10];
273 if (us > 999)
274 us = 999;
275 sprintf(szDecimal, "%03d", us);
276 // append escape
277 AppendString(pct, ach);
278 AppendString(pct, szDecimal);
279}
280
281/*
282 *@@ AppendEscapeWith4Decimals:
283 *
284 *@@added V0.9.3 (2000-05-07) [umoeller]
285 */
286
287VOID AppendEscapeWith4Decimals(PCOPYTARGET pct, // in/out: formatting buffer
288 char *ach,
289 USHORT us)
290{
291 CHAR szDecimal[10];
292 if (us > 9999)
293 us = 9999;
294 sprintf(szDecimal, "%04d", us);
295 // append escape
296 AppendString(pct, ach);
297 AppendString(pct, szDecimal);
298}
299
300/* ******************************************************************
301 * *
302 * Tag converter functions *
303 * *
304 ********************************************************************/
305
306/*
307 *@@ StartList:
308 * starts a list (UL or OL).
309 * This uses a linked list in COPYTARGET
310 * to keep a pseudo-stack for nested lists.
311 *
312 *@@added V0.9.3 (2000-05-08) [umoeller]
313 */
314
315VOID StartList(PCOPYTARGET pct, // in/out: formatting buffer
316 ULONG ulListType) // list type:
317 // 0: unordered (UL)
318 // 1: ordered (OL)
319 // 2: definition lists (DL)
320{
321 PLISTDESC pListDesc;
322
323 // raise list level
324 pct->ulListLevel++;
325
326 if (ulListType == 0)
327 // unordered:
328 pct->ulUnorderedListLevel++;
329 else if (ulListType == 1)
330 // ordered:
331 pct->ulOrderedListLevel++;
332
333 // create LISTDESC and store it on stack
334 pListDesc = (PLISTDESC)malloc(sizeof(LISTDESC));
335 pListDesc->ulListType
336 = pct->ulCurrentListType
337 = ulListType;
338 pListDesc->ulItem = 1;
339
340 lstAppendItem(&pct->llLists,
341 pListDesc);
342
343 AppendEscapeWith4Decimals(pct,
344 TXVESC_LEFTMARGIN,
345 pct->ulListLevel * 5);
346 AppendEscapeWith3Decimals(pct,
347 TXVESC_FIRSTLINEMARGIN_LEFT,
348 (ulListType == 2)
349 ? 5 // for definition lists
350 : 3); // negative!
351 // add \n before any other character
352 pct->fNeedsLinebreak = TRUE;
353}
354
355/*
356 *@@ StopList:
357 * stops a list (UL or OL).
358 *
359 *@@added V0.9.3 (2000-05-07) [umoeller]
360 */
361
362VOID StopList(PCOPYTARGET pct)
363{
364 if (pct->ulListLevel)
365 {
366 PLISTNODE pNode;
367
368 // lower list level
369 pct->ulListLevel--;
370 AppendEscapeWith4Decimals(pct,
371 TXVESC_LEFTMARGIN,
372 pct->ulListLevel * 5);
373 AppendEscapeWith3Decimals(pct,
374 TXVESC_FIRSTLINEMARGIN_LEFT,
375 (pct->ulListLevel)
376 ? 3 // we still have a list level (nested)
377 : 0);
378 pct->fNeedsLinebreak = TRUE;
379
380 // remove the LISTDESC from the stack
381 pNode = lstNodeFromIndex(&pct->llLists,
382 pct->ulListLevel); // this has been lowered already
383 if (pNode)
384 {
385 PLISTDESC pListDesc = (PLISTDESC)pNode->pItemData;
386 if (pListDesc->ulListType == 0)
387 // was unordered:
388 pct->ulUnorderedListLevel--;
389 else if (pListDesc->ulListType == 1)
390 // was ordered:
391 pct->ulOrderedListLevel--;
392
393 lstRemoveNode(&pct->llLists, pNode);
394
395 // update COPYTARGET with previous list level
396 if (pct->ulListLevel)
397 {
398 // we're still in a list (nested lists):
399 PLISTDESC pListDesc2 = (PLISTDESC)lstItemFromIndex(&pct->llLists,
400 pct->ulListLevel - 1);
401 if (pListDesc2)
402 pct->ulCurrentListType = pListDesc2->ulListType;
403 }
404 }
405 }
406 // else: buggy HTML code, ignore
407}
408
409/*
410 *@@ TagTITLE:
411 *
412 *@@added V0.9.3 (2000-05-19) [umoeller]
413 */
414
415VOID TagTITLE(PCOPYTARGET pct)
416{
417 // pSource currently points to <TITLE tag
418 PSZ pSource = pct->pSource + strlen(pct->pSource);
419 // points to temporary null byte in main buffer now
420 *pSource = pct->cSaved;
421
422 pSource = strchr(pct->pSource, '>');
423 if (pSource)
424 {
425 PSZ pNextOpen = strchr(pSource, '<');
426 if (pNextOpen)
427 {
428 // extract title
429 pct->pxhtml->pszTitle = strhSubstr(pSource + 1, pNextOpen);
430
431 if (strnicmp(pNextOpen + 1, "/TITLE", 6) == 0)
432 {
433 // closing /TITLE tag found:
434 // search on after that
435 pct->pNewSource = strchr(pNextOpen, '>');
436 if (pct->pNewSource)
437 pct->pNewSource++;
438 }
439 }
440 }
441}
442
443/*
444 *@@ TagP:
445 *
446 */
447
448VOID TagP(PCOPYTARGET pct)
449{
450 // append newline:
451 // add \n before any other character
452 pct->fNeedsLinebreak = TRUE;
453
454 /* if (pct->ulListLevel)
455 {
456 // if we are currently in a list, we must also
457 // add a tab escape, because we have set
458 // the first line margin to the left of the
459 // left margin
460 AppendString(pct,
461 TXVESC_TAB);
462 } */
463}
464
465VOID TagBR(PCOPYTARGET pct)
466{
467 AppendChar(pct,
468 '\r');
469
470 if (pct->ulListLevel)
471 {
472 // if we are currently in a list, we must also
473 // add a tab escape, because we have set
474 // the first line margin to the left of the
475 // left margin
476 AppendString(pct,
477 TXVESC_TAB);
478 }
479 if (!pct->fPRE)
480 pct->fSkipNextSpace = TRUE;
481}
482
483VOID TagPRE(PCOPYTARGET pct)
484{
485 // start of PRE tag:
486 // add \n before any other character
487 // pct->fNeedsLinebreak = TRUE;
488 AppendChar(pct, '\n');
489 pct->fNeedsLinebreak = FALSE;
490 /* AppendString(pct,
491 TXVESC_PRE_BEGIN); */
492 AppendEscapeWith3Decimals(pct,
493 TXVESC_SET_FONT,
494 1); // monospaced font
495 AppendEscapeWith4Decimals(pct,
496 TXVESC_SPACEBEFORE,
497 0); // no spacing before
498 AppendEscapeWith4Decimals(pct,
499 TXVESC_SPACEAFTER,
500 0); // no spacing after
501 // disable word-wrapping
502 AppendString(pct,
503 TXVESC_WORDWRAP "0");
504 pct->fPRE = TRUE;
505 pct->fSkipNextSpace = FALSE;
506}
507
508VOID TagXPRE(PCOPYTARGET pct)
509{
510 pct->fPRE = FALSE;
511 AppendEscapeWith3Decimals(pct,
512 TXVESC_SET_FONT,
513 0); // standard font
514 AppendString(pct, TXVESC_SPACEBEFORE);
515 AppendString(pct, "####"); // reset to default
516 AppendString(pct, TXVESC_SPACEAFTER);
517 AppendString(pct, "####"); // reset to default
518 // re-enable word-wrapping
519 AppendString(pct,
520 TXVESC_WORDWRAP "1"
521 "\n"); // force line break
522 pct->fNeedsLinebreak = FALSE;
523 // refuse to add \n even if we have another "p" coming up
524 pct->fSkipNextLinebreak = TRUE;
525 pct->fSkipNextSpace = TRUE;
526}
527
528VOID TagH1(PCOPYTARGET pct)
529{
530 pct->fNeedsLinebreak = TRUE;
531 AppendEscapeWith3Decimals(pct,
532 TXVESC_POINTSIZE_REL,
533 200); // double size
534 AppendString(pct,
535 TXVESC_BOLD_BEGIN);
536}
537
538VOID TagXH1(PCOPYTARGET pct)
539{
540 AppendString(pct,
541 TXVESC_BOLD_END);
542 AppendEscapeWith3Decimals(pct,
543 TXVESC_POINTSIZE_REL,
544 100); // regular size
545 // add \n before any other character
546 pct->fNeedsLinebreak = TRUE;
547}
548
549VOID TagH2(PCOPYTARGET pct)
550{
551 pct->fNeedsLinebreak = TRUE;
552 AppendEscapeWith3Decimals(pct,
553 TXVESC_POINTSIZE_REL,
554 175); // size in percent of regular point size
555 AppendString(pct,
556 TXVESC_BOLD_BEGIN);
557}
558
559VOID TagXH2(PCOPYTARGET pct)
560{
561 AppendString(pct,
562 TXVESC_BOLD_END);
563 AppendEscapeWith3Decimals(pct,
564 TXVESC_POINTSIZE_REL,
565 100); // regular size
566 // add \n before any other character
567 pct->fNeedsLinebreak = TRUE;
568}
569
570VOID TagH3(PCOPYTARGET pct)
571{
572 pct->fNeedsLinebreak = TRUE;
573 AppendEscapeWith3Decimals(pct,
574 TXVESC_POINTSIZE_REL,
575 150); // size in percent of regular point size
576 AppendString(pct,
577 TXVESC_BOLD_BEGIN);
578}
579
580VOID TagXH3(PCOPYTARGET pct)
581{
582 AppendString(pct,
583 TXVESC_BOLD_END);
584 AppendEscapeWith3Decimals(pct,
585 TXVESC_POINTSIZE_REL,
586 100); // size in percent of regular point size
587 // add \n before any other character
588 pct->fNeedsLinebreak = TRUE;
589}
590
591VOID TagH4(PCOPYTARGET pct)
592{
593 pct->fNeedsLinebreak = TRUE;
594 AppendEscapeWith3Decimals(pct,
595 TXVESC_POINTSIZE_REL,
596 125); // size in percent of regular point size
597 AppendString(pct,
598 TXVESC_BOLD_BEGIN);
599}
600
601VOID TagXH4(PCOPYTARGET pct)
602{
603 AppendString(pct,
604 TXVESC_BOLD_END);
605 AppendEscapeWith3Decimals(pct,
606 TXVESC_POINTSIZE_REL,
607 100); // regular size
608 // add \n before any other character
609 pct->fNeedsLinebreak = TRUE;
610}
611
612VOID TagH5(PCOPYTARGET pct)
613{
614 pct->fNeedsLinebreak = TRUE;
615 AppendEscapeWith3Decimals(pct,
616 TXVESC_POINTSIZE_REL,
617 100); // size in percent of regular point size
618 AppendString(pct,
619 TXVESC_BOLD_BEGIN);
620}
621
622VOID TagXH5(PCOPYTARGET pct)
623{
624 AppendString(pct,
625 TXVESC_BOLD_END);
626 AppendEscapeWith3Decimals(pct,
627 TXVESC_POINTSIZE_REL,
628 100); // regular size
629 // add \n before any other character
630 pct->fNeedsLinebreak = TRUE;
631}
632
633VOID TagH6(PCOPYTARGET pct)
634{
635 pct->fNeedsLinebreak = TRUE;
636 AppendEscapeWith3Decimals(pct,
637 TXVESC_POINTSIZE_REL,
638 80 ); // size in percent of regular point size
639 AppendString(pct,
640 TXVESC_BOLD_BEGIN);
641}
642
643VOID TagXH6(PCOPYTARGET pct)
644{
645 AppendString(pct,
646 TXVESC_BOLD_END);
647 AppendEscapeWith3Decimals(pct,
648 TXVESC_POINTSIZE_REL,
649 100); // regular size
650 // add \n before any other character
651 pct->fNeedsLinebreak = TRUE;
652}
653
654VOID TagUL(PCOPYTARGET pct)
655{
656 StartList(pct,
657 0); // unordered
658}
659
660VOID TagXUL(PCOPYTARGET pct)
661{
662 StopList(pct);
663}
664
665VOID TagOL(PCOPYTARGET pct)
666{
667 StartList(pct,
668 1); // ordered
669}
670
671VOID TagXOL(PCOPYTARGET pct)
672{
673 StopList(pct);
674}
675
676VOID TagLI(PCOPYTARGET pct)
677{
678 PLISTDESC pListDesc;
679 CHAR szMarker[20] = TXVESC_MARKER "\x01";
680
681 if (pct->ulListLevel)
682 {
683 // we're in a list:
684 pListDesc = (PLISTDESC)lstItemFromIndex(&pct->llLists,
685 pct->ulListLevel - 1);
686 if (pListDesc)
687 if (pListDesc->ulListType == 1)
688 // is ordered list:
689 sprintf(szMarker, "%d.", (pListDesc->ulItem)++);
690 else if (pListDesc->ulListType == 0)
691 // is unordered list:
692 // set bullet type according to unordered nesting
693 szMarker[2] = pct->ulUnorderedListLevel;
694 }
695
696 // add \n before any other character
697 // pct->fNeedsLinebreak = TRUE;
698 // if (pct->fNeedsLinebreak)
699 {
700 AppendChar(pct, '\n');
701 pct->fNeedsLinebreak = FALSE;
702 }
703
704 AppendString(pct, szMarker);
705 AppendString(pct, TXVESC_TAB);
706}
707
708VOID TagDL(PCOPYTARGET pct)
709{
710 StartList(pct,
711 2); // definition list
712}
713
714VOID TagXDL(PCOPYTARGET pct)
715{
716 StopList(pct);
717 pct->fInDT = FALSE;
718}
719
720VOID TagDT(PCOPYTARGET pct)
721{
722 pct->fNeedsLinebreak = TRUE;
723 pct->fInDT = TRUE;
724}
725
726VOID TagDD(PCOPYTARGET pct)
727{
728 pct->fNeedsLinebreak = TRUE;
729 AppendString(pct, TXVESC_TAB);
730 if (!pct->fPRE)
731 pct->fSkipNextSpace = TRUE;
732 pct->fInDT = FALSE;
733}
734
735VOID TagTR(PCOPYTARGET pct)
736{
737 pct->fNeedsLinebreak = TRUE;
738}
739
740VOID TagB(PCOPYTARGET pct)
741{
742 AppendString(pct,
743 TXVESC_BOLD_BEGIN);
744}
745
746VOID TagXB(PCOPYTARGET pct)
747{
748 AppendString(pct,
749 TXVESC_BOLD_END);
750}
751
752VOID TagI(PCOPYTARGET pct)
753{
754 AppendString(pct,
755 TXVESC_ITALICS_BEGIN);
756}
757
758VOID TagXI(PCOPYTARGET pct)
759{
760 AppendString(pct,
761 TXVESC_ITALICS_END);
762}
763
764VOID TagU(PCOPYTARGET pct)
765{
766 AppendString(pct,
767 TXVESC_UNDERLINE_BEGIN);
768}
769
770VOID TagXU(PCOPYTARGET pct)
771{
772 AppendString(pct,
773 TXVESC_UNDERLINE_END);
774}
775
776VOID TagSTRIKE(PCOPYTARGET pct)
777{
778 AppendString(pct,
779 TXVESC_STRIKE_BEGIN);
780}
781
782VOID TagXSTRIKE(PCOPYTARGET pct)
783{
784 AppendString(pct,
785 TXVESC_STRIKE_END);
786}
787
788VOID TagCODE(PCOPYTARGET pct)
789{
790 AppendEscapeWith3Decimals(pct,
791 TXVESC_SET_FONT,
792 1); // monospaced font
793}
794
795VOID TagXCODE(PCOPYTARGET pct)
796{
797 AppendEscapeWith3Decimals(pct,
798 TXVESC_SET_FONT,
799 0); // regular font
800}
801
802VOID TagA(PCOPYTARGET pct)
803{
804 CHAR szAnchor[10];
805
806 pct->fInLink = FALSE;
807
808 if ((pct->pszAttributes) && (pct->pxhtml)) // points into main source buffer
809 {
810 // we have attributes:
811 PSZ pszClosingTag = strchr(pct->pszAttributes, '>');
812 if (pszClosingTag)
813 {
814 ULONG ulOfs = 0;
815
816 /*
817 * HREF attribute:
818 *
819 */
820
821 PSZ pHREF = strhGetTextAttr(pct->pszAttributes, "HREF", &ulOfs),
822 pNAME = 0;
823
824 // replace '>' with null char to mark end of search
825 *pszClosingTag = 0;
826
827 if (pHREF)
828 {
829 // OK, we got a link target:
830 // create a link item and append it to the output list
831 PXHTMLLINK pNewLink = (PXHTMLLINK)malloc(sizeof(XHTMLLINK));
832 memset(pNewLink, 0, sizeof(XHTMLLINK));
833
834 pct->fInLink = TRUE;
835
836 // this starts with anchor 1
837 pNewLink->usLinkIndex = ++pct->usAnchorIndex;
838 pNewLink->pszTargetFile = pHREF;
839 // do not free
840 lstAppendItem(&pct->pxhtml->llLinks, pNewLink);
841 }
842
843 /*
844 * NAME attribute:
845 *
846 */
847
848 pNAME = strhGetTextAttr(pct->pszAttributes, "NAME", &ulOfs);
849 if (pNAME)
850 {
851 AppendString(pct,
852 TXVESC_ANCHORNAME);
853 AppendString(pct,
854 pNAME);
855 // must be terminated with 0xFF
856 AppendChar(pct, 0xFF);
857 free(pNAME);
858 }
859 // restore '>'
860 *pszClosingTag = '>';
861 }
862 }
863
864 if (pct->fInLink)
865 {
866 sprintf(szAnchor, "%04lX", pct->usAnchorIndex);
867 AppendString(pct,
868 TXVESC_LINK);
869 AppendString(pct,
870 szAnchor);
871 }
872}
873
874VOID TagXA(PCOPYTARGET pct)
875{
876 if (pct->fInLink)
877 {
878 AppendString(pct,
879 TXVESC_LINK "####");
880 pct->fInLink = FALSE;
881 }
882}
883
884/* ******************************************************************
885 * *
886 * Tag helpers *
887 * *
888 ********************************************************************/
889
890/*
891 *@@ FindTagProcessor:
892 * returns the Tag* function which handles the
893 * given tag or NULL if there's none.
894 *
895 *@@added V0.9.4 (2000-06-10) [umoeller]
896 */
897
898PFNPROCESSTAG FindTagProcessor(PSZ pszTag)
899{
900 PFNPROCESSTAG pProcessor = NULL;
901
902 CHAR c0,
903 c1;
904
905 BOOL fEndOfTag = FALSE;
906
907 PSZ pCheck = pszTag,
908 p2;
909 if (*pCheck == '/')
910 {
911 // end of tag:
912 fEndOfTag = TRUE;
913 pCheck++;
914 }
915
916 c0 = *pCheck;
917 c1 = *(pCheck + 1);
918
919 p2 = pCheck + 2;
920
921 switch (c0)
922 {
923 case 'A':
924 case 'a':
925 switch (c1)
926 {
927 case 0: // A
928 if (!fEndOfTag)
929 return TagA;
930 else
931 return TagXA;
932 case 'D': // ADDRESS
933 case 'd': // ADDRESS
934 if (stricmp(p2, "DRESS") == 0)
935 if (!fEndOfTag)
936 return TagI;
937 else
938 return TagXI;
939 }
940 break;
941
942 case 'B':
943 case 'b':
944 switch (c1)
945 {
946 case 0:
947 if (!fEndOfTag)
948 return TagB;
949 else
950 return TagXB;
951
952 case 'R': // BR
953 case 'r': // BR
954 if (*p2 == 0)
955 if (!fEndOfTag)
956 return TagBR;
957 }
958 break;
959
960 case 'C':
961 case 'c':
962 switch (c1)
963 {
964 case 'I': // CITE
965 case 'i': // CITE
966 if (stricmp(p2, "TE") == 0)
967 {
968 if (!fEndOfTag)
969 return TagI;
970 else
971 return TagXI;
972 }
973 break;
974
975 case 'O':
976 case 'o':
977 if (stricmp(p2, "DE") == 0)
978 if (!fEndOfTag)
979 return TagCODE;
980 else
981 return TagXCODE;
982 break;
983 }
984 break;
985
986 case 'D':
987 case 'd':
988 switch (c1)
989 {
990 case 'D': // DD
991 case 'd': // DD
992 if ((*p2 == 0) && (!fEndOfTag))
993 return (TagDD);
994 break;
995
996 case 'I': // DIR
997 case 'i': // DIR
998 if (*p2 == 'R')
999 if (*(pCheck + 3) == 0)
1000 if (!fEndOfTag)
1001 return TagUL;
1002 else
1003 return TagXUL;
1004 break;
1005
1006 case 'L': // DL
1007 case 'l': // DL
1008 if (*p2 == 0)
1009 if (!fEndOfTag)
1010 return TagDL;
1011 else
1012 return TagXDL;
1013 break;
1014
1015 case 'T': // DT
1016 case 't': // DT
1017 if ((*p2 == 0) && (!fEndOfTag))
1018 return TagDT;
1019 break;
1020 }
1021 break;
1022
1023 case 'E':
1024 case 'e':
1025 if ( (c1 == 'M') || (c1 == 'm') ) // EM
1026 if (*p2 == 0)
1027 if (!fEndOfTag)
1028 return TagI;
1029 else
1030 return TagXI;
1031 break;
1032
1033 case 'H':
1034 case 'h':
1035 if (c1)
1036 if (*p2 == 0)
1037 switch (c1)
1038 {
1039 case '1':
1040 if (!fEndOfTag)
1041 return TagH1;
1042 else
1043 return TagXH1;
1044 case '2':
1045 if (!fEndOfTag)
1046 return TagH2;
1047 else
1048 return TagXH2;
1049 case '3':
1050 if (!fEndOfTag)
1051 return TagH3;
1052 else
1053 return TagXH3;
1054 case '4':
1055 if (!fEndOfTag)
1056 return TagH4;
1057 else
1058 return TagXH4;
1059 case '5':
1060 if (!fEndOfTag)
1061 return TagH5;
1062 else
1063 return TagXH5;
1064 case '6':
1065 if (!fEndOfTag)
1066 return TagH6;
1067 else
1068 return TagXH6;
1069 }
1070 break;
1071
1072 case 'I':
1073 case 'i':
1074 if (c1 == 0)
1075 if (!fEndOfTag)
1076 return TagI;
1077 else
1078 return TagXI;
1079 break;
1080
1081 case 'L':
1082 case 'l':
1083 if ((c1 == 'I') || (c1 == 'i'))
1084 if (*p2 == 0)
1085 return TagLI;
1086 break;
1087
1088 case 'M':
1089 case 'm':
1090 if (stricmp(p2, "NU") == 0)
1091 if (!fEndOfTag)
1092 return TagUL;
1093 else
1094 return TagXUL;
1095 break;
1096
1097 case 'O':
1098 case 'o':
1099 if ((c1 == 'L') || (c1 == 'l'))
1100 if (*p2 == 0)
1101 if (!fEndOfTag)
1102 return TagOL;
1103 else
1104 return TagXOL;
1105 break;
1106
1107 case 'P':
1108 case 'p':
1109 switch (c1)
1110 {
1111 case 0:
1112 if (!fEndOfTag)
1113 return TagP;
1114 break;
1115
1116 case 'R': // PRE
1117 case 'r': // PRE
1118 if ((*p2 == 'E') || (*p2 == 'e'))
1119 if (*(pCheck + 3) == 0)
1120 if (!fEndOfTag)
1121 return TagPRE;
1122 else
1123 return TagXPRE;
1124 break;
1125 }
1126 break;
1127
1128 case 'S':
1129 case 's':
1130 switch (c1)
1131 {
1132 case 'T': // STRONG
1133 case 't': // STRONG
1134 if (stricmp(p2, "RONG") == 0)
1135 if (!fEndOfTag)
1136 return TagB;
1137 else
1138 return TagXB;
1139 else if (stricmp(p2, "RIKE") == 0)
1140 if (!fEndOfTag)
1141 return TagSTRIKE;
1142 else
1143 return TagXSTRIKE;
1144 break;
1145
1146 case 'A':
1147 case 'a':
1148 if (stricmp(p2, "MP") == 0)
1149 if (!fEndOfTag)
1150 return TagCODE;
1151 else
1152 return TagXCODE;
1153 break;
1154 }
1155 break;
1156
1157 case 'T':
1158 case 't':
1159 switch (c1)
1160 {
1161 case 'R':
1162 case 'r':
1163 if (*p2 == 0)
1164 return TagTR;
1165 break;
1166
1167 case 'I':
1168 case 'i':
1169 if (stricmp(p2, "TLE") == 0)
1170 return TagTITLE;
1171 break;
1172
1173 case 'T': // TT
1174 case 't':
1175 if (*p2 == 0)
1176 if (!fEndOfTag)
1177 return TagCODE;
1178 else
1179 return TagXCODE;
1180 break;
1181 }
1182 break;
1183
1184 case 'U':
1185 case 'u':
1186 switch (c1)
1187 {
1188 case 0:
1189 if (!fEndOfTag)
1190 return TagU;
1191 else
1192 return TagXU;
1193
1194 case 'L':
1195 case 'l':
1196 if (*p2 == 0)
1197 if (!fEndOfTag)
1198 return TagUL;
1199 else
1200 return TagXUL;
1201 break;
1202 }
1203 break;
1204
1205 case 'V':
1206 case 'v':
1207 if (stricmp(p2, "R") == 0)
1208 {
1209 if (!fEndOfTag)
1210 return TagI;
1211 else
1212 return TagXI;
1213 }
1214 break;
1215
1216 case 'X':
1217 case 'x':
1218 if (stricmp(p2, "MP") == 0) // XMP
1219 {
1220 if (!fEndOfTag)
1221 return TagPRE;
1222 else
1223 return TagXPRE;
1224 }
1225 break;
1226 }
1227
1228 return (pProcessor);
1229}
1230
1231/*
1232 *@@ HandleTag:
1233 * called by txvConvertFromHTML when a "<" character
1234 * is found in the source buffer. This calls
1235 * FindTagProcessor in turn to find the Tag*
1236 * function which handles the tag.
1237 *
1238 *@@added V0.9.3 (2000-05-18) [umoeller]
1239 */
1240
1241VOID HandleTag(PCOPYTARGET pct)
1242{
1243 PSZ pStartOfTag = pct->pSource;
1244 // '<' == begin of tag:
1245
1246 // is it a comment? <!-- ... -->
1247 if (strncmp(pStartOfTag + 1, "!--", 3) == 0)
1248 {
1249 // start of comment:
1250 // find end of comment
1251 PSZ pEnd = strstr(pStartOfTag, "-->");
1252 if (pEnd)
1253 // found:
1254 // search on after end of comment
1255 pct->pSource = pEnd + 3;
1256 else
1257 {
1258 // end of comment not found:
1259 // stop formatting...
1260 pct->pSource++;
1261 return;
1262 }
1263 }
1264 else
1265 {
1266 // no comment:
1267 // find end of tag
1268 PSZ p2 = pStartOfTag + 1,
1269 pNextClose = 0, // receives first '>' after '<'
1270 pNextSpace = 0; // receives first ' ' after '<'
1271 BOOL fCont = TRUE;
1272 while (fCont)
1273 {
1274 switch (*p2)
1275 {
1276 case ' ':
1277 case '\r':
1278 case '\n':
1279 // store first space after '<'
1280 if (!pNextSpace)
1281 pNextSpace = p2;
1282 // overwrite line breaks with spaces;
1283 // otherwise we cannot handle tags which go across
1284 // several lines, which is valid HTML
1285 *p2 = ' ';
1286 break;
1287
1288 case '>': // end of tag found:
1289 pNextClose = p2;
1290 fCont = FALSE;
1291 break;
1292
1293 case '<':
1294 // another opening tag:
1295 // that's an HTML error
1296 AppendChar(pct,
1297 *pct->pSource++);
1298 fCont = FALSE;
1299 break;
1300
1301 case 0:
1302 fCont = FALSE;
1303 break;
1304 }
1305 p2++;
1306 }
1307
1308 if (pNextClose)
1309 {
1310 // end of tag found:
1311 ULONG cbTag;
1312 PSZ pStartOfAttrs = 0;
1313
1314 if ((pNextSpace) && (pNextSpace < pNextClose))
1315 {
1316 // we have attributes:
1317 cbTag = pNextSpace - (pStartOfTag + 1);
1318 pStartOfAttrs = pNextSpace;
1319 }
1320 else
1321 cbTag = pNextClose - (pStartOfTag + 1);
1322
1323 if (!cbTag)
1324 {
1325 // happens if we have a "<>" in the text:
1326 // just insert the '<>' and go on, we have no tag here
1327 AppendChar(pct,
1328 *pct->pSource++);
1329 AppendChar(pct,
1330 *pct->pSource++);
1331 }
1332 else
1333 {
1334 PFNPROCESSTAG pTagProcessor;
1335
1336 pct->cSaved = *(pStartOfTag + cbTag + 1);
1337 // add a null terminator
1338 *(pStartOfTag + cbTag + 1) = 0;
1339
1340 // find corresponding tag converter function
1341 // from G_TagProcessors map
1342 pTagProcessor = FindTagProcessor(pStartOfTag + 1); // pszTag);
1343
1344 // restore char under null terminator
1345 *(pStartOfTag + cbTag + 1) = pct->cSaved;
1346
1347 // reset new source ptr; the tag handler
1348 // can modify this
1349 pct->pNewSource = NULL;
1350
1351 if (pTagProcessor)
1352 {
1353 // tag understood:
1354
1355 // terminate string after closing tag
1356 pct->cSaved = *(pNextClose + 1); // can be null byte!
1357 *(pNextClose + 1) = 0;
1358
1359 // did we have attributes?
1360 if (pNextSpace)
1361 pct->pszAttributes = pNextSpace;
1362
1363 // finally, call the tag handler
1364 (pTagProcessor) // function
1365 (pct); // argument
1366
1367 *(pNextClose + 1) = pct->cSaved;
1368 }
1369
1370 if (pct->pNewSource == NULL)
1371 // tag handler needs no special processing:
1372 // skip '>' too
1373 pct->pSource = pNextClose + 1;
1374 else
1375 // tag handler has skipped something:
1376 pct->pSource = pct->pNewSource;
1377 }
1378 }
1379 }
1380}
1381
1382/*
1383 *@@ ConvertEscape:
1384 * called by HandleEscape to find the ANSI (CP 1004)
1385 * character for the given escape sequence (pszTag).
1386 *
1387 * pszTag contains the stuff between "&" and ";".
1388 *
1389 * This is really ugly spaghetti, but it's the fastest
1390 * way to do it.
1391 *
1392 *@@added V0.9.4 (2000-06-10) [umoeller]
1393 */
1394
1395CHAR ConvertEscape(PSZ pszTag)
1396{
1397 CHAR c0, c1;
1398 CHAR crc = 0;
1399
1400 PSZ p2 = pszTag + 2;
1401
1402 c0 = *pszTag;
1403 c1 = *(pszTag + 1);
1404
1405 switch (c0)
1406 {
1407 case 'a':
1408 switch (c1)
1409 {
1410 case 'a':
1411 if (strcmp(p2, "cute") == 0)
1412 return 225;
1413 break;
1414
1415 case 'c':
1416 if (strcmp(p2, "irc") == 0)
1417 return 226;
1418 else if (strcmp(p2, "ute") == 0)
1419 return 180;
1420 break;
1421
1422 case 'e':
1423 if (strcmp(p2, "lig") == 0)
1424 return 230;
1425 break;
1426
1427 case 'g':
1428 if (strcmp(p2, "rave") == 0)
1429 return 224;
1430 break;
1431
1432 case 'm':
1433 if (strcmp(p2, "p") == 0)
1434 return '&';
1435 break;
1436
1437 case 'r':
1438 if (strcmp(p2, "ing") == 0)
1439 return 229;
1440 break;
1441
1442 case 't':
1443 if (strcmp(p2, "ilde") == 0)
1444 return 227;
1445 break;
1446
1447 case 'u':
1448 if (strcmp(p2, "ml") == 0)
1449 return 228;
1450 break;
1451 }
1452 break;
1453
1454 case 'b':
1455 if (strcmp(pszTag + 1, "rvbar") == 0)
1456 return 166;
1457 break;
1458
1459 case 'c':
1460 switch (c1)
1461 {
1462 case 'c':
1463 if (strcmp(p2, "edil") == 0)
1464 return 231;
1465 break;
1466
1467 case 'e':
1468 if (strcmp(p2, "dil") == 0)
1469 return 184;
1470 else if (strcmp(p2, "nt") == 0)
1471 return 162;
1472 break;
1473
1474 case 'o':
1475 if (strcmp(p2, "py") == 0)
1476 return 169;
1477 break;
1478
1479 case 'u':
1480 if (strcmp(p2, "rren") == 0)
1481 return 164;
1482 }
1483 break;
1484
1485 case 'd':
1486 switch (c1)
1487 {
1488 case 'e':
1489 if (strcmp(p2, "g") == 0) return 176;
1490 break;
1491
1492 case 'i':
1493 if (strcmp(p2, "vide") == 0) return 247;
1494 break;
1495 }
1496 break;
1497
1498 case 'e':
1499 switch (c1)
1500 {
1501 case 'a':
1502 if (strcmp(p2, "cute") == 0) return 233;
1503 break;
1504
1505 case 'c':
1506 if (strcmp(p2, "irc") == 0) return 234;
1507 break;
1508
1509 case 'g':
1510 if (strcmp(p2, "rave") == 0) return 232;
1511 break;
1512
1513 case 't':
1514 if (strcmp(p2, "h") == 0) return 240;
1515 break;
1516
1517 case 'u':
1518 if (strcmp(p2, "ml") == 0) return 235;
1519 break;
1520 }
1521 break;
1522
1523 case 'f':
1524 switch (c1)
1525 {
1526 case 'r':
1527 if (strcmp(p2, "ac14") == 0) return 188;
1528 if (strcmp(p2, "ac12") == 0) return 189;
1529 if (strcmp(p2, "ac34") == 0) return 190;
1530 break;
1531 }
1532 break;
1533
1534 case 'g':
1535 switch (c1)
1536 {
1537 case 't':
1538 if (*p2 == 0) return '>';
1539 }
1540 break;
1541
1542 case 'i':
1543 switch (c1)
1544 {
1545 case 'a':
1546 if (strcmp(p2, "cute") == 0) return 237;
1547 break;
1548
1549 case 'c':
1550 if (strcmp(p2, "irc") == 0) return 238;
1551 break;
1552
1553 case 'g':
1554 if (strcmp(p2, "rave") == 0) return 236;
1555 break;
1556
1557 case 'e':
1558 if (strcmp(p2, "xcl") == 0) return 161;
1559 break;
1560
1561 case 'q':
1562 if (strcmp(p2, "uest") == 0) return 191;
1563 break;
1564
1565 case 'u':
1566 if (strcmp(p2, "ml") == 0) return 239;
1567 }
1568 break;
1569
1570 case 'l':
1571 switch (c1)
1572 {
1573 case 't':
1574 if (*p2 == 0)
1575 return '<';
1576 break;
1577
1578 case 'a':
1579 if (strcmp(p2, "quo") == 0) return 171;
1580 }
1581 break;
1582
1583 case 'm':
1584 switch (c1)
1585 {
1586 case 'a':
1587 if (strcmp(p2, "cr") == 0) return 175;
1588 break;
1589
1590 case 'i':
1591 if (strcmp(p2, "cro") == 0) return 181;
1592 if (strcmp(p2, "ddot") == 0) return 183;
1593 break;
1594 }
1595 break;
1596
1597 case 'n':
1598 switch (c1)
1599 {
1600 case 'b':
1601 if (strcmp(p2, "sp") == 0) return 160;
1602 break;
1603
1604 case 'o':
1605 if (strcmp(p2, "t") == 0) return 172;
1606 break;
1607
1608 case 't':
1609 if (strcmp(p2, "ilde") == 0) return 241;
1610 }
1611 break;
1612
1613 case 'o':
1614 switch (c1)
1615 {
1616 case 'a':
1617 if (strcmp(p2, "cute") == 0) return 243;
1618 break;
1619
1620 case 'c':
1621 if (strcmp(p2, "irc") == 0) return 244;
1622 break;
1623
1624 case 'g':
1625 if (strcmp(p2, "rave") == 0) return 242;
1626 break;
1627
1628 case 'r':
1629 if (strcmp(p2, "df") == 0) return 170;
1630 if (strcmp(p2, "dm") == 0) return 186;
1631 break;
1632
1633 case 's':
1634 if (strcmp(p2, "lash") == 0) return 248;
1635 break;
1636
1637 case 't':
1638 if (strcmp(p2, "ilde") == 0) return 245;
1639 break;
1640
1641 case 'u':
1642 if (strcmp(p2, "ml") == 0) return 246;
1643 }
1644 break;
1645
1646 case 'p':
1647 switch (c1)
1648 {
1649 case 'a':
1650 if (strcmp(p2, "ra") == 0) return 182;
1651 break;
1652
1653 case 'l':
1654 if (strcmp(p2, "usmn") == 0) return 177;
1655 break;
1656
1657 case 'o':
1658 if (strcmp(p2, "und") == 0) return 163;
1659 }
1660 break;
1661
1662 case 'q':
1663 if (strcmp(pszTag, "quot") == 0) return '"';
1664 break;
1665
1666 case 'r':
1667 if (strcmp(pszTag, "raquo") == 0) return 187;
1668 if (strcmp(pszTag, "reg") == 0) return 174;
1669 break;
1670
1671 case 's':
1672 switch (c1)
1673 {
1674 case 'z':
1675 if (strcmp(p2, "lig") == 0) return 223;
1676 break;
1677
1678 case 'e':
1679 if (strcmp(p2, "ct") == 0) return 167;
1680 break;
1681
1682 case 'h':
1683 if (strcmp(p2, "y") == 0) return 173;
1684 break;
1685
1686 case 'u':
1687 if (strcmp(p2, "p1") == 0) return 185;
1688 if (strcmp(p2, "p2") == 0) return 178;
1689 if (strcmp(p2, "p3") == 0) return 179;
1690 }
1691 break;
1692
1693 case 't':
1694 if (strcmp(pszTag, "thorn") == 0) return 254;
1695 if (strcmp(pszTag, "times") == 0) return 215;
1696 break;
1697
1698 case 'u':
1699 switch (c1)
1700 {
1701 case 'a':
1702 if (strcmp(p2, "cute") == 0) return 250;
1703 break;
1704
1705 case 'c':
1706 if (strcmp(p2, "irc") == 0) return 251;
1707 break;
1708
1709 case 'g':
1710 if (strcmp(p2, "rave") == 0) return 249;
1711 break;
1712
1713 case 'm':
1714 if (strcmp(p2, "l") == 0) return 168;
1715 break;
1716
1717 case 'u':
1718 if (strcmp(p2, "ml") == 0) return 252;
1719 }
1720 break;
1721
1722 case 'y':
1723 if (strcmp(pszTag, "yacute") == 0) return 253;
1724 if (strcmp(pszTag, "yen") == 0) return 165;
1725 if (strcmp(pszTag, "yuml") == 0) return 255;
1726 break;
1727
1728 case 'A':
1729 switch (c1)
1730 {
1731 case 'u':
1732 if (strcmp(p2, "ml") == 0) return 196;
1733 break;
1734
1735 case 'a':
1736 if (strcmp(p2, "cute") == 0) return 193;
1737 break;
1738
1739 case 'c':
1740 if (strcmp(p2, "irc") == 0) return 194;
1741 break;
1742
1743 case 'E':
1744 if (strcmp(p2, "lig") == 0) return 198;
1745 break;
1746
1747 case 'g':
1748 if (strcmp(p2, "rave") == 0) return 192;
1749 break;
1750
1751 case 'r':
1752 if (strcmp(p2, "ing") == 0) return 197;
1753 break;
1754
1755 case 't':
1756 if (strcmp(p2, "ilde") == 0) return 195;
1757 }
1758 break;
1759
1760 case 'C':
1761 if (strcmp(pszTag, "Ccedil") == 0) return 199;
1762 break;
1763
1764 case 'E':
1765 if (strcmp(pszTag, "Ecirc") == 0) return 202;
1766 if (strcmp(pszTag, "Eacute") == 0) return 201;
1767 if (strcmp(pszTag, "Egrave") == 0) return 200;
1768 if (strcmp(pszTag, "ETH") == 0) return 208;
1769 if (strcmp(pszTag, "Euml") == 0) return 203;
1770 break;
1771
1772 case 'I':
1773 if (strcmp(pszTag, "Icirc") == 0) return 206;
1774 if (strcmp(pszTag, "Iacute") == 0) return 205;
1775 if (strcmp(pszTag, "Igrave") == 0) return 204;
1776 if (strcmp(pszTag, "Iuml") == 0) return 207;
1777 break;
1778
1779 case 'N':
1780 if (strcmp(pszTag, "Ntilde") == 0) return 209;
1781 break;
1782
1783 case 'O':
1784 switch (c1)
1785 {
1786 case 'u':
1787 if (strcmp(p2, "ml") == 0) return 214;
1788 break;
1789
1790 case 'a':
1791 if (strcmp(p2, "cute") == 0) return 211;
1792 break;
1793
1794 case 'c':
1795 if (strcmp(p2, "irc") == 0) return 212;
1796 break;
1797
1798 case 'g':
1799 if (strcmp(p2, "rave") == 0) return 210;
1800 break;
1801
1802 case 't':
1803 if (strcmp(p2, "ilde") == 0) return 213;
1804 break;
1805
1806 case 's':
1807 if (strcmp(p2, "lash") == 0) return 216;
1808 }
1809 break;
1810
1811 case 'U':
1812 switch (c1)
1813 {
1814 case 'a':
1815 if (strcmp(p2, "cute") == 0) return 218;
1816 break;
1817
1818 case 'c':
1819 if (strcmp(p2, "irc") == 0) return 219;
1820 break;
1821
1822 case 'g':
1823 if (strcmp(p2, "rave") == 0) return 217;
1824 break;
1825
1826 case 'u':
1827 if (strcmp(p2, "ml") == 0) return 220;
1828 }
1829 break;
1830
1831 case 'T':
1832 if (strcmp(pszTag, "THORN") == 0) return 222;
1833 break;
1834
1835 case 'Y':
1836 if (strcmp(pszTag, "Yacute") == 0) return 221;
1837 break;
1838 }
1839
1840 return (crc);
1841}
1842
1843/*
1844 *@@ HandleEscape:
1845 * called by txvConvertFromHTML when a "&" character
1846 * is found in the source buffer. This calls
1847 * ConvertEscape in turn.
1848 *
1849 *@@added V0.9.3 (2000-05-18) [umoeller]
1850 */
1851
1852VOID HandleEscape(PCOPYTARGET pct)
1853{
1854 // ampersand:
1855 // replace special characters
1856 PSZ pStartOfTag = pct->pSource;
1857 // find end of tag
1858 PSZ p2 = pStartOfTag,
1859 pNextClose = 0,
1860 pNextSpace = 0;
1861 BOOL fCont = TRUE;
1862 while (fCont)
1863 {
1864 switch (*p2)
1865 {
1866 case 0:
1867 fCont = FALSE;
1868 break;
1869
1870 case ';':
1871 pNextClose = p2;
1872 fCont = FALSE;
1873 break;
1874
1875 case ' ':
1876 if (!pNextSpace)
1877 pNextSpace = p2;
1878 break;
1879 }
1880 p2++;
1881 }
1882
1883 if (!pNextClose)
1884 // no closing tag found:
1885 // just insert the '&' and go on, we have no tag here
1886 AppendChar(pct,
1887 *pct->pSource++);
1888 else
1889 {
1890 if ((pNextSpace) && (pNextSpace < pNextClose))
1891 // space before ';':
1892 // just insert the '&' and go on, we have no tag here
1893 AppendChar(pct,
1894 *pct->pSource++);
1895 else if ((!pNextClose) || (pNextClose <= pStartOfTag + 1))
1896 AppendChar(pct,
1897 *pct->pSource++);
1898 else
1899 {
1900 ULONG ulCode = 0;
1901
1902 // create substring with tag
1903 PSZ pszTag = pStartOfTag + 1;
1904 *pNextClose = 0;
1905
1906 if (*pszTag == '#')
1907 {
1908 // latin-1 or Unicode encoding (&#000;)
1909 ulCode = atoi(pszTag + 1);
1910
1911 // next input: char after ';'
1912 pct->pSource = pNextClose + 1;
1913 }
1914 else
1915 {
1916 // named entity:
1917 // find char code corresponding to escape
1918 // from G_EscapeProcessors map
1919 ulCode = ConvertEscape(pszTag);
1920 if (ulCode)
1921 // tag supported:
1922 pct->pSource = pNextClose + 1;
1923 else
1924 // tag not supported:
1925 ulCode = *pct->pSource++;
1926 }
1927
1928 // restore closing tag which we overwrote
1929 *pNextClose = ';';
1930
1931 if (ulCode)
1932 {
1933 AppendLinebreakCheck(pct);
1934
1935 AppendChar(pct,
1936 (CHAR)ulCode);
1937 pct->fSkipNextSpace = FALSE;
1938 }
1939 }
1940 }
1941}
1942
1943/* ******************************************************************
1944 * *
1945 * Entry points *
1946 * *
1947 ********************************************************************/
1948
1949/*
1950 *@@ txvConvertFromHTML:
1951 * this modifies the given text string (which should
1952 * be the complete BODY block of any HTML file) so
1953 * that all HTML tags are removed and replaced with
1954 * escape sequences that the XTextView control understands.
1955 *
1956 * The buffer gets reallocated by this function, so it
1957 * must be free()'able.
1958 *
1959 * So, to have the XTextView control display an HTML file,
1960 * do this:
1961 *
1962 * 1) Load an HTML file into a buffer allocated by malloc().
1963 *
1964 * 2) Call txvConvertFromHTML.
1965 *
1966 * 3) Call WinSetWindowText on the XTextView control with
1967 * the modified buffer.
1968 *
1969 * This understands the following limited subset of HTML:
1970 *
1971 * Paragraph tags:
1972 *
1973 * -- P, BR
1974 * -- PRE, /PRE
1975 * -- UL, /UL, OL, /OL, LI
1976 * -- DL, /DL, DT, DD
1977 * -- H1, /H1 thru H6, /H6
1978 * -- Comments (<!-- .... -->)
1979 *
1980 * Character tags:
1981 *
1982 * -- B, /B, STRONG, /STRONG
1983 * -- I, /I, EM, /EM, VAR, /VAR, CITE, /CITE
1984 * -- CODE, /CODE, SAMP, /SAMP, KBD, /KBD, TT, /TT
1985 * -- U, /U
1986 * -- STRIKE, /STRIKE
1987 * -- CODE, /CODE
1988 *
1989 * The most obvious limitation is that neither tables
1990 * nor frames are supported. Also forget about CSS
1991 * and JavaScript, of course.
1992 *
1993 * All the ampersand (&amp; something) sequences defined
1994 * in HTML 3 are properly translated.
1995 *
1996 * Note: Those are translated to the ANSI (MS-Windows,
1997 * OS/2 codepage 1004) character set. This has the
1998 * following characteristics:
1999 *
2000 * -- Codes 0-127 are identical to ASCII and thus
2001 * ISO 8559-1 ("Latin 1") also.
2002 *
2003 * -- Codes 160-255 are identical to ISO 8559-1 ("Latin 1").
2004 *
2005 * -- Codes 128-159 are NOT defined in ISO 8559-1, but
2006 * Netscape treats those as ANSI as well, so we do too.
2007 *
2008 * As a result, consider the output to be in OS/2 codepage
2009 * 1004. Either set your codepage to that (WinSetCp)
2010 * or translate the output (WinCpTranslateString).
2011 *
2012 * &#xxx; tags (with xxx being a decimal) are considered
2013 * ANSI codes as well. Even though HTML 4.0 allows Unicode
2014 * characters > 255 to be inserted this way, we ignore
2015 * those. Unicode chars from 0 to 255 are identical to
2016 * ANSI, so for &#000; to &#255;, we are HTML-compliant.
2017 *
2018 * All other tags are completely thrown out.
2019 *
2020 *@@added V0.9.3 (2000-05-06) [umoeller]
2021 */
2022
2023BOOL txvConvertFromHTML(char **ppszText,
2024 PVOID pxhtml, // out: various config data (PXHTMLDATA)
2025 PULONG pulProgress, // out: progress (ptr can be NULL)
2026 PBOOL pfCancel) // in: cancel flag (ptr can be NULL)
2027{
2028 BOOL brc = TRUE;
2029
2030 PSZ pszNew,
2031 pTarget;
2032 ULONG cbSource = strlen(*ppszText);
2033
2034 COPYTARGET ct = {0};
2035
2036 lstInit(&ct.llLists,
2037 TRUE); // free items
2038
2039 ct.pSource = *ppszText;
2040 // skip leading spaces
2041 ct.fSkipNextSpace = TRUE;
2042 ct.pxhtml = (PXHTMLDATA)pxhtml;
2043
2044 // step 2:
2045 // actual tags formatting
2046
2047 while (TRUE)
2048 {
2049 CHAR c = *ct.pSource;
2050
2051 if (pfCancel)
2052 if (*pfCancel)
2053 {
2054 brc = FALSE;
2055 break;
2056 }
2057
2058 if (!c)
2059 // null terminator reached:
2060 break;
2061
2062 // calculate progress
2063 if (pulProgress)
2064 *pulProgress = ((ct.pSource - *ppszText) // characters done
2065 * 100
2066 / cbSource); // characters total
2067
2068 switch (c)
2069 {
2070 case '<':
2071 HandleTag(&ct);
2072 break;
2073
2074 case '&':
2075 HandleEscape(&ct);
2076 break;
2077
2078 case '\r':
2079 // skip
2080 if (!ct.fSkipNextSpace)
2081 {
2082 AppendChar(&ct,
2083 ' ');
2084 // ct.fNeedsLinebreak = FALSE;
2085 // but skip leading spaces which might follow
2086 if (!ct.fPRE)
2087 ct.fSkipNextSpace = TRUE;
2088 }
2089 ct.pSource++;
2090 break;
2091
2092 case '\t':
2093 {
2094 if (ct.fPRE)
2095 {
2096 ULONG ul;
2097 for (ul = 0;
2098 ul < 8;
2099 ul++)
2100 AppendChar(&ct,
2101 ' ');
2102 }
2103 else
2104 {
2105 // not in PRE block:
2106 if ( (!ct.fSkipNextSpace)
2107 // && (!ct.fNeedsLinebreak)
2108 )
2109 // last was not space: copy
2110 AppendChar(&ct,
2111 ' ');
2112
2113 ct.fSkipNextSpace = TRUE;
2114 }
2115
2116 // skip the tab
2117 ct.pSource++;
2118 break; }
2119
2120 case '\n':
2121 {
2122 // newline char:
2123 if (!ct.fPRE)
2124 {
2125 // if not in PRE mode, replace with space
2126 if (!ct.fSkipNextSpace)
2127 {
2128 AppendChar(&ct,
2129 ' ');
2130 // ct.fNeedsLinebreak = FALSE;
2131 // but skip leading spaces which might follow
2132 ct.fSkipNextSpace = TRUE;
2133 }
2134 }
2135 else
2136 // in PRE mode, preserve line breaks
2137 AppendChar(&ct, '\n'); // ct.fNeedsLinebreak = TRUE;
2138
2139 ct.pSource++;
2140 break; }
2141
2142 case '\xFF':
2143 {
2144 AppendChar(&ct,
2145 ' ');
2146 ct.pSource++;
2147 break; }
2148
2149 case ' ':
2150 if (!ct.fPRE)
2151 {
2152 // is space, and not in PRE block:
2153 if ( (!ct.fSkipNextSpace)
2154 // && (!ct.fNeedsLinebreak)
2155 )
2156 // last was not space: copy
2157 AppendChar(&ct,
2158 ' ');
2159
2160 ct.fSkipNextSpace = TRUE;
2161 }
2162 else
2163 // in PRE, always add all spaces
2164 AppendChar(&ct,
2165 ' ');
2166 ct.pSource++;
2167 break;
2168
2169 default:
2170 // if we're not inserting escapes or anything,
2171 // check if a linebreak is needed
2172 AppendLinebreakCheck(&ct);
2173
2174 AppendChar(&ct,
2175 *ct.pSource++);
2176 ct.fSkipNextSpace = FALSE;
2177 ct.fSkipNextLinebreak = FALSE;
2178
2179 } // end switch (*pSource);
2180 } // end while (*pSource)
2181 AppendChar(&ct,
2182 '\n');
2183 // append null-terminator
2184 AppendChar(&ct,
2185 0);
2186
2187 free(*ppszText);
2188 *ppszText = ct.pszNew;
2189
2190 lstClear(&ct.llLists);
2191
2192 return (brc);
2193}
2194
2195
Note: See TracBrowser for help on using the repository browser.