Changeset 35 for trunk/src/helpers/xml.c
- Timestamp:
- Feb 14, 2001, 10:01:57 PM (25 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/helpers/xml.c
r14 r35 2 2 /* 3 3 *@@sourcefile xml.c: 4 * XML parsing. 5 * 6 * This is vaguely modelled after the Document Object Model 7 * (DOM) standardized by the W3C. 4 * XML document handling. 5 * 6 * XML support in the XWorkplace Helpers is broken into two 7 * layers: 8 * 9 * -- The bottom layer is implemented by expat, which I have 10 * ported to this library. See xmlparse.c for an introduction. 11 * 12 * -- Because expat requires so many callbacks and is non-validating, 13 * I have added a top layer above the expat library 14 * which is vaguely modelled after the Document Object Model 15 * (DOM) standardized by the W3C. That's this file. 16 * 17 * To understand and use this code, you should be familiar with 18 * the following: 19 * 20 * -- XML parsers operate on XML @documents. 21 * 22 * -- Each XML document has both a logical and a physical 23 * structure. 24 * 25 * Physically, the document is composed of units called 26 * @entities. 27 * 28 * Logically, the document is composed of @markup and 29 * @content. Among other things, markup separates the content 30 * into @elements. 31 * 32 * -- The logical and physical structures must nest properly (be 33 * @well-formed) for each entity, which results in the entire 34 * XML document being well-formed as well. 35 * 36 * <B>Document Object Model (DOM)</B> 8 37 * 9 38 * In short, DOM specifies that an XML document is broken 10 39 * up into a tree of nodes, representing the various parts 11 * of an XML document. Most importantly, we have: 12 * 13 * -- ELEMENT: some XML tag or a pair of tags (e.g. <LI>...</LI>). 14 * 15 * -- ATTRIBUTE: an attribute to an element. 16 * 17 * -- TEXT: a piece of, well, text. 18 * 19 * -- COMMENT: a comment. 20 * 21 * See xmlParse() for a more detailed explanation. 22 * 23 * However, since this implementation was supposed to be a 24 * C-only interface, we do not implement inheritance. Instead, 25 * each XML document is broken up into a tree of DOMNODE's only, 26 * each of which has a special type. 
27 * 28 * It shouldn't be too difficult to write a C++ encapsulation 29 * of this which implements all the methods required by the DOM 30 * standard. 31 * 32 * The main entry point into this is xmlParse or 33 * xmlCreateDocumentFromString. See remarks there for details. 34 * 35 * Limitations: 36 * 37 * 1) This presently only parses ELEMENT, ATTRIBUTE, TEXT, 38 * and COMMENT nodes. 39 * 40 * 2) This doesn't use 16-bit characters, but 8-bit characters. 41 * 42 *@@header "helpers\xml.h" 43 *@@added V0.9.6 (2000-10-29) [umoeller] 44 */ 45 46 /* 47 * Copyright (C) 2000 Ulrich Möller. 48 * This file is part of the "XWorkplace helpers" source package. 49 * This is free software; you can redistribute it and/or modify 50 * it under the terms of the GNU General Public License as published 51 * by the Free Software Foundation, in version 2 as it comes in the 52 * "COPYING" file of the XWorkplace main distribution. 53 * This program is distributed in the hope that it will be useful, 54 * but WITHOUT ANY WARRANTY; without even the implied warranty of 55 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 56 * GNU General Public License for more details. 57 */ 58 59 #define OS2EMX_PLAIN_CHAR 60 // this is needed for "os2emx.h"; if this is defined, 61 // emx will define PSZ as _signed_ char, otherwise 62 // as unsigned char 63 64 #define INCL_DOSERRORS 65 #include <os2.h> 66 67 #include <stdlib.h> 68 #include <string.h> 69 70 #include "setup.h" // code generation and debugging options 71 72 #include "helpers\linklist.h" 73 #include "helpers\stringh.h" 74 #include "helpers\xml.h" 75 76 #pragma hdrstop 77 78 /* 79 *@@category: Helpers\C helpers\XML\Node management 80 */ 81 82 /* ****************************************************************** 83 * 84 * Node Management 85 * 86 ********************************************************************/ 87 88 /* 89 *@@ xmlCreateNode: 90 * creates a new DOMNODE with the specified 91 * type and parent. 
92 */ 93 94 PDOMNODE xmlCreateNode(PDOMNODE pParentNode, 95 ULONG ulNodeType) 96 { 97 PDOMNODE pNewNode = (PDOMNODE)malloc(sizeof(DOMNODE)); 98 if (pNewNode) 99 { 100 memset(pNewNode, 0, sizeof(DOMNODE)); 101 pNewNode->ulNodeType = ulNodeType; 102 pNewNode->pParentNode = pParentNode; 103 if (pParentNode) 104 { 105 // parent specified: 106 // append this new node to the parent's 107 // list of child nodes 108 lstAppendItem(&pParentNode->listChildNodes, 109 pNewNode); 110 } 111 112 lstInit(&pNewNode->listChildNodes, FALSE); 113 lstInit(&pNewNode->listAttributeNodes, FALSE); 114 } 115 116 return (pNewNode); 117 } 118 119 /* 120 *@@ xmlDeleteNode: 121 * deletes the specified node. 122 * 123 * If the node has child nodes, all of them are deleted 124 * as well. This recurses, if necessary. 125 * 126 * As a result, if the node is a document node, this 127 * deletes an entire document, including all of its 128 * child nodes. 129 * 130 * Returns: 131 * 132 * -- 0: NO_ERROR. 133 */ 134 135 ULONG xmlDeleteNode(PDOMNODE pNode) 136 { 137 ULONG ulrc = 0; 138 139 if (!pNode) 140 { 141 ulrc = DOMERR_NOT_FOUND; 142 } 143 else 144 { 145 // recurse into child nodes 146 PLISTNODE pNodeThis = lstQueryFirstNode(&pNode->listChildNodes); 147 while (pNodeThis) 148 { 149 // recurse!! 150 xmlDeleteNode((PDOMNODE)(pNodeThis->pItemData)); 151 152 pNodeThis = pNodeThis->pNext; 153 } 154 155 // delete attribute nodes 156 pNodeThis = lstQueryFirstNode(&pNode->listAttributeNodes); 157 while (pNodeThis) 158 { 159 // recurse!! 
160 xmlDeleteNode((PDOMNODE)(pNodeThis->pItemData)); 161 162 pNodeThis = pNodeThis->pNext; 163 } 164 165 if (pNode->pParentNode) 166 { 167 // node has a parent: 168 // remove this node from the parent's list 169 // of child nodes before deleting this node 170 lstRemoveItem(&pNode->pParentNode->listChildNodes, 171 pNode); 172 pNode->pParentNode = NULL; 173 } 174 175 if (pNode->pszNodeName) 176 { 177 free(pNode->pszNodeName); 178 pNode->pszNodeName = NULL; 179 } 180 if (pNode->pszNodeValue) 181 { 182 free(pNode->pszNodeValue); 183 pNode->pszNodeValue = NULL; 184 } 185 186 free(pNode); 187 } 188 189 return (ulrc); 190 } 191 192 /* 193 *@@category: Helpers\C helpers\XML\Parsing 194 */ 195 196 /* ****************************************************************** 197 * 198 * Tokenizing (Compiling) 199 * 200 ********************************************************************/ 201 202 /* 203 *@@ xmlTokenize: 204 * this takes any block of XML text and "tokenizes" 205 * it. 206 * 207 * Tokenizing (or compiling, or "scanning" in bison/flex 208 * terms) means preparing the XML code for parsing later. 209 * This finds all tags and tag attributes and creates 210 * special codes for them in the output buffer. 
211 * 212 * For example: 213 + 214 + <TAG ATTR="text"> block </TAG> 215 + 216 * becomes 217 * 218 + 0xFF escape code 219 + 0x01 tag start code 220 + "TAG" tag name 221 + 0xFF end of tag name code 222 + 223 + 0xFF escape code 224 + 0x03 attribute name code 225 + "ATTR" attribute name 226 + 0xFF 227 + "text" attribute value (without quotes) 228 + 0xFF end of attribute code 229 + 230 + " block " regular text 231 + 232 + 0xFF escape code 233 + 0x01 tag start code 234 + "/TAG" tag name 235 + 0xFF end of tag name code 236 * 237 *@@added V0.9.6 (2000-11-01) [umoeller] 238 */ 239 240 PSZ xmlTokenize(const char *pcszXML) 241 { 242 return (0); 243 } 244 245 /* ****************************************************************** 246 * 247 * Parsing 248 * 249 ********************************************************************/ 250 251 /* 252 * TAGFOUND: 253 * structure created for each tag by BuildTagsList. 254 */ 255 256 typedef struct _TAGFOUND 257 { 258 BOOL fIsComment; 259 const char *pOpenBrck; 260 const char *pStartOfTagName; 261 const char *pFirstAfterTagName; 262 const char *pCloseBrck; // ptr to '>' char; this plus one should 263 // point to after the tag 264 } TAGFOUND, *PTAGFOUND; 265 266 /* 267 * BuildTagsList: 268 * builds a LINKLIST containing TAGFOUND structs for 269 * each tag found in the specified buffer. 270 * 271 * This is a flat list without any tree structure. This 272 * only searches for the tags and doesn't create any 273 * hierarchy. 274 * 275 * The tags are simply added to the list in the order 276 * in which they are found in pcszBuffer. 277 * 278 * The list is auto-free, you can simply do a lstFree 279 * to clean up. 
280 */ 281 282 PLINKLIST BuildTagsList(const char *pcszBuffer) 283 { 284 PLINKLIST pllTags = lstCreate(TRUE); 285 286 const char *pSearchPos = pcszBuffer; 287 288 while ((pSearchPos) && (*pSearchPos)) 289 { 290 // find first '<' 291 PSZ pOpenBrck = strchr(pSearchPos, '<'); 292 if (!pOpenBrck) 293 // no open bracket found: stop search 294 pSearchPos = 0; 295 else 296 { 297 if (strncmp(pOpenBrck + 1, "!--", 3) == 0) 298 { 299 // it's a comment: 300 // treat that differently 301 const char *pEndOfComment = strstr(pOpenBrck + 4, "-->"); 302 const char *pCloseBrck = 0; 303 const char *pFirstAfterTagName = 0; 304 PTAGFOUND pTagFound; 305 if (!pEndOfComment) 306 { 307 // no end of comment found: 308 // skip entire rest of string 309 pCloseBrck = pOpenBrck + strlen(pOpenBrck); 310 pFirstAfterTagName = pCloseBrck; 311 pSearchPos = 0; 312 } 313 else 314 { 315 pCloseBrck = pEndOfComment + 2; // point directly to '>' 316 pFirstAfterTagName = pCloseBrck + 1; 317 } 318 319 // append it to the list 320 pTagFound = (PTAGFOUND)malloc(sizeof(TAGFOUND)); 321 if (!pTagFound) 322 // error: 323 pSearchPos = 0; 324 else 325 { 326 pTagFound->fIsComment = TRUE; 327 pTagFound->pOpenBrck = pOpenBrck; 328 pTagFound->pStartOfTagName = pOpenBrck + 1; 329 pTagFound->pFirstAfterTagName = pFirstAfterTagName; 330 pTagFound->pCloseBrck = pCloseBrck; 331 332 lstAppendItem(pllTags, pTagFound); 333 } 334 335 pSearchPos = pFirstAfterTagName; 336 } 337 else 338 { 339 // no comment: 340 // find matching closing bracket 341 const char *pCloseBrck = strchr(pOpenBrck + 1, '>'); 342 if (!pCloseBrck) 343 pSearchPos = 0; 344 else 345 { 346 const char *pNextOpenBrck = strchr(pOpenBrck + 1, '<'); 347 // if we have another opening bracket before the closing bracket, 348 if ((pNextOpenBrck) && (pNextOpenBrck < pCloseBrck)) 349 // ignore this one 350 pSearchPos = pNextOpenBrck; 351 else 352 { 353 // OK, apparently we have a tag. 354 // Skip all spaces after the tag. 
355 const char *pTagName = pOpenBrck + 1; 356 while ( (*pTagName) 357 && ( (*pTagName == ' ') 358 || (*pTagName == '\r') 359 || (*pTagName == '\n') 360 ) 361 ) 362 pTagName++; 363 if (!*pTagName) 364 // no tag name: stop 365 pSearchPos = 0; 366 else 367 { 368 // ookaaayyy, we got a tag now. 369 // Find first space or ">" after tag name: 370 const char *pFirstAfterTagName = pTagName + 1; 371 while ( (*pFirstAfterTagName) 372 && (*pFirstAfterTagName != ' ') 373 && (*pFirstAfterTagName != '\n') 374 && (*pFirstAfterTagName != '\r') 375 && (*pFirstAfterTagName != '\t') // tab 376 && (*pFirstAfterTagName != '>') 377 ) 378 pFirstAfterTagName++; 379 if (!*pFirstAfterTagName) 380 // no closing bracket found: 381 pSearchPos = 0; 382 else 383 { 384 // got a tag name: 385 // append it to the list 386 PTAGFOUND pTagFound = (PTAGFOUND)malloc(sizeof(TAGFOUND)); 387 if (!pTagFound) 388 // error: 389 pSearchPos = 0; 390 else 391 { 392 pTagFound->fIsComment = FALSE; 393 pTagFound->pOpenBrck = pOpenBrck; 394 pTagFound->pStartOfTagName = pTagName; 395 pTagFound->pFirstAfterTagName = pFirstAfterTagName; 396 pTagFound->pCloseBrck = pCloseBrck; 397 398 lstAppendItem(pllTags, pTagFound); 399 400 // search on after closing bracket 401 pSearchPos = pCloseBrck + 1; 402 } 403 } 404 } 405 } 406 } // end else if (!pCloseBrck) 407 } // end else if (strncmp(pOpenBrck + 1, "!--")) 408 } // end if (pOpenBrck) 409 } // end while 410 411 return (pllTags); 412 } 413 414 /* 415 *@@ CreateTextNode: 416 * shortcut for creating a TEXT node. Calls 417 * xmlCreateNode in turn. 418 * 419 * The text is extracted from in between the 420 * two pointers using strhSubstr. 
421 */ 422 423 PDOMNODE CreateTextNode(PDOMNODE pParentNode, 424 const char *pStart, 425 const char *pEnd) 426 { 427 PDOMNODE pNewTextNode = xmlCreateNode(pParentNode, 428 DOMNODE_TEXT); 429 if (pNewTextNode) 430 pNewTextNode->pszNodeValue = strhSubstr(pStart, 431 pEnd); 432 433 return (pNewTextNode); 434 } 435 436 /* 437 *@@ CreateElementNode: 438 * shortcut for creating a new ELEMENT node and 439 * parsing attributes at the same time. 440 * 441 * pszTagName is assumed to be static (no copy 442 * is made). 443 * 444 * pAttribs is assumed to point to an attributes 445 * string. This function creates ATTRIBUTE nodes 446 * from that string until either a null character 447 * or '>' is found. 448 */ 449 450 PDOMNODE CreateElementNode(PDOMNODE pParentNode, 451 PSZ pszTagName, 452 const char *pAttribs) // in: ptr to attribs; can be NULL 453 { 454 PDOMNODE pNewNode = xmlCreateNode(pParentNode, 455 DOMNODE_ELEMENT); 456 if (pNewNode) 457 { 458 const char *p = pAttribs; 459 460 pNewNode->pszNodeName = pszTagName; 461 462 // find-start-of-attribute loop 463 while (p) 464 { 465 switch (*p) 466 { 467 case 0: 468 case '>': 469 p = 0; 470 break; 471 472 case ' ': 473 case '\t': // tab 474 case '\n': 475 case '\r': 476 p++; 477 break; 478 479 default: 480 { 481 // first (or next) non-space: 482 // that's the start of an attrib, probably 483 // go until we find a space or '>' 484 485 const char *pNameStart = p, 486 *p2 = p; 487 488 const char *pEquals = 0, 489 *pFirstQuote = 0, 490 *pEnd = 0; // last char... non-inclusive! 
491 492 // copy-rest-of-attribute loop 493 while (p2) 494 { 495 switch (*p2) 496 { 497 case '"': 498 if (!pEquals) 499 { 500 // '"' cannot appear before '=' 501 p2 = 0; 502 p = 0; 503 } 504 else 505 { 506 if (pFirstQuote) 507 { 508 // second quote: 509 // get value between quotes 510 pEnd = p2; 511 // we're done with this one 512 p = p2 + 1; 513 p2 = 0; 514 } 515 else 516 { 517 // first quote: 518 pFirstQuote = p2; 519 p2++; 520 } 521 } 522 break; 523 524 case '=': 525 if (!pEquals) 526 { 527 // first equals sign: 528 pEquals = p2; 529 // extract name 530 p2++; 531 } 532 else 533 if (pFirstQuote) 534 p2++; 535 else 536 { 537 // error 538 p2 = 0; 539 p = 0; 540 } 541 break; 542 543 case ' ': 544 case '\t': // tab 545 case '\n': 546 case '\r': 547 // spaces can appear in quotes 548 if (pFirstQuote) 549 // just continue 550 p2++; 551 else 552 { 553 // end of it! 554 pEnd = p2; 555 p = p2 + 1; 556 p2 = 0; 557 } 558 break; 559 560 case 0: 561 case '>': 562 { 563 pEnd = p2; 564 // quit inner AND outer loop 565 p2 = 0; 566 p = 0; 567 break; } 568 569 default: 570 p2++; 571 } 572 } // end while (p2) 573 574 if (pEnd) 575 { 576 PDOMNODE pAttribNode = xmlCreateNode(pNewNode, 577 DOMNODE_ATTRIBUTE); 578 if (pAttribNode) 579 { 580 if (pEquals) 581 { 582 pAttribNode->pszNodeName 583 = strhSubstr(pNameStart, pEquals); 584 585 // did we have quotes? 586 if (pFirstQuote) 587 pAttribNode->pszNodeValue 588 = strhSubstr(pFirstQuote + 1, pEnd); 589 else 590 pAttribNode->pszNodeValue 591 = strhSubstr(pEquals + 1, pEnd); 592 } 593 else 594 // no "equals": 595 pAttribNode->pszNodeName 596 = strhSubstr(pNameStart, pEnd); 597 } 598 } 599 break; } 600 } 601 } 602 } 603 604 return (pNewNode); 605 } 606 607 /* 608 *@@ CreateNodesForBuf: 609 * this gets called (recursively) for a piece of text 610 * for which we need to create TEXT and ELEMENT DOMNODE's. 611 * 612 * This does the heavy work for xmlParse. 
613 * 614 * If an error (!= 0) is returned, *ppError points to 615 * the code part that failed. 616 */ 617 618 ULONG CreateNodesForBuf(const char *pcszBufStart, 619 const char *pcszBufEnd, // in: can be NULL 620 PLINKLIST pllTagsList, 621 PDOMNODE pParentNode, 622 PFNVALIDATE pfnValidateTag, 623 const char **ppError) 624 { 625 ULONG ulrc = 0; 626 PLISTNODE pCurrentTagListNode = lstQueryFirstNode(pllTagsList); 627 const char *pBufCurrent = pcszBufStart; 628 BOOL fContinue = TRUE; 629 630 if (pcszBufEnd == NULL) 631 pcszBufEnd = pcszBufStart + strlen(pcszBufStart); 632 633 while (fContinue) 634 { 635 if ( (!*pBufCurrent) 636 || (pBufCurrent == pcszBufEnd) 637 ) 638 // end of buf reached: 639 fContinue = FALSE; 640 641 else if (!pCurrentTagListNode) 642 { 643 // no (more) tags for this buffer: 644 CreateTextNode(pParentNode, 645 pBufCurrent, 646 pcszBufEnd); 647 fContinue = FALSE; 648 } 649 else 650 { 651 // another tag found: 652 PTAGFOUND pFoundTag = (PTAGFOUND)pCurrentTagListNode->pItemData; 653 const char *pStartOfTag = pFoundTag->pOpenBrck; 654 if (pStartOfTag > pBufCurrent + 1) 655 { 656 // we have text before the opening tag: 657 // make a DOMTEXT out of this 658 CreateTextNode(pParentNode, 659 pBufCurrent, 660 pStartOfTag); 661 pBufCurrent = pStartOfTag; 662 } 663 else 664 { 665 // OK, go for this tag... 
666 667 if (*(pFoundTag->pStartOfTagName) == '/') 668 { 669 // this is a closing tag: that's an error 670 ulrc = 1; 671 *ppError = pFoundTag->pStartOfTagName; 672 fContinue = FALSE; 673 } 674 else if (pFoundTag->fIsComment) 675 { 676 // it's a comment: that's simple 677 PDOMNODE pCommentNode = xmlCreateNode(pParentNode, 678 DOMNODE_COMMENT); 679 if (!pCommentNode) 680 ulrc = ERROR_NOT_ENOUGH_MEMORY; 681 else 682 { 683 pCommentNode->pszNodeValue = strhSubstr(pFoundTag->pOpenBrck + 4, 684 pFoundTag->pCloseBrck - 2); 685 } 686 pBufCurrent = pFoundTag->pCloseBrck + 1; 687 } 688 else 689 { 690 BOOL fKeepTagName = FALSE; // free pszTagName below 691 PSZ pszTagName = strhSubstr(pFoundTag->pStartOfTagName, 692 pFoundTag->pFirstAfterTagName); 693 if (!pszTagName) 694 // zero-length string: 695 // go ahead after that 696 pBufCurrent = pFoundTag->pCloseBrck + 1; 697 else 698 { 699 // XML knows two types of elements: 700 701 // a) Element pairs, which have opening and closing tags 702 // (<TAG> and </TAG> 703 // b) Single elements, which must have "/" as their last 704 // character; these have no closing tag 705 // (<TAG/>) 706 707 // However, HTML doesn't usually tag single elements 708 // with a trailing '/'. To maintain compatibility, 709 // if we don't find a matching closing tag, we extract 710 // everything up to the end of the buffer. 711 712 ULONG ulTagNameLen = strlen(pszTagName); 713 714 // search for closing tag first... 
715 // create string with closing tag to search for; 716 // that's '/' plus opening tag name 717 ULONG ulClosingTagLen2Find = ulTagNameLen + 1; 718 PSZ pszClosingTag2Find = (PSZ)malloc(ulClosingTagLen2Find + 1); // plus null byte 719 PLISTNODE pTagListNode2 = pCurrentTagListNode->pNext; 720 PLISTNODE pTagListNodeForChildren = pTagListNode2; 721 722 BOOL fClosingTagFound = FALSE; 723 724 *pszClosingTag2Find = '/'; 725 strcpy(pszClosingTag2Find + 1, pszTagName); 726 727 // now find matching closing tag 728 while (pTagListNode2) 729 { 730 PTAGFOUND pFoundTag2 = (PTAGFOUND)pTagListNode2->pItemData; 731 ULONG ulFoundTag2Len = (pFoundTag2->pFirstAfterTagName - pFoundTag2->pStartOfTagName); 732 // compare tag name lengths 733 if (ulFoundTag2Len == ulClosingTagLen2Find) 734 { 735 // same length: 736 // compare 737 if (memcmp(pFoundTag2->pStartOfTagName, 738 pszClosingTag2Find, 739 ulClosingTagLen2Find) 740 == 0) 741 { 742 // found matching closing tag: 743 744 // we now have 745 // -- pCurrentTagListNode pointing to the opening tag 746 // (pFoundTag has its PTAGFOUND item data) 747 // -- pTagListNode2 pointing to the closing tag 748 // (pFoundTag2 has its PTAGFOUND item data) 749 750 // create DOM node 751 PDOMNODE pNewNode = CreateElementNode(pParentNode, 752 pszTagName, 753 pFoundTag->pFirstAfterTagName); 754 if (pNewNode) 755 { 756 ULONG ulAction = XMLACTION_BREAKUP; 757 758 fKeepTagName = TRUE; // do not free below 759 760 // validate tag 761 if (pfnValidateTag) 762 { 763 // validator specified: 764 ulAction = pfnValidateTag(pszTagName); 765 } 766 767 if (ulAction == XMLACTION_COPYASTEXT) 768 { 769 CreateTextNode(pNewNode, 770 pFoundTag->pCloseBrck + 1, 771 pFoundTag2->pOpenBrck - 1); 772 } 773 else if (ulAction == XMLACTION_BREAKUP) 774 { 775 PLINKLIST pllSubList = lstCreate(FALSE); 776 PLISTNODE pSubNode = 0; 777 ULONG cSubNodes = 0; 778 779 // text buffer to search 780 const char *pSubBufStart = pFoundTag->pCloseBrck + 1; 781 const char *pSubBufEnd = 
pFoundTag2->pOpenBrck; 782 783 // create a child list containing 784 // all tags from the first tag after 785 // the current opening tag to the closing tag 786 for (pSubNode = pTagListNodeForChildren; 787 pSubNode != pTagListNode2; 788 pSubNode = pSubNode->pNext) 789 { 790 lstAppendItem(pllSubList, 791 pSubNode->pItemData); 792 cSubNodes++; 793 } 794 795 // now recurse to build child nodes 796 // (text and elements), even if the 797 // list is empty, we can have text! 798 CreateNodesForBuf(pSubBufStart, 799 pSubBufEnd, 800 pllSubList, 801 pNewNode, 802 pfnValidateTag, 803 ppError); 804 805 lstFree(pllSubList); 806 } // end if (ulAction == XMLACTION_BREAKUP) 807 808 // now search on after the closing tag 809 // we've found; the next tag will be set below 810 pCurrentTagListNode = pTagListNode2; 811 pBufCurrent = pFoundTag2->pCloseBrck + 1; 812 813 fClosingTagFound = TRUE; 814 815 break; // // while (pTagListNode2) 816 } // end if (pNewNode) 817 } // end if (memcmp(pFoundTag2->pStartOfTagName, 818 } // if (ulFoundTag2Len == ulClosingTagLen2Find) 819 820 pTagListNode2 = pTagListNode2->pNext; 821 822 } // while (pTagListNode2) 823 824 if (!fClosingTagFound) 825 { 826 // no matching closing tag found: 827 // that's maybe a block of not well-formed XML 828 829 // e.g. 
with WarpIN: 830 // <README> <-- we start after this 831 // block of plain HTML with <P> tags and such 832 // </README> 833 834 // just create an element 835 PDOMNODE pNewNode = CreateElementNode(pParentNode, 836 pszTagName, 837 pFoundTag->pFirstAfterTagName); 838 if (pNewNode) 839 fKeepTagName = TRUE; 840 841 // now search on after the closing tag 842 // we've found; the next tag will be set below 843 // pCurrentTagListNode = pTagListNodeForChildren; 844 pBufCurrent = pFoundTag->pCloseBrck + 1; 845 } 846 847 free(pszClosingTag2Find); 848 849 if (!fKeepTagName) 850 free(pszTagName); 851 } // end if (pszTagName) 852 } 853 854 pCurrentTagListNode = pCurrentTagListNode->pNext; 855 } 856 } 857 } 858 859 return (ulrc); 860 } 861 862 /* 863 * xmlParse: 864 * generic XML parser. 865 * 866 * This takes the specified zero-terminated string 867 * in pcszBuf and parses it, adding DOMNODE's as 868 * children to pNode. 869 * 870 * This recurses, if necessary, to build a node tree. 40 * of an XML document. The W3C calls this "a platform- and 41 * language-neutral interface that allows programs and scripts 42 * to dynamically access and update the content, structure 43 * and style of documents. The Document Object Model provides 44 * a standard set of objects for representing HTML and XML 45 * documents, a standard model of how these objects can 46 * be combined, and a standard interface for accessing and 47 * manipulating them. Vendors can support the DOM as an 48 * interface to their proprietary data structures and APIs, 49 * and content authors can write to the standard DOM 50 * interfaces rather than product-specific APIs, thus 51 * increasing interoperability on the Web." 
871 52 * 872 53 * Example: Take this HTML table definition: … … 887 68 * This function will create a tree as follows: 888 69 + 889 + ÚÄÄÄÄÄÄÄÄÄÄÄÄ¿ 890 + ³ TABLE ³ (only ELEMENT node in root DOCUMENT node) 891 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 892 + ³ 893 + ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ 894 + ³ TBODY ³ (only ELEMENT node in root "TABLE" node) 895 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 896 + ÚÄÄÄÄÄÄÄÄÄÄÄÁÄÄÄÄÄÄÄÄÄÄÄ¿ 897 + ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ 898 + ³ TR ³ ³ TR ³ 899 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 900 + ÚÄÄÄÁÄÄÄÄÄÄ¿ ÚÄÄÄÁÄÄÄÄÄÄ¿ 901 + ÚÄÄÄÁÄ¿ ÚÄÄÁÄÄ¿ ÚÄÄÄÁÄ¿ ÚÄÄÁÄÄ¿ 902 + ³ TD ³ ³ TD ³ ³ TD ³ ³ TD ³ 903 + ÀÄÄÂÄÄÙ ÀÄÄÂÄÄÙ ÀÄÄÄÂÄÙ ÀÄÄÂÄÄÙ 904 + ÉÍÍÍÍÍÊÍÍÍÍ» ÉÍÍÍÍÊÍÍÍÍÍ» ÉÍÍÍÍÊÍÍÍÍÍ» ÉÍÍÊÍÍÍÍÍÍÍ» 905 + ºColumn 1-1º ºColumn 1-2º ºColumn 2-1º ºColumn 2-2º (one TEXT node in each parent node) 906 + ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ 907 */ 908 909 ULONG xmlParse(PDOMNODE pParentNode, // in: node to append children to; must not be NULL 910 const char *pcszBuf, // in: buffer to search 911 PFNVALIDATE pfnValidateTag) 912 { 913 ULONG ulrc = 0; 914 915 PLINKLIST pllTags = BuildTagsList(pcszBuf); 916 917 // now create DOMNODE's according to that list... 
918 const char *pcszError = 0; 919 CreateNodesForBuf(pcszBuf, 920 NULL, // entire buffer 921 pllTags, 922 pParentNode, 923 pfnValidateTag, 924 &pcszError); 925 926 lstFree(pllTags); 70 + ÚÄÄÄÄÄÄÄÄÄÄÄÄ¿ 71 + ³ TABLE ³ (only ELEMENT node in root DOCUMENT node) 72 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 73 + ³ 74 + ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ 75 + ³ TBODY ³ (only ELEMENT node in root "TABLE" node) 76 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 77 + ÚÄÄÄÄÄÄÄÄÄÄÄÁÄÄÄÄÄÄÄÄÄÄÄ¿ 78 + ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ ÚÄÄÄÄÄÁÄÄÄÄÄÄ¿ 79 + ³ TR ³ ³ TR ³ 80 + ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ ÀÄÄÄÄÄÂÄÄÄÄÄÄÙ 81 + ÚÄÄÄÁÄÄÄÄÄÄ¿ ÚÄÄÄÁÄÄÄÄÄÄ¿ 82 + ÚÄÄÄÁÄ¿ ÚÄÄÁÄÄ¿ ÚÄÄÄÁÄ¿ ÚÄÄÁÄÄ¿ 83 + ³ TD ³ ³ TD ³ ³ TD ³ ³ TD ³ 84 + ÀÄÄÂÄÄÙ ÀÄÄÂÄÄÙ ÀÄÄÄÂÄÙ ÀÄÄÂÄÄÙ 85 + ÉÍÍÍÍÍÊÍÍÍÍ» ÉÍÍÍÍÊÍÍÍÍÍ» ÉÍÍÍÍÊÍÍÍÍÍ» ÉÍÍÊÍÍÍÍÍÍÍ» 86 + ºColumn 1-1º ºColumn 1-2º ºColumn 2-1º ºColumn 2-2º (one TEXT node in each parent node) 87 + ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ ÈÍÍÍÍÍÍÍÍÍÍŒ 88 * 89 * DOM really calls for object oriented programming so the various 90 * structs can inherit from each other. Since this implementation 91 * was supposed to be a C-only interface, we do not implement 92 * inheritance. Instead, each XML document is broken up into a tree 93 * of DOMNODE's only, each of which has a special type. 94 * 95 * It shouldn't be too difficult to write a C++ encapsulation 96 * of this which implements all the methods required by the DOM 97 * standard. 98 * 99 * The main entry point into this is xmlParse or 100 * xmlCreateDocumentFromString. See remarks there for details. 101 * 102 * Limitations: 103 * 104 * 1) This presently only parses ELEMENT, ATTRIBUTE, TEXT, 105 * and COMMENT nodes. 106 * 107 * 2) This doesn't use 16-bit characters, but 8-bit characters. 108 * 109 *@@header "helpers\xml.h" 110 *@@added V0.9.6 (2000-10-29) [umoeller] 111 */ 112 113 /* 114 * Copyright (C) 2000-2001 Ulrich Möller. 115 * This file is part of the "XWorkplace helpers" source package. 
116 * This is free software; you can redistribute it and/or modify 117 * it under the terms of the GNU General Public License as published 118 * by the Free Software Foundation, in version 2 as it comes in the 119 * "COPYING" file of the XWorkplace main distribution. 120 * This program is distributed in the hope that it will be useful, 121 * but WITHOUT ANY WARRANTY; without even the implied warranty of 122 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 123 * GNU General Public License for more details. 124 */ 125 126 #define OS2EMX_PLAIN_CHAR 127 // this is needed for "os2emx.h"; if this is defined, 128 // emx will define PSZ as _signed_ char, otherwise 129 // as unsigned char 130 131 #define INCL_DOSERRORS 132 #include <os2.h> 133 134 #include <stdlib.h> 135 #include <string.h> 136 137 #include "setup.h" // code generation and debugging options 138 139 #include "expat\expat.h" 140 141 #include "helpers\linklist.h" 142 #include "helpers\stringh.h" 143 #include "helpers\xstring.h" 144 #include "helpers\xml.h" 145 146 #pragma hdrstop 147 148 /* 149 *@@category: Helpers\C helpers\XML 150 * see xml.c. 151 */ 152 153 /* 154 *@@category: Helpers\C helpers\XML\Document Object Model (DOM) 155 * see xml.c. 156 */ 157 158 /* ****************************************************************** 159 * 160 * Node management 161 * 162 ********************************************************************/ 163 164 /* 165 *@@ xmlCreateNode: 166 * creates a new DOMNODE with the specified 167 * type and parent. Other than that, the 168 * node is zeroed. 
169 */ 170 171 PDOMNODE xmlCreateNode(PDOMNODE pParentNode, // in: parent node or NULL if root 172 ULONG ulNodeType) // in: DOMNODE_* type 173 { 174 PDOMNODE pNewNode = (PDOMNODE)malloc(sizeof(DOMNODE)); 175 if (pNewNode) 176 { 177 memset(pNewNode, 0, sizeof(DOMNODE)); 178 pNewNode->ulNodeType = ulNodeType; 179 pNewNode->pParentNode = pParentNode; 180 if (pParentNode) 181 { 182 // parent specified: 183 // append this new node to the parent's 184 // list of child nodes 185 lstAppendItem(&pParentNode->llChildNodes, 186 pNewNode); 187 } 188 189 lstInit(&pNewNode->llChildNodes, FALSE); 190 lstInit(&pNewNode->llAttributeNodes, FALSE); 191 } 192 193 return (pNewNode); 194 } 195 196 /* 197 *@@ xmlDeleteNode: 198 * deletes the specified node. 199 * 200 * If the node has child nodes, all of them are deleted 201 * as well. This recurses, if necessary. 202 * 203 * As a result, if the node is a document node, this 204 * deletes an entire document, including all of its 205 * child nodes. 206 * 207 * Returns: 208 * 209 * -- 0: NO_ERROR. 210 */ 211 212 ULONG xmlDeleteNode(PDOMNODE pNode) 213 { 214 ULONG ulrc = 0; 215 216 if (!pNode) 217 { 218 ulrc = ERROR_DOM_NOT_FOUND; 219 } 220 else 221 { 222 // recurse into child nodes 223 PLISTNODE pNodeThis = lstQueryFirstNode(&pNode->llChildNodes); 224 while (pNodeThis) 225 { 226 // recurse!! 227 xmlDeleteNode((PDOMNODE)(pNodeThis->pItemData)); 228 229 pNodeThis = pNodeThis->pNext; 230 } 231 232 // delete attribute nodes 233 pNodeThis = lstQueryFirstNode(&pNode->llAttributeNodes); 234 while (pNodeThis) 235 { 236 // recurse!! 
237 xmlDeleteNode((PDOMNODE)(pNodeThis->pItemData)); 238 239 pNodeThis = pNodeThis->pNext; 240 } 241 242 if (pNode->pParentNode) 243 { 244 // node has a parent: 245 // remove this node from the parent's list 246 // of child nodes before deleting this node 247 lstRemoveItem(&pNode->pParentNode->llChildNodes, 248 pNode); 249 pNode->pParentNode = NULL; 250 } 251 252 xstrClear(&pNode->strNodeName); 253 xstrClear(&pNode->strNodeValue); 254 255 lstClear(&pNode->llChildNodes); 256 lstClear(&pNode->llAttributeNodes); 257 258 free(pNode); 259 } 927 260 928 261 return (ulrc); 929 262 } 930 263 931 /* 932 *@@ xmlCreateDocumentFromString: 933 * creates a DOCUMENT DOMNODE and calls xmlParse 934 * to break down the specified buffer into that 935 * node. 936 */ 937 938 PDOMNODE xmlCreateDocumentFromString(const char *pcszXML, 939 PFNVALIDATE pfnValidateTag) 940 { 941 PDOMNODE pDocument = xmlCreateNode(NULL, // no parent 942 DOMNODE_DOCUMENT); 943 xmlParse(pDocument, 944 pcszXML, 945 pfnValidateTag); 946 947 return (pDocument); 948 } 949 950 264 /* ****************************************************************** 265 * 266 * Expat handlers 267 * 268 ********************************************************************/ 269 270 /* 271 *@@ StartElementHandler: 272 * expat handler called when a new element is 273 * found. 274 * 275 * We create a new record in the container and 276 * push it onto our stack so we can insert 277 * children into it. We first start with the 278 * attributes. 
279 */ 280 281 void EXPATENTRY StartElementHandler(void *data, // in: our PXMLFILE really 282 const char *pcszElement, 283 const char **papcszAttribs) 284 { 285 PXMLDOM pDom = (PXMLDOM)data; 286 287 ULONG i; 288 289 PDOMNODE pParent = NULL, 290 pNew = NULL; 291 292 PLISTNODE pParentNode = lstPop(&pDom->llStack); 293 294 if (pParentNode) 295 { 296 // non-root level: 297 pParent = (PDOMNODE)pParentNode->pItemData; 298 299 pNew = xmlCreateNode(pParent, 300 DOMNODE_ELEMENT); 301 302 if (pNew) 303 xstrcpy(&pNew->strNodeName, pcszElement, 0); 304 305 // push this on the stack so we can add child elements 306 lstPush(&pDom->llStack, pNew); 307 308 // now for the attribs 309 for (i = 0; 310 papcszAttribs[i]; 311 i += 2) 312 { 313 PDOMNODE pAttrNode = xmlCreateNode(pNew, // element 314 DOMNODE_ATTRIBUTE); 315 if (pAttrNode) 316 { 317 xstrcpy(&pAttrNode->strNodeName, papcszAttribs[i], 0); 318 xstrcpy(&pAttrNode->strNodeValue, papcszAttribs[i + 1], 0); 319 } 320 } 321 } 322 323 pDom->pLastWasTextNode = NULL; 324 } 325 326 /* 327 *@@ EndElementHandler: 328 * 329 */ 330 331 void EXPATENTRY EndElementHandler(void *data, // in: our PXMLFILE really 332 const XML_Char *name) 333 { 334 PXMLDOM pDom = (PXMLDOM)data; 335 PLISTNODE pNode = lstPop(&pDom->llStack); 336 if (pNode) 337 lstRemoveNode(&pDom->llStack, pNode); 338 339 pDom->pLastWasTextNode = NULL; 340 } 341 342 /* 343 *@@ CharacterDataHandler: 344 * 345 */ 346 347 void EXPATENTRY CharacterDataHandler(void *userData, // in: our PXMLFILE really 348 const XML_Char *s, 349 int len) 350 { 351 PXMLDOM pDom = (PXMLDOM)userData; 352 353 ULONG i; 354 355 if (len) 356 { 357 if (pDom->pLastWasTextNode) 358 { 359 // we had a text node, and no elements or other 360 // stuff in between: 361 xstrcat(&pDom->pLastWasTextNode->strNodeValue, 362 s, 363 len); 364 } 365 else 366 { 367 // we need a new text node: 368 PDOMNODE pNew, 369 pParent; 370 // non-root level: 371 PLISTNODE pParentNode = lstPop(&pDom->llStack); 372 pParent = 
(PDOMNODE)pParentNode->pItemData; 373 374 pNew = xmlCreateNode(pParent, 375 DOMNODE_TEXT); 376 if (pNew) 377 { 378 PSZ pszNodeValue = (PSZ)malloc(len + 1); 379 memcpy(pszNodeValue, s, len); 380 pszNodeValue[len] = '\0'; 381 xstrInitSet(&pNew->strNodeValue, pszNodeValue); 382 } 383 384 pDom->pLastWasTextNode = pNew; 385 } 386 } 387 } 388 389 /* ****************************************************************** 390 * 391 * DOM APIs 392 * 393 ********************************************************************/ 394 395 /* 396 *@@ xmlCreateDOM: 397 * 398 * Usage: 399 * 400 * 1) Create a DOM instance. 401 * 402 + PXMLDOM pDom = NULL; 403 + APIRET arc = xmlCreateDom(flags, &pDom); 404 + 405 * 2) Give chunks of data (or an entire buffer) 406 * to the DOM instance for parsing. 407 * 408 + arc = xmlParse(pDom, 409 + pBuf, 410 + TRUE); // if last, this will clean up the parser 411 * 412 * 3) Process the data in the DOM tree. When done, 413 * call xmlFreeDOM, which will free all memory. 414 * 415 *@@added V0.9.9 (2000-02-14) [umoeller] 416 */ 417 418 APIRET xmlCreateDOM(ULONG flParserFlags, 419 PXMLDOM *ppDom) 420 { 421 APIRET arc = NO_ERROR; 422 423 PXMLDOM pDom = (PXMLDOM)malloc(sizeof(XMLDOM)); 424 if (!pDom) 425 arc = ERROR_NOT_ENOUGH_MEMORY; 426 else 427 { 428 memset(pDom, 0, sizeof(XMLDOM)); 429 430 lstInit(&pDom->llStack, 431 FALSE); // no auto-free 432 433 // create the document node 434 pDom->pDocumentNode = xmlCreateNode(NULL, // no parent 435 DOMNODE_DOCUMENT); 436 437 if (!pDom->pDocumentNode) 438 arc = ERROR_NOT_ENOUGH_MEMORY; 439 else 440 { 441 // push the document on the stack so the handlers 442 // will append to that 443 lstPush(&pDom->llStack, 444 pDom->pDocumentNode); 445 446 pDom->pParser = XML_ParserCreate(NULL); 447 448 if (!pDom->pParser) 449 arc = ERROR_NOT_ENOUGH_MEMORY; 450 else 451 { 452 XML_SetElementHandler(pDom->pParser, 453 StartElementHandler, 454 EndElementHandler); 455 456 XML_SetCharacterDataHandler(pDom->pParser, 457 CharacterDataHandler); 
458 459 // pass the XMLDOM as user data to the handlers 460 XML_SetUserData(pDom->pParser, 461 pDom); 462 463 } 464 } 465 } 466 467 if (arc == NO_ERROR) 468 *ppDom = pDom; 469 else 470 xmlFreeDOM(pDom); 471 472 return (arc); 473 } 474 475 /* 476 *@@ xmlParse: 477 * parses another piece of XML data. 478 * 479 * If (fIsLast == TRUE), the internal expat parser 480 * will be freed, but not the DOM itself. 481 * 482 * You can pass an XML document to this function 483 * in one flush. Set fIsLast = TRUE on the first 484 * and only call then. 485 * 486 * This returns NO_ERROR if the chunk was successfully 487 * parsed. Otherwise ERROR_DOM_PARSING is returned, 488 * and you will find error information in the XMLDOM 489 * fields. 490 * 491 *@@added V0.9.9 (2000-02-14) [umoeller] 492 */ 493 494 APIRET xmlParse(PXMLDOM pDom, 495 const char *pcszBuf, 496 ULONG cb, 497 BOOL fIsLast) 498 { 499 APIRET arc = NO_ERROR; 500 501 if (!pDom) 502 arc = ERROR_INVALID_PARAMETER; 503 else 504 { 505 BOOL fSuccess = XML_Parse(pDom->pParser, 506 pcszBuf, 507 cb, 508 fIsLast); 509 510 if (!fSuccess) 511 { 512 // error: 513 pDom->Error = XML_GetErrorCode(pDom->pParser); 514 pDom->pcszErrorDescription = XML_ErrorString(pDom->Error); 515 pDom->ulErrorLine = XML_GetCurrentLineNumber(pDom->pParser); 516 pDom->ulErrorColumn = XML_GetCurrentColumnNumber(pDom->pParser); 517 518 if (pDom->pDocumentNode) 519 { 520 xmlDeleteNode(pDom->pDocumentNode); 521 pDom->pDocumentNode = NULL; 522 } 523 524 arc = ERROR_DOM_PARSING; 525 } 526 527 528 if (!fSuccess && fIsLast) 529 { 530 // last call or error: clean up 531 XML_ParserFree(pDom->pParser); 532 pDom->pParser = NULL; 533 534 // clean up the stack (but not the DOM itself) 535 lstClear(&pDom->llStack); 536 } 537 } 538 539 return (arc); 540 } 541 542 /* 543 *@@ xmlFreeDOM: 544 * cleans up all resources allocated by 545 * xmlCreateDOM and xmlParse, including 546 * the entire DOM tree. 
547 * 548 * If you wish to keep any data, make 549 * a copy of the respective pointers in pDom 550 * or subitems and set them to NULL before 551 * calling this function. 552 * 553 *@@added V0.9.9 (2000-02-14) [umoeller] 554 */ 555 556 APIRET xmlFreeDOM(PXMLDOM pDom) 557 { 558 APIRET arc = NO_ERROR; 559 if (pDom) 560 { 561 // if the parser is still alive for some reason, close it. 562 if (pDom->pParser) 563 { 564 XML_ParserFree(pDom->pParser); 565 pDom->pParser = NULL; 566 } 567 568 free(pDom); 569 } 570 571 return (arc); 572 }
Note:
See TracChangeset
for help on using the changeset viewer.