HTML Parser
/*******************************************************************************
 * Copyright (c) 2004 Actuate Corporation.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *  Actuate Corporation  - initial API and implementation
 *******************************************************************************/

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;

/**
 * A small, forgiving pull-style tokenizer for HTML files.
 * <p>
 * Call {@link #open(String)}, then call {@link #getToken()} repeatedly until
 * it returns {@link #EOF}. After each call the text of the current token is
 * available from {@link #getTokenText()}; for {@link #ELEMENT} tokens the
 * accessors {@link #getElement()}, {@link #getElementType()},
 * {@link #getAttribs()} and {@link #getAttrib(String)} describe the tag.
 * <p>
 * The file is decoded by {@link FileReader}, i.e. with the platform default
 * character set. Read errors are treated as end of input.
 */
public class HTMLParser
{

	FileReader reader;
	LineNumberReader in;

	/** Text of the most recently parsed token; null until a token is read. */
	String token;

	/** Attributes (AttribPair instances) of the most recently parsed element. */
	ArrayList attribs = new ArrayList( );

	/** One-character push-back slot; -1 when empty. */
	int pushC = -1;

	private boolean ignoreWhitespace = true;

	// Token kinds returned by getToken( ).

	public static final int EOF = -1;
	public static final int TEXT = 1;
	/** Declared for callers; this parser reports DOCTYPE declarations as SPECIAL_ELEMENT. */
	public static final int DOCTYPE = 2;
	public static final int ELEMENT = 3;
	public static final int COMMENT = 4;
	public static final int SPECIAL_ELEMENT = 5;

	// Element kinds returned by getElementType( ).

	public static final int START_ELEMENT = 0;
	public static final int END_ELEMENT = 1;
	public static final int SINGLE_ELEMENT = 2;

	public HTMLParser( )
	{
	}

	/**
	 * Opens a file for parsing. A file opened by an earlier call is closed
	 * first and any pushed-back character is discarded, so input from the
	 * previous file cannot leak into the new one.
	 * 
	 * @param fileName
	 *            path of the HTML file to read
	 * @throws FileNotFoundException
	 *             if the file cannot be opened for reading
	 */
	public void open( String fileName ) throws FileNotFoundException
	{
		close( );
		pushC = -1;
		reader = new FileReader( fileName );
		in = new LineNumberReader( reader );
	}

	/**
	 * Closes the underlying readers. Does nothing if no file was opened;
	 * errors raised while closing are ignored.
	 */
	public void close( )
	{
		try
		{
			if ( in != null )
				in.close( );
			if ( reader != null )
				reader.close( );
		}
		catch ( IOException e1 )
		{
			// Ignore: nothing useful can be done if close fails.
		}
	}

	/**
	 * @return the raw text of the current token. For elements this is the tag
	 *         name with a leading "/" for an end tag or a trailing "/" for a
	 *         self-closing tag.
	 */
	public String getTokenText( )
	{
		return token;
	}

	/**
	 * Classifies the current element token.
	 * 
	 * @return END_ELEMENT if the token starts with "/", SINGLE_ELEMENT if it
	 *         ends with "/", otherwise START_ELEMENT
	 */
	public int getElementType( )
	{
		if ( token.startsWith( "/" ) ) //$NON-NLS-1$
			return END_ELEMENT;
		if ( token.endsWith( "/" ) ) //$NON-NLS-1$
			return SINGLE_ELEMENT;
		return START_ELEMENT;
	}

	/**
	 * @return the tag name of the current element token with any leading or
	 *         trailing "/" marker removed
	 */
	public String getElement( )
	{
		if ( token.startsWith( "/" ) ) //$NON-NLS-1$
			return token.substring( 1 );
		if ( token.endsWith( "/" ) ) //$NON-NLS-1$
			return token.substring( 0, token.length( ) - 1 );
		return token;
	}

	/**
	 * @return the live list of AttribPair objects for the current element; it
	 *         is cleared when the next element is parsed
	 */
	public ArrayList getAttribs( )
	{
		return attribs;
	}

	/**
	 * Looks up an attribute of the current element, ignoring case.
	 * 
	 * @param name
	 *            attribute name
	 * @return the attribute value, or null if the attribute is absent or was
	 *         written without a value
	 */
	public String getAttrib( String name )
	{
		for ( int i = 0; i < attribs.size( ); i++ )
		{
			AttribPair a = (AttribPair) attribs.get( i );
			if ( a.attrib.equalsIgnoreCase( name ) )
				return a.value;
		}
		return null;
	}

	/**
	 * Reads the next character, taking the pushed-back one first if present.
	 * 
	 * @return the character, or EOF at end of input or on a read error
	 */
	private int getC( )
	{
		if ( pushC != -1 )
		{
			int c = pushC;
			pushC = -1;
			return c;
		}
		try
		{
			return in.read( );
		}
		catch ( IOException e )
		{
			// Best effort: a read failure ends the token stream.
			return EOF;
		}
	}

	/**
	 * Pushes one character back so that the next getC( ) returns it. Only a
	 * single character can be held at a time.
	 */
	private void pushC( int c )
	{
		pushC = c;
	}

	/**
	 * Reads the next token from the input.
	 * 
	 * @return EOF, TEXT, ELEMENT, COMMENT or SPECIAL_ELEMENT. Text consisting
	 *         only of whitespace is skipped unless whitespace handling was
	 *         enabled with ignoreWhitespace( false ).
	 */
	public int getToken( )
	{
		for ( ;; )
		{
			int c = getC( );
			if ( c == EOF )
				return EOF;
			if ( c == '<' )
				return getElement( c );

			parseText( c );
			if ( !ignoreWhitespace || token.trim( ).length( ) > 0 )
				return TEXT;
		}
	}

	/**
	 * Collects character data up to, but not including, the next '<' or the
	 * end of input and stores it in the token.
	 * 
	 * @param c
	 *            first character of the text
	 * @return TEXT
	 */
	private int parseText( int c )
	{
		StringBuffer text = new StringBuffer( );
		for ( ;; )
		{
			if ( c == EOF )
				break;
			if ( c == '<' )
			{
				// Leave the tag opener for the next getToken( ) call.
				pushC( c );
				break;
			}

			// Convert MS-Word-style quotes (U+201C / U+201D) to the HTML
			// entity for a plain double quote.
			if ( c == 8220 || c == 8221 )
				text.append( "&quot;" ); //$NON-NLS-1$
			else
				text.append( (char) c );
			c = getC( );
		}
		token = text.toString( );
		return TEXT;
	}

	/**
	 * Skips whitespace starting with the given character.
	 * 
	 * @return the first non-whitespace character, or EOF
	 */
	private int skipSpace( int c )
	{
		while ( c != EOF && Character.isWhitespace( (char) c ) )
		{
			c = getC( );
		}
		return c;
	}

	/**
	 * Parses a tag whose opening '<' has just been read. Stores the tag name
	 * (with its "/" marker, if any) in the token and the attributes in the
	 * attribute list.
	 * 
	 * @param c
	 *            the '<' character; it is discarded
	 * @return ELEMENT, the result of parsing a "<!" construct, or EOF if the
	 *         input ends before a tag name starts
	 */
	private int getElement( int c )
	{
		c = getC( );

		// Broken element
		if ( c == EOF )
			return EOF;
		if ( c == '!' )
			return getSpecialElement( );

		attribs.clear( );
		c = skipSpace( c );
		if ( c == EOF )
			return EOF;

		StringBuffer tag = new StringBuffer( );
		if ( c == '/' )
		{
			// End tag: keep the slash in the token, then read the name and
			// discard everything up to the closing '>'.
			tag.append( (char) c );
			c = skipSpace( getC( ) );
			while ( c != EOF && c != '>'
					&& !Character.isWhitespace( (char) c ) )
			{
				tag.append( (char) c );
				c = getC( );
			}
			token = tag.toString( );
			while ( c != '>' && c != EOF )
			{
				c = getC( );
			}
			return ELEMENT;
		}

		while ( c != EOF && c != '>' && c != '/'
				&& !Character.isWhitespace( (char) c ) )
		{
			tag.append( (char) c );
			c = getC( );
		}
		if ( c == EOF )
		{
			token = tag.toString( );
			return ELEMENT;
		}

		// Attributes, up to the end of the tag.
		for ( ;; )
		{
			c = skipSpace( c );
			if ( c == EOF || c == '>' || c == '/' )
				break;
			c = getAttrib( c );
		}

		if ( c == '/' )
		{
			// Self-closing tag: mark the token and skip to the closing '>'.
			tag.append( (char) c );
			for ( ;; )
			{
				c = getC( );
				if ( c == EOF || c == '>' )
					break;
			}
		}
		token = tag.toString( );
		return ELEMENT;
	}

	/**
	 * Parses one attribute and appends it to the attribute list.
	 * <p>
	 * The name ends at whitespace, '=', '>' or '/'. A quoted value keeps
	 * backslash escapes verbatim, so an escaped quote does not end the value.
	 * An unquoted value ends at whitespace, at '>', or at a '/' that is
	 * directly followed by '>'; any other '/' is part of the value, so URLs
	 * survive intact.
	 * 
	 * @param c
	 *            first character of the attribute
	 * @return the first character after the attribute that has not been
	 *         consumed, or EOF
	 */
	private int getAttrib( int c )
	{
		AttribPair a = new AttribPair( );
		StringBuffer s = new StringBuffer( );
		while ( c != EOF && c != '=' && c != '>' && c != '/'
				&& !Character.isWhitespace( (char) c ) )
		{
			s.append( (char) c );
			c = getC( );
		}
		a.attrib = s.toString( );

		c = skipSpace( c );
		if ( c != '=' )
		{
			// Attribute without a value; its value stays null.
			attribs.add( a );
			return c;
		}

		s = new StringBuffer( );
		c = skipSpace( getC( ) );
		if ( c == '\'' || c == '"' )
		{
			int quote = c;
			for ( ;; )
			{
				c = getC( );
				if ( c == EOF )
					break;
				if ( c == quote )
				{
					// Step past the closing quote.
					c = getC( );
					break;
				}
				if ( c == '\\' )
				{
					c = getC( );
					if ( c == EOF )
						break;
					s.append( '\\' );
					s.append( (char) c );
				}
				else
				{
					s.append( (char) c );
				}
			}
		}
		else
		{
			// c already holds the first character of the unquoted value. The
			// terminating character is left in c for the caller.
			while ( c != EOF && c != '>'
					&& !Character.isWhitespace( (char) c ) )
			{
				if ( c == '/' )
				{
					int next = getC( );
					if ( next == '>' )
					{
						// "/>" closes the tag: hand '/' back to the caller
						// and keep '>' for it to read next.
						pushC( next );
						break;
					}
					s.append( '/' );
					c = next;
					continue;
				}
				s.append( (char) c );
				c = getC( );
			}
		}
		a.value = s.toString( );
		attribs.add( a );
		return c;
	}

	/**
	 * Name/value pair for one attribute. The value is null when the attribute
	 * was written without "=".
	 */
	class AttribPair
	{

		String attrib;
		String value;
	}

	/**
	 * Parses a construct that starts with "<!" and stores its full text,
	 * terminated with '>', in the token. A comment runs to the first '>' that
	 * is preceded by "--", so a '>' inside the comment body does not end it;
	 * any other construct ends at the first '>'.
	 * 
	 * @return COMMENT if the text starts with "<!--", otherwise
	 *         SPECIAL_ELEMENT
	 */
	private int getSpecialElement( )
	{
		StringBuffer text = new StringBuffer( );
		text.append( "<!" ); //$NON-NLS-1$
		for ( ;; )
		{
			int c = getC( );
			if ( c == EOF )
				break;
			if ( c == '>' && !insideComment( text ) )
				break;
			text.append( (char) c );
		}
		text.append( '>' );
		token = text.toString( );
		if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
			return COMMENT;
		return SPECIAL_ELEMENT;
	}

	/**
	 * Tells whether the text read so far is a comment that has not yet reached
	 * its closing "--".
	 */
	private static boolean insideComment( StringBuffer text )
	{
		int len = text.length( );
		if ( len < 4 || text.charAt( 2 ) != '-' || text.charAt( 3 ) != '-' )
			return false;
		return !( text.charAt( len - 1 ) == '-'
				&& text.charAt( len - 2 ) == '-' );
	}

	/** Tags treated as inline formatting by isFormatTag( ). */
	static String formatTags[] = {
			"i", "b", //$NON-NLS-1$//$NON-NLS-2$
			"strong", "em", //$NON-NLS-1$//$NON-NLS-2$
			"code", "span", //$NON-NLS-1$ //$NON-NLS-2$
			"a" //$NON-NLS-1$
	};

	/**
	 * @return true if the current element is one of the formatting tags
	 */
	public boolean isFormatTag( )
	{
		return isFormatTag( getElement( ) );
	}

	/**
	 * @param tag
	 *            tag name, compared without regard to case
	 * @return true if the tag is one of the formatting tags
	 */
	public boolean isFormatTag( String tag )
	{
		for ( int i = 0; i < formatTags.length; i++ )
		{
			if ( formatTags[i].equalsIgnoreCase( tag ) )
				return true;
		}
		return false;
	}

	/**
	 * Rebuilds the markup of the current element from its name and
	 * attributes. Every attribute is written as name="value"; an attribute
	 * without a value gets an empty string.
	 * 
	 * @return the element text as a String
	 */
	public Object getFullElement( )
	{
		StringBuffer text = new StringBuffer( );
		text.append( '<' );
		int elementType = getElementType( );
		if ( elementType == END_ELEMENT )
			text.append( '/' );
		text.append( getElement( ) );
		for ( int i = 0; i < attribs.size( ); i++ )
		{
			text.append( ' ' );
			AttribPair a = (AttribPair) attribs.get( i );
			text.append( a.attrib );
			text.append( "=\"" ); //$NON-NLS-1$
			if ( a.value != null )
				text.append( a.value );
			text.append( "\"" ); //$NON-NLS-1$
		}
		if ( elementType == SINGLE_ELEMENT )
			text.append( '/' );
		text.append( '>' );
		return text.toString( );
	}

	/**
	 * @return the current line number reported by the underlying
	 *         LineNumberReader
	 */
	public int getLineNo( )
	{
		return in.getLineNumber( );
	}

	/**
	 * @param b
	 *            true to skip text tokens that contain only whitespace (the
	 *            default), false to report them
	 */
	public void ignoreWhitespace( boolean b )
	{
		ignoreWhitespace = b;
	}
}