Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

StreamTokenizer.java

Visit:

Last change on this file was 1392, checked in by bird, 21 years ago
This commit was generated by cvs2svn to compensate for changes in r1391, which included commits to RCS files with non-trunk default branches.
Property cvs2svn:cvs-rev set to `1.1.1.2` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 20.5 KB

Line
1	/* StreamTokenizer.java -- parses streams of characters into tokens
2	Copyright (C) 1998, 1999, 2000, 2001, 2002 Free Software Foundation
3
4	This file is part of GNU Classpath.
5
6	GNU Classpath is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2, or (at your option)
9	any later version.
10
11	GNU Classpath is distributed in the hope that it will be useful, but
12	WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with GNU Classpath; see the file COPYING. If not, write to the
18	Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19	02111-1307 USA.
20
21	Linking this library statically or dynamically with other modules is
22	making a combined work based on this library. Thus, the terms and
23	conditions of the GNU General Public License cover the whole
24	combination.
25
26	As a special exception, the copyright holders of this library give you
27	permission to link this library with independent modules to produce an
28	executable, regardless of the license terms of these independent
29	modules, and to copy and distribute the resulting executable under
30	terms of your choice, provided that you also meet, for each linked
31	independent module, the terms and conditions of the license of that
32	module. An independent module is a module which is not derived from
33	or based on this library. If you modify this library, you may extend
34	this exception to your version of the library, but you are not
35	obligated to do so. If you do not wish to do so, delete this
36	exception statement from your version. */
37
38	package java.io;
39
/**
 * This class parses streams of characters into tokens. A number of flags
 * and per-character attributes control the parsing; they are described
 * under the various method headings.
 *
 * @author Warren Levy &lt;warrenl@cygnus.com&gt;
 * @date October 25, 1998.
 */
/* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
 * "The Java Language Specification", ISBN 0-201-63451-1
 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
 * Status: Believed complete and correct.
 */

public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /**
   * Contains the type of the token read resulting from a call to
   * <code>nextToken</code>. The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the
   *     value of that character.
   * <li>For a quoted string, this is the value of the quote character
   * <li>For a word, this is TT_WORD
   * <li>For a number, this is TT_NUMBER
   * <li>For the end of the line, this is TT_EOL
   * <li>For the end of the stream, this is TT_EOF
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables of each char value from 0x00 to 0xFF. */
  private final boolean[] whitespace = new boolean[256];
  private final boolean[] alphabetic = new boolean[256];
  private final boolean[] numeric = new boolean[256];
  private final boolean[] quote = new boolean[256];
  private final boolean[] comment = new boolean[256];

  /*
   * The Reader associated with this class. It has the default one-char
   * pushback buffer, so at most one char may be unread between reads.
   */
  private final PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader. */
  private int lineNumber = 1;

  /**
   * This method reads bytes from an <code>InputStream</code> and tokenizes
   * them. For details on how this method operates by default, see
   * <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * This method initializes a new <code>StreamTokenizer</code> to read
   * characters from a <code>Reader</code> and parse them. Only char values
   * in the range 0x0000 to 0x00FF can carry configurable attributes; any
   * char above that range is always treated as alphabetic.
   * <p>
   * This constructor sets up the parsing table to parse the stream in the
   * following manner:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are initialized as alphabetic
   * <li>The values 0x00 through 0x20 are initialized as whitespace
   * <li>The values '\'' and '"' are initialized as quote characters
   * <li>'/' is a comment character
   * <li>Numbers will be parsed
   * <li>EOL is not treated as significant
   * <li>C and C++ (//) comments are not recognized
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method makes the specified character a comment character. Any
   * other attribute previously set on the character is removed, so the
   * setting takes effect even for characters that were alphabetic or
   * numeric before. Values outside 0 - 255 are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as
   * an int
   */
  public void commentChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        // nextToken tests numeric/alphabetic before comment, so a stale
        // attribute would otherwise silently win.
        resetChar(ch);
        comment[ch] = true;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence terminates and is a token. This defaults to
   * <code>false</code>.
   *
   * @param flag <code>true</code> if EOL is significant, <code>false</code>
   * otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number. Note that if the
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not alphabetic
   * tokens that are returned should be converted to lower case.
   *
   * @param flag <code>true</code> to convert to lower case,
   * <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* The attribute tests below are safe for TT_EOF (-1) and chars > 255. */

  private boolean isWhitespace(int ch)
  {
    return (ch >= 0 && ch <= 255 && whitespace[ch]);
  }

  /* Every char above 0xFF counts as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
  }

  private boolean isNumeric(int ch)
  {
    return (ch >= 0 && ch <= 255 && numeric[ch]);
  }

  private boolean isQuote(int ch)
  {
    return (ch >= 0 && ch <= 255 && quote[ch]);
  }

  private boolean isComment(int ch)
  {
    return (ch >= 0 && ch <= 255 && comment[ch]);
  }

  /**
   * This method reads the next token from the stream. It sets the
   * <code>ttype</code> variable to the appropriate token type and
   * returns it. It also can set <code>sval</code> or <code>nval</code>
   * as described below. The parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters. If EOL significance is turned on,
   * an end of line sequence met while skipping is returned as TT_EOL.
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence are discarded and another
   * token is parsed.
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value. A leading '-' character indicates a number only if followed by
   * another non-'-' numeric. The value of the numeric token is terminated
   * by either the first non-numeric encountered, a further '-', or the
   * second occurrence of '.' (a leading '.' counts as the first one).
   * The token type returned is TT_NUMBER and <code>nval</code>
   * is set to the value parsed, or to 0.0 if the characters collected do
   * not form a number (for example a lone '.').
   * <li>If an alphabetic character is parsed, all subsequent characters
   * are read until the first character that is neither alphabetic nor
   * numeric is encountered. The token type returned is TT_WORD and the
   * value parsed is stored in <code>sval</code>. If lower case mode is
   * set, the token stored in <code>sval</code> is converted to lower case.
   * <li>If a comment character is parsed, then all remaining characters on
   * the current line are skipped and another token is parsed. Any EOL or
   * EOF's encountered are not discarded, but rather terminate the comment.
   * <li>If a quote character is parsed, then all characters up to the
   * second occurrence of the same quote character (or the end of the line
   * or stream) are parsed into a <code>String</code>. This
   * <code>String</code> is stored as <code>sval</code>, but is not
   * converted to lower case, even if lower case mode is enabled. The
   * token type returned is the value of the quote character encountered.
   * The escape sequences \a (bell), \b (backspace), \t (HTAB),
   * \n (linefeed), \v (VTAB), \f (form feed), \r (carriage return),
   * \" (double quote), \' (single quote), \\ (backslash) and
   * \XXX (octal escape) are converted to the appropriate char values.
   * Invalid escape sequences are left in untranslated.
   * Unicode characters like ('\ u0000') are not recognized.
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself. The char encountered
   * is returned as the token type.
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        if (ttype != TT_NONE)
          {
            return ttype;
          }
      }

    // Each pass either returns a token or skips one comment. A loop is
    // used rather than recursion so that arbitrarily long runs of
    // comments cannot exhaust the stack.
    while (true)
      {
        sval = null;
        int ch;

        // Skip whitespace. Deal with EOL along the way.
        while (isWhitespace(ch = in.read()))
          {
            if (ch == '\n' || ch == '\r')
              {
                lineNumber++;
                if (ch == '\r')
                  {
                    skipLineFeedAfterCarriageReturn();
                  }
                if (eolSignificant)
                  {
                    return (ttype = TT_EOL);
                  }
              }
          }

        if (ch == '/')
          {
            int next = in.read();
            if (next == '/' && slashSlash)
              {
                skipRestOfLine();
                continue;
              }
            if (next == '*' && slashStar)
              {
                skipBlockComment();
                continue;
              }
            // Not a recognized comment opener: put the look-ahead back
            // and treat the '/' according to its own attributes.
            if (next != TT_EOF)
              {
                in.unread(next);
              }
          }

        if (ch == TT_EOF)
          {
            return (ttype = TT_EOF);
          }
        if (isNumeric(ch))
          {
            return readNumber(ch);
          }
        if (isAlphabetic(ch))
          {
            return readWord(ch);
          }
        if (isComment(ch))
          {
            skipRestOfLine();
            continue;
          }
        if (isQuote(ch))
          {
            return readQuoted(ch);
          }

        // An ordinary character is its own token.
        return (ttype = ch);
      }
  }

  /* Consumes the '\n' of a "\r\n" pair; anything else is put back. */
  private void skipLineFeedAfterCarriageReturn() throws IOException
  {
    int next = in.read();
    if (next != '\n' && next != TT_EOF)
      {
        in.unread(next);
      }
  }

  /* Discards chars up to, but not including, the next EOL or EOF. */
  private void skipRestOfLine() throws IOException
  {
    int ch;
    do
      {
        ch = in.read();
      }
    while (ch != '\n' && ch != '\r' && ch != TT_EOF);

    // Leave the line terminator so the caller counts it.
    if (ch != TT_EOF)
      {
        in.unread(ch);
      }
  }

  /*
   * Discards the body of a C style comment whose opener has already been
   * consumed, through the terminator or EOF, counting lines on the way.
   */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              {
                return;
              }
            // Re-examine the char: it may itself be a '*' or a newline.
            if (next != TT_EOF)
              {
                in.unread(next);
              }
          }
        else if (ch == '\n' || ch == '\r')
          {
            lineNumber++;
            if (ch == '\r')
              {
                skipLineFeedAfterCarriageReturn();
              }
          }
        else if (ch == TT_EOF)
          {
            return;
          }
      }
  }

  /*
   * Parses a number starting with the numeric char `ch'. Returns
   * TT_NUMBER, or '-' when a minus sign is not followed by a number.
   */
  private int readNumber(int ch) throws IOException
  {
    boolean isNegative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              {
                in.unread(ch);
              }
            return (ttype = '-');
          }
        isNegative = true;
      }

    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);

    // A leading '.' is already the decimal point; without counting it,
    // ".5.25" would be collected whole and fail to parse.
    int decCount = (ch == '.') ? 1 : 0;
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.' && decCount++ > 0)
          {
            break;
          }
        tokbuf.append((char) ch);
      }

    if (ch != TT_EOF)
      {
        in.unread(ch);
      }

    ttype = TT_NUMBER;
    try
      {
        nval = Double.valueOf(tokbuf.toString()).doubleValue();
      }
    catch (NumberFormatException ignored)
      {
        // Deliberate: a lone '.' is still reported as a number, value 0.
        nval = 0.0;
      }
    if (isNegative)
      {
        nval = -nval;
      }
    return ttype;
  }

  /* Parses a word starting with the alphabetic char `ch'. */
  private int readWord(int ch) throws IOException
  {
    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);
    while (isAlphabetic(ch = in.read()) || isNumeric(ch))
      {
        tokbuf.append((char) ch);
      }
    if (ch != TT_EOF)
      {
        in.unread(ch);
      }

    ttype = TT_WORD;
    sval = tokbuf.toString();
    if (lowerCase)
      {
        sval = sval.toLowerCase();
      }
    return ttype;
  }

  /*
   * Parses a quoted string whose opening quote `quoteCh' has already been
   * consumed. The string ends at the matching quote, an EOL or EOF.
   */
  private int readQuoted(int quoteCh) throws IOException
  {
    ttype = quoteCh;
    StringBuffer tokbuf = new StringBuffer();
    int ch;
    while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          {
            ch = readEscape();
          }
        tokbuf.append((char) ch);
      }

    // Throw away matching quote char, but keep a terminating EOL.
    if (ch != ttype && ch != TT_EOF)
      {
        in.unread(ch);
      }

    sval = tokbuf.toString();
    return ttype;
  }

  /*
   * Translates the escape sequence following an already consumed
   * backslash and returns the resulting char value. For an invalid
   * sequence the backslash itself is returned and the offending char is
   * left in the stream, so the sequence ends up untranslated in the token.
   */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      case '\n':
      case '\r':
      case '\"':
      case '\'':
      case '\\':
        return ch;
      default:
        break;
      }

    if (ch < '0' || ch > '7')
      {
        // Invalid escape (or EOF): keep the backslash and let the caller
        // read the following char again as a normal char.
        if (ch != TT_EOF)
          {
            in.unread(ch);
          }
        return '\\';
      }

    // Octal escape: up to three digits, the third only when the first is
    // in '0' - '3' so that the value cannot exceed 0377.
    int first = ch;
    int value = ch - '0';
    int next = in.read();
    if (next >= '0' && next <= '7')
      {
        value = value * 8 + (next - '0');
        next = in.read();
        if (next >= '0' && next <= '7' && first <= '3')
          {
            value = value * 8 + (next - '0');
            next = in.read();
          }
      }
    if (next != TT_EOF)
      {
        in.unread(next);
      }
    return value;
  }

  /* Clears every attribute of `ch', which must be in the range 0 - 255. */
  private void resetChar(int ch)
  {
    whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
      false;
  }

  /**
   * This method makes the specified character an ordinary character. This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character. This character will
   * parse as its own token. Values outside 0 - 255 are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
      }
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary. This means that none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range. This makes each character in this
   * range parse as its own token. The range is clipped to 0 - 255.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    if (low < 0)
      {
        low = 0;
      }
    if (hi > 255)
      {
        hi = 255;
      }
    for (int i = low; i <= hi; i++)
      {
        resetChar(i);
      }
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'. Other attributes of these characters are
   * left as they are.
   */
  public void parseNumbers()
  {
    for (int i = 0; i <= 9; i++)
      {
        numeric['0' + i] = true;
      }

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method makes the specified character a quote character. Any
   * other attribute previously set on the character is removed, so the
   * setting takes effect even for characters that were alphabetic or
   * numeric before. Values outside 0 - 255 are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an
   * int.
   */
  public void quoteChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        // nextToken tests numeric/alphabetic/comment before quote, so a
        // stale attribute would otherwise silently win.
        resetChar(ch);
        quote[ch] = true;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters. It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language
   * style comments ("//" comments through EOL) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser. This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C++" style
   * comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language
   * style comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser. This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C" style
   * comments, <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number and
   * 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code>
   * <li>For all other cases, 'x' is <code>ttype</code> as a char enclosed
   * in single quotes
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      {
        tempstr = "EOF";
      }
    else if (ttype == TT_EOL)
      {
        tempstr = "EOL";
      }
    else if (ttype == TT_WORD)
      {
        tempstr = sval;
      }
    else if (ttype == TT_NUMBER)
      {
        tempstr = "n=" + nval;
      }
    else if (ttype == TT_NONE)
      {
        tempstr = "NOTHING";
      }
    else if (isQuote(ttype))
      {
        // A quoted string token reports its contents, not the delimiter.
        tempstr = sval;
      }
    else // must be an ordinary char.
      {
        tempstr = "\'" + (char) ttype + "\'";
      }

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included. Any other attribute of
   * these characters is removed. The range is clipped to 0 - 255.
   *
   * @param low The low end of the range of values to set the whitespace
   * attribute for
   * @param hi The high end of the range of values to set the whitespace
   * attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    if (low < 0)
      {
        low = 0;
      }
    if (hi > 255)
      {
        hi = 255;
      }
    for (int i = low; i <= hi; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included. Other attributes of
   * these characters are left as they are. The range is clipped to
   * 0 - 255.
   *
   * @param low The low end of the range of values to set the alphabetic
   * attribute for
   * @param hi The high end of the range of values to set the alphabetic
   * attribute for
   */
  public void wordChars(int low, int hi)
  {
    if (low < 0)
      {
        low = 0;
      }
    if (hi > 255)
      {
        hi = 255;
      }
    for (int i = low; i <= hi; i++)
      {
        alphabetic[i] = true;
      }
  }
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gcc/libjava/java/io/StreamTokenizer.java

Download in other formats: