Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

tokenizer.c

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 47.4 KB

Rev	Line
[2]	1
	2	/* Tokenizer implementation */
	3
	4	#include "Python.h"
	5	#include "pgenheaders.h"
	6
	7	#include <ctype.h>
	8	#include <assert.h>
	9
	10	#include "tokenizer.h"
	11	#include "errcode.h"
	12
	13	#ifndef PGEN
	14	#include "unicodeobject.h"
	15	#include "stringobject.h"
	16	#include "fileobject.h"
	17	#include "codecs.h"
	18	#include "abstract.h"
	19	#include "pydebug.h"
	20	#endif /* PGEN */
	21
	22	extern char PyOS_Readline(FILE , FILE , char );
	23	/* Return malloc'ed string including trailing \n;
	24	empty malloc'ed string for EOF;
	25	NULL if interrupted */
	26
	27	/* Don't ever change this -- it would break the portability of Python code */
	28	#define TABSIZE 8
	29
	30	/* Forward */
	31	static struct tok_state *tok_new(void);
	32	static int tok_nextc(struct tok_state *tok);
	33	static void tok_backup(struct tok_state *tok, int c);
	34
	35	/* Token names */
	36
	37	char *_PyParser_TokenNames[] = {
[391]	38	"ENDMARKER",
	39	"NAME",
	40	"NUMBER",
	41	"STRING",
	42	"NEWLINE",
	43	"INDENT",
	44	"DEDENT",
	45	"LPAR",
	46	"RPAR",
	47	"LSQB",
	48	"RSQB",
	49	"COLON",
	50	"COMMA",
	51	"SEMI",
	52	"PLUS",
	53	"MINUS",
	54	"STAR",
	55	"SLASH",
	56	"VBAR",
	57	"AMPER",
	58	"LESS",
	59	"GREATER",
	60	"EQUAL",
	61	"DOT",
	62	"PERCENT",
	63	"BACKQUOTE",
	64	"LBRACE",
	65	"RBRACE",
	66	"EQEQUAL",
	67	"NOTEQUAL",
	68	"LESSEQUAL",
	69	"GREATEREQUAL",
	70	"TILDE",
	71	"CIRCUMFLEX",
	72	"LEFTSHIFT",
	73	"RIGHTSHIFT",
	74	"DOUBLESTAR",
	75	"PLUSEQUAL",
	76	"MINEQUAL",
	77	"STAREQUAL",
	78	"SLASHEQUAL",
	79	"PERCENTEQUAL",
	80	"AMPEREQUAL",
	81	"VBAREQUAL",
	82	"CIRCUMFLEXEQUAL",
	83	"LEFTSHIFTEQUAL",
	84	"RIGHTSHIFTEQUAL",
	85	"DOUBLESTAREQUAL",
	86	"DOUBLESLASH",
	87	"DOUBLESLASHEQUAL",
	88	"AT",
	89	/* This table must match the #defines in token.h! */
	90	"OP",
	91	"<ERRORTOKEN>",
	92	"<N_TOKENS>"
[2]	93	};
	94
	95	/* Create and initialize a new tok_state structure */
	96
	97	static struct tok_state *
	98	tok_new(void)
	99	{
[391]	100	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
	101	sizeof(struct tok_state));
	102	if (tok == NULL)
	103	return NULL;
	104	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
	105	tok->done = E_OK;
	106	tok->fp = NULL;
	107	tok->input = NULL;
	108	tok->tabsize = TABSIZE;
	109	tok->indent = 0;
	110	tok->indstack[0] = 0;
	111	tok->atbol = 1;
	112	tok->pendin = 0;
	113	tok->prompt = tok->nextprompt = NULL;
	114	tok->lineno = 0;
	115	tok->level = 0;
	116	tok->filename = NULL;
	117	tok->altwarning = 0;
	118	tok->alterror = 0;
	119	tok->alttabsize = 1;
	120	tok->altindstack[0] = 0;
	121	tok->decoding_state = 0;
	122	tok->decoding_erred = 0;
	123	tok->read_coding_spec = 0;
	124	tok->encoding = NULL;
	125	tok->cont_line = 0;
[2]	126	#ifndef PGEN
[391]	127	tok->decoding_readline = NULL;
	128	tok->decoding_buffer = NULL;
[2]	129	#endif
[391]	130	return tok;
[2]	131	}
	132
[391]	133	static char *
	134	new_string(const char *s, Py_ssize_t len)
	135	{
	136	char* result = (char *)PyMem_MALLOC(len + 1);
	137	if (result != NULL) {
	138	memcpy(result, s, len);
	139	result[len] = '\0';
	140	}
	141	return result;
	142	}
	143
[2]	144	#ifdef PGEN
	145
	146	static char *
	147	decoding_fgets(char s, int size, struct tok_state tok)
	148	{
[391]	149	return fgets(s, size, tok->fp);
[2]	150	}
	151
	152	static int
	153	decoding_feof(struct tok_state *tok)
	154	{
[391]	155	return feof(tok->fp);
[2]	156	}
	157
/* PGEN build: no decoding is performed; return a private copy of STR
   (NULL if the allocation fails).  EXEC_INPUT and TOK are unused here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}
	163
	164	#else /* PGEN */
	165
	166	static char *
	167	error_ret(struct tok_state tok) / XXX */
	168	{
[391]	169	tok->decoding_erred = 1;
	170	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
	171	PyMem_FREE(tok->buf);
	172	tok->buf = NULL;
	173	return NULL; /* as if it were EOF */
[2]	174	}
	175
	176
	177	static char *
[391]	178	get_normal_name(char s) / for utf-8 and latin-1 */
[2]	179	{
[391]	180	char buf[13];
	181	int i;
	182	for (i = 0; i < 12; i++) {
	183	int c = s[i];
	184	if (c == '\0')
	185	break;
	186	else if (c == '_')
	187	buf[i] = '-';
	188	else
	189	buf[i] = tolower(c);
	190	}
	191	buf[i] = '\0';
	192	if (strcmp(buf, "utf-8") == 0 \|\|
	193	strncmp(buf, "utf-8-", 6) == 0)
	194	return "utf-8";
	195	else if (strcmp(buf, "latin-1") == 0 \|\|
	196	strcmp(buf, "iso-8859-1") == 0 \|\|
	197	strcmp(buf, "iso-latin-1") == 0 \|\|
	198	strncmp(buf, "latin-1-", 8) == 0 \|\|
	199	strncmp(buf, "iso-8859-1-", 11) == 0 \|\|
	200	strncmp(buf, "iso-latin-1-", 12) == 0)
	201	return "iso-8859-1";
	202	else
	203	return s;
[2]	204	}
	205
	206	/* Return the coding spec in S, or NULL if none is found. */
	207
	208	static char *
	209	get_coding_spec(const char *s, Py_ssize_t size)
	210	{
[391]	211	Py_ssize_t i;
	212	/* Coding spec must be in a comment, and that comment must be
	213	* the only statement on the source code line. */
	214	for (i = 0; i < size - 6; i++) {
	215	if (s[i] == '#')
	216	break;
	217	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
	218	return NULL;
	219	}
	220	for (; i < size - 6; i++) { /* XXX inefficient search */
	221	const char* t = s + i;
	222	if (strncmp(t, "coding", 6) == 0) {
	223	const char* begin = NULL;
	224	t += 6;
	225	if (t[0] != ':' && t[0] != '=')
	226	continue;
	227	do {
	228	t++;
	229	} while (t[0] == '\x20' \|\| t[0] == '\t');
[2]	230
[391]	231	begin = t;
	232	while (Py_ISALNUM(t[0]) \|\|
	233	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
	234	t++;
[2]	235
[391]	236	if (begin < t) {
	237	char* r = new_string(begin, t - begin);
	238	char* q = get_normal_name(r);
	239	if (r != q) {
	240	PyMem_FREE(r);
	241	r = new_string(q, strlen(q));
	242	}
	243	return r;
	244	}
	245	}
	246	}
	247	return NULL;
[2]	248	}
	249
	250	/* Check whether the line contains a coding spec. If it does,
	251	invoke the set_readline function for the new encoding.
	252	This function receives the tok_state and the new encoding.
	253	Return 1 on success, 0 on failure. */
	254
	255	static int
	256	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
[391]	257	int set_readline(struct tok_state , const char ))
[2]	258	{
[391]	259	char * cs;
	260	int r = 1;
[2]	261
[391]	262	if (tok->cont_line)
	263	/* It's a continuation line, so it can't be a coding spec. */
	264	return 1;
	265	cs = get_coding_spec(line, size);
	266	if (cs != NULL) {
	267	tok->read_coding_spec = 1;
	268	if (tok->encoding == NULL) {
	269	assert(tok->decoding_state == 1); /* raw */
	270	if (strcmp(cs, "utf-8") == 0 \|\|
	271	strcmp(cs, "iso-8859-1") == 0) {
	272	tok->encoding = cs;
	273	} else {
[2]	274	#ifdef Py_USING_UNICODE
[391]	275	r = set_readline(tok, cs);
	276	if (r) {
	277	tok->encoding = cs;
	278	tok->decoding_state = -1;
	279	}
	280	else {
	281	PyErr_Format(PyExc_SyntaxError,
	282	"encoding problem: %s", cs);
	283	PyMem_FREE(cs);
	284	}
[2]	285	#else
[391]	286	/* Without Unicode support, we cannot
	287	process the coding spec. Since there
	288	won't be any Unicode literals, that
	289	won't matter. */
	290	PyMem_FREE(cs);
[2]	291	#endif
[391]	292	}
	293	} else { /* then, compare cs with BOM */
	294	r = (strcmp(tok->encoding, cs) == 0);
	295	if (!r)
	296	PyErr_Format(PyExc_SyntaxError,
	297	"encoding problem: %s with BOM", cs);
	298	PyMem_FREE(cs);
	299	}
	300	}
	301	return r;
[2]	302	}
	303
	304	/* See whether the file starts with a BOM. If it does,
	305	invoke the set_readline function with the new encoding.
	306	Return 1 on success, 0 on failure. */
	307
	308	static int
	309	check_bom(int get_char(struct tok_state *),
[391]	310	void unget_char(int, struct tok_state *),
	311	int set_readline(struct tok_state , const char ),
	312	struct tok_state *tok)
[2]	313	{
[391]	314	int ch1, ch2, ch3;
	315	ch1 = get_char(tok);
	316	tok->decoding_state = 1;
	317	if (ch1 == EOF) {
	318	return 1;
	319	} else if (ch1 == 0xEF) {
	320	ch2 = get_char(tok);
	321	if (ch2 != 0xBB) {
	322	unget_char(ch2, tok);
	323	unget_char(ch1, tok);
	324	return 1;
	325	}
	326	ch3 = get_char(tok);
	327	if (ch3 != 0xBF) {
	328	unget_char(ch3, tok);
	329	unget_char(ch2, tok);
	330	unget_char(ch1, tok);
	331	return 1;
	332	}
[2]	333	#if 0
[391]	334	/* Disable support for UTF-16 BOMs until a decision
	335	is made whether this needs to be supported. */
	336	} else if (ch1 == 0xFE) {
	337	ch2 = get_char(tok);
	338	if (ch2 != 0xFF) {
	339	unget_char(ch2, tok);
	340	unget_char(ch1, tok);
	341	return 1;
	342	}
	343	if (!set_readline(tok, "utf-16-be"))
	344	return 0;
	345	tok->decoding_state = -1;
	346	} else if (ch1 == 0xFF) {
	347	ch2 = get_char(tok);
	348	if (ch2 != 0xFE) {
	349	unget_char(ch2, tok);
	350	unget_char(ch1, tok);
	351	return 1;
	352	}
	353	if (!set_readline(tok, "utf-16-le"))
	354	return 0;
	355	tok->decoding_state = -1;
[2]	356	#endif
[391]	357	} else {
	358	unget_char(ch1, tok);
	359	return 1;
	360	}
	361	if (tok->encoding != NULL)
	362	PyMem_FREE(tok->encoding);
	363	tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
	364	return 1;
[2]	365	}
	366
	367	/* Read a line of text from TOK into S, using the stream in TOK.
	368	Return NULL on failure, else S.
	369
	370	On entry, tok->decoding_buffer will be one of:
	371	1) NULL: need to call tok->decoding_readline to get a new line
	372	2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
[391]	373	stored the result in tok->decoding_buffer
[2]	374	3) PyStringObject *: previous call to fp_readl did not have enough room
[391]	375	(in the s buffer) to copy entire contents of the line read
	376	by tok->decoding_readline. tok->decoding_buffer has the overflow.
	377	In this case, fp_readl is called in a loop (with an expanded buffer)
	378	until the buffer ends with a '\n' (or until the end of the file is
	379	reached): see tok_nextc and its calls to decoding_fgets.
[2]	380	*/
	381
	382	static char *
	383	fp_readl(char s, int size, struct tok_state tok)
	384	{
	385	#ifndef Py_USING_UNICODE
[391]	386	/* In a non-Unicode built, this should never be called. */
	387	Py_FatalError("fp_readl should not be called in this build.");
	388	return NULL; /* Keep compiler happy (not reachable) */
[2]	389	#else
[391]	390	PyObject* utf8 = NULL;
	391	PyObject* buf = tok->decoding_buffer;
	392	char *str;
	393	Py_ssize_t utf8len;
[2]	394
[391]	395	/* Ask for one less byte so we can terminate it */
	396	assert(size > 0);
	397	size--;
[2]	398
[391]	399	if (buf == NULL) {
	400	buf = PyObject_CallObject(tok->decoding_readline, NULL);
	401	if (buf == NULL)
	402	return error_ret(tok);
	403	} else {
	404	tok->decoding_buffer = NULL;
	405	if (PyString_CheckExact(buf))
	406	utf8 = buf;
	407	}
	408	if (utf8 == NULL) {
	409	utf8 = PyUnicode_AsUTF8String(buf);
	410	Py_DECREF(buf);
	411	if (utf8 == NULL)
	412	return error_ret(tok);
	413	}
	414	str = PyString_AsString(utf8);
	415	utf8len = PyString_GET_SIZE(utf8);
	416	if (utf8len > size) {
	417	tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
	418	if (tok->decoding_buffer == NULL) {
	419	Py_DECREF(utf8);
	420	return error_ret(tok);
	421	}
	422	utf8len = size;
	423	}
	424	memcpy(s, str, utf8len);
	425	s[utf8len] = '\0';
	426	Py_DECREF(utf8);
	427	if (utf8len == 0)
	428	return NULL; /* EOF */
	429	return s;
[2]	430	#endif
	431	}
	432
	433	/* Set the readline function for TOK to a StreamReader's
	434	readline function. The StreamReader is named ENC.
	435
	436	This function is called from check_bom and check_coding_spec.
	437
	438	ENC is usually identical to the future value of tok->encoding,
	439	except for the (currently unsupported) case of UTF-16.
	440
	441	Return 1 on success, 0 on failure. */
	442
	443	static int
	444	fp_setreadl(struct tok_state tok, const char enc)
	445	{
[391]	446	PyObject reader, stream, *readline;
[2]	447
[391]	448	/* XXX: constify filename argument. */
	449	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	450	if (stream == NULL)
	451	return 0;
[2]	452
[391]	453	reader = PyCodec_StreamReader(enc, stream, NULL);
	454	Py_DECREF(stream);
	455	if (reader == NULL)
	456	return 0;
[2]	457
[391]	458	readline = PyObject_GetAttrString(reader, "readline");
	459	Py_DECREF(reader);
	460	if (readline == NULL)
	461	return 0;
[2]	462
[391]	463	tok->decoding_readline = readline;
	464	return 1;
[2]	465	}
	466
	467	/* Fetch the next byte from TOK. */
	468
	469	static int fp_getc(struct tok_state *tok) {
[391]	470	return getc(tok->fp);
[2]	471	}
	472
	473	/* Unfetch the last byte back into TOK. */
	474
	475	static void fp_ungetc(int c, struct tok_state *tok) {
[391]	476	ungetc(c, tok->fp);
[2]	477	}
	478
	479	/* Read a line of input from TOK. Determine encoding
	480	if necessary. */
	481
	482	static char *
	483	decoding_fgets(char s, int size, struct tok_state tok)
	484	{
[391]	485	char *line = NULL;
	486	int badchar = 0;
	487	for (;;) {
	488	if (tok->decoding_state < 0) {
	489	/* We already have a codec associated with
	490	this input. */
	491	line = fp_readl(s, size, tok);
	492	break;
	493	} else if (tok->decoding_state > 0) {
	494	/* We want a 'raw' read. */
	495	line = Py_UniversalNewlineFgets(s, size,
	496	tok->fp, NULL);
	497	break;
	498	} else {
	499	/* We have not yet determined the encoding.
	500	If an encoding is found, use the file-pointer
	501	reader functions from now on. */
	502	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
	503	return error_ret(tok);
	504	assert(tok->decoding_state != 0);
	505	}
	506	}
	507	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
	508	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
	509	return error_ret(tok);
	510	}
	511	}
[2]	512	#ifndef PGEN
[391]	513	/* The default encoding is ASCII, so make sure we don't have any
	514	non-ASCII bytes in it. */
	515	if (line && !tok->encoding) {
	516	unsigned char *c;
	517	for (c = (unsigned char )line; c; c++)
	518	if (*c > 127) {
	519	badchar = *c;
	520	break;
	521	}
	522	}
	523	if (badchar) {
	524	char buf[500];
	525	/* Need to add 1 to the line number, since this line
	526	has not been counted, yet. */
	527	sprintf(buf,
	528	"Non-ASCII character '\\x%.2x' "
	529	"in file %.200s on line %i, "
	530	"but no encoding declared; "
	531	"see http://www.python.org/peps/pep-0263.html for details",
	532	badchar, tok->filename, tok->lineno + 1);
	533	PyErr_SetString(PyExc_SyntaxError, buf);
	534	return error_ret(tok);
	535	}
[2]	536	#endif
[391]	537	return line;
[2]	538	}
	539
	540	static int
	541	decoding_feof(struct tok_state *tok)
	542	{
[391]	543	if (tok->decoding_state >= 0) {
	544	return feof(tok->fp);
	545	} else {
	546	PyObject* buf = tok->decoding_buffer;
	547	if (buf == NULL) {
	548	buf = PyObject_CallObject(tok->decoding_readline, NULL);
	549	if (buf == NULL) {
	550	error_ret(tok);
	551	return 1;
	552	} else {
	553	tok->decoding_buffer = buf;
	554	}
	555	}
	556	return PyObject_Length(buf) == 0;
	557	}
[2]	558	}
	559
	560	/* Fetch a byte from TOK, using the string buffer. */
	561
	562	static int
	563	buf_getc(struct tok_state *tok) {
[391]	564	return Py_CHARMASK(*tok->str++);
[2]	565	}
	566
	567	/* Unfetch a byte from TOK, using the string buffer. */
	568
	569	static void
	570	buf_ungetc(int c, struct tok_state *tok) {
[391]	571	tok->str--;
	572	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
[2]	573	}
	574
	575	/* Set the readline function for TOK to ENC. For the string-based
	576	tokenizer, this means to just record the encoding. */
	577
	578	static int
	579	buf_setreadl(struct tok_state tok, const char enc) {
[391]	580	tok->enc = enc;
	581	return 1;
[2]	582	}
	583
	584	/* Return a UTF-8 encoding Python string object from the
	585	C byte string STR, which is encoded with ENC. */
	586
	587	#ifdef Py_USING_UNICODE
	588	static PyObject *
	589	translate_into_utf8(const char* str, const char* enc) {
[391]	590	PyObject *utf8;
	591	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	592	if (buf == NULL)
	593	return NULL;
	594	utf8 = PyUnicode_AsUTF8String(buf);
	595	Py_DECREF(buf);
	596	return utf8;
[2]	597	}
	598	#endif
	599
[391]	600
	601	static char *
	602	translate_newlines(const char s, int exec_input, struct tok_state tok) {
	603	int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
	604	char buf, current;
	605	char c = '\0';
	606	buf = PyMem_MALLOC(needed_length);
	607	if (buf == NULL) {
	608	tok->done = E_NOMEM;
	609	return NULL;
	610	}
	611	for (current = buf; *s; s++, current++) {
	612	c = *s;
	613	if (skip_next_lf) {
	614	skip_next_lf = 0;
	615	if (c == '\n') {
	616	c = *++s;
	617	if (!c)
	618	break;
	619	}
	620	}
	621	if (c == '\r') {
	622	skip_next_lf = 1;
	623	c = '\n';
	624	}
	625	*current = c;
	626	}
	627	/* If this is exec input, add a newline to the end of the string if
	628	there isn't one already. */
	629	if (exec_input && c != '\n') {
	630	*current = '\n';
	631	current++;
	632	}
	633	*current = '\0';
	634	final_length = current - buf + 1;
	635	if (final_length < needed_length && final_length)
	636	/* should never fail */
	637	buf = PyMem_REALLOC(buf, final_length);
	638	return buf;
	639	}
	640
[2]	641	/* Decode a byte string STR for use as the buffer of TOK.
	642	Look for encoding declarations inside STR, and record them
	643	inside TOK. */
	644
	645	static const char *
[391]	646	decode_str(const char input, int single, struct tok_state tok)
[2]	647	{
[391]	648	PyObject* utf8 = NULL;
	649	const char *str;
	650	const char *s;
	651	const char *newl[2] = {NULL, NULL};
	652	int lineno = 0;
	653	tok->input = str = translate_newlines(input, single, tok);
	654	if (str == NULL)
	655	return NULL;
	656	tok->enc = NULL;
	657	tok->str = str;
	658	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
	659	return error_ret(tok);
	660	str = tok->str; /* string after BOM if any */
	661	assert(str);
[2]	662	#ifdef Py_USING_UNICODE
[391]	663	if (tok->enc != NULL) {
	664	utf8 = translate_into_utf8(str, tok->enc);
	665	if (utf8 == NULL)
	666	return error_ret(tok);
	667	str = PyString_AsString(utf8);
	668	}
[2]	669	#endif
[391]	670	for (s = str;; s++) {
	671	if (*s == '\0') break;
	672	else if (*s == '\n') {
	673	assert(lineno < 2);
	674	newl[lineno] = s;
	675	lineno++;
	676	if (lineno == 2) break;
	677	}
	678	}
	679	tok->enc = NULL;
	680	/* need to check line 1 and 2 separately since check_coding_spec
	681	assumes a single line as input */
	682	if (newl[0]) {
	683	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
	684	return error_ret(tok);
	685	if (tok->enc == NULL && newl[1]) {
	686	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
	687	tok, buf_setreadl))
	688	return error_ret(tok);
	689	}
	690	}
[2]	691	#ifdef Py_USING_UNICODE
[391]	692	if (tok->enc != NULL) {
	693	assert(utf8 == NULL);
	694	utf8 = translate_into_utf8(str, tok->enc);
	695	if (utf8 == NULL)
	696	return error_ret(tok);
	697	str = PyString_AsString(utf8);
	698	}
[2]	699	#endif
[391]	700	assert(tok->decoding_buffer == NULL);
	701	tok->decoding_buffer = utf8; /* CAUTION */
	702	return str;
[2]	703	}
	704
	705	#endif /* PGEN */
	706
	707	/* Set up tokenizer for string */
	708
	709	struct tok_state *
[391]	710	PyTokenizer_FromString(const char *str, int exec_input)
[2]	711	{
[391]	712	struct tok_state *tok = tok_new();
	713	if (tok == NULL)
	714	return NULL;
	715	str = (char *)decode_str(str, exec_input, tok);
	716	if (str == NULL) {
	717	PyTokenizer_Free(tok);
	718	return NULL;
	719	}
[2]	720
[391]	721	/* XXX: constify members. */
	722	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
	723	return tok;
[2]	724	}
	725
	726
	727	/* Set up tokenizer for file */
	728
	729	struct tok_state *
	730	PyTokenizer_FromFile(FILE fp, char ps1, char *ps2)
	731	{
[391]	732	struct tok_state *tok = tok_new();
	733	if (tok == NULL)
	734	return NULL;
	735	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
	736	PyTokenizer_Free(tok);
	737	return NULL;
	738	}
	739	tok->cur = tok->inp = tok->buf;
	740	tok->end = tok->buf + BUFSIZ;
	741	tok->fp = fp;
	742	tok->prompt = ps1;
	743	tok->nextprompt = ps2;
	744	return tok;
[2]	745	}
	746
	747
	748	/* Free a tok_state structure */
	749
	750	void
	751	PyTokenizer_Free(struct tok_state *tok)
	752	{
[391]	753	if (tok->encoding != NULL)
	754	PyMem_FREE(tok->encoding);
[2]	755	#ifndef PGEN
[391]	756	Py_XDECREF(tok->decoding_readline);
	757	Py_XDECREF(tok->decoding_buffer);
[2]	758	#endif
[391]	759	if (tok->fp != NULL && tok->buf != NULL)
	760	PyMem_FREE(tok->buf);
	761	if (tok->input)
	762	PyMem_FREE((char *)tok->input);
	763	PyMem_FREE(tok);
[2]	764	}
	765
	766	#if !defined(PGEN) && defined(Py_USING_UNICODE)
	767	static int
	768	tok_stdin_decode(struct tok_state tok, char *inp)
	769	{
[391]	770	PyObject enc, sysstdin, decoded, utf8;
	771	const char *encoding;
	772	char *converted;
[2]	773
[391]	774	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
	775	return 0;
	776	sysstdin = PySys_GetObject("stdin");
	777	if (sysstdin == NULL \|\| !PyFile_Check(sysstdin))
	778	return 0;
[2]	779
[391]	780	enc = ((PyFileObject *)sysstdin)->f_encoding;
	781	if (enc == NULL \|\| !PyString_Check(enc))
	782	return 0;
	783	Py_INCREF(enc);
[2]	784
[391]	785	encoding = PyString_AsString(enc);
	786	decoded = PyUnicode_Decode(inp, strlen(inp), encoding, NULL);
	787	if (decoded == NULL)
	788	goto error_clear;
[2]	789
[391]	790	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	791	Py_DECREF(decoded);
	792	if (utf8 == NULL)
	793	goto error_clear;
[2]	794
[391]	795	assert(PyString_Check(utf8));
	796	converted = new_string(PyString_AS_STRING(utf8),
	797	PyString_GET_SIZE(utf8));
	798	Py_DECREF(utf8);
	799	if (converted == NULL)
	800	goto error_nomem;
[2]	801
[391]	802	PyMem_FREE(*inp);
	803	*inp = converted;
	804	if (tok->encoding != NULL)
	805	PyMem_FREE(tok->encoding);
	806	tok->encoding = new_string(encoding, strlen(encoding));
	807	if (tok->encoding == NULL)
	808	goto error_nomem;
[2]	809
[391]	810	Py_DECREF(enc);
	811	return 0;
[2]	812
	813	error_nomem:
[391]	814	Py_DECREF(enc);
	815	tok->done = E_NOMEM;
	816	return -1;
[2]	817
	818	error_clear:
[391]	819	Py_DECREF(enc);
	820	if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
	821	tok->done = E_ERROR;
	822	return -1;
	823	}
	824	/* Fallback to iso-8859-1: for backward compatibility */
	825	PyErr_Clear();
	826	return 0;
[2]	827	}
	828	#endif
	829
	830	/* Get next char, updating state; error code goes into tok->done */
	831
	832	static int
	833	tok_nextc(register struct tok_state *tok)
	834	{
[391]	835	for (;;) {
	836	if (tok->cur != tok->inp) {
	837	return Py_CHARMASK(tok->cur++); / Fast path */
	838	}
	839	if (tok->done != E_OK)
	840	return EOF;
	841	if (tok->fp == NULL) {
	842	char *end = strchr(tok->inp, '\n');
	843	if (end != NULL)
	844	end++;
	845	else {
	846	end = strchr(tok->inp, '\0');
	847	if (end == tok->inp) {
	848	tok->done = E_EOF;
	849	return EOF;
	850	}
	851	}
	852	if (tok->start == NULL)
	853	tok->buf = tok->cur;
	854	tok->line_start = tok->cur;
	855	tok->lineno++;
	856	tok->inp = end;
	857	return Py_CHARMASK(*tok->cur++);
	858	}
	859	if (tok->prompt != NULL) {
	860	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
	861	if (tok->nextprompt != NULL)
	862	tok->prompt = tok->nextprompt;
	863	if (newtok == NULL)
	864	tok->done = E_INTR;
	865	else if (*newtok == '\0') {
	866	PyMem_FREE(newtok);
	867	tok->done = E_EOF;
	868	}
[2]	869	#if !defined(PGEN) && defined(Py_USING_UNICODE)
[391]	870	else if (tok_stdin_decode(tok, &newtok) != 0)
	871	PyMem_FREE(newtok);
[2]	872	#endif
[391]	873	else if (tok->start != NULL) {
	874	size_t start = tok->start - tok->buf;
	875	size_t oldlen = tok->cur - tok->buf;
	876	size_t newlen = oldlen + strlen(newtok);
	877	char *buf = tok->buf;
	878	buf = (char *)PyMem_REALLOC(buf, newlen+1);
	879	tok->lineno++;
	880	if (buf == NULL) {
	881	PyMem_FREE(tok->buf);
	882	tok->buf = NULL;
	883	PyMem_FREE(newtok);
	884	tok->done = E_NOMEM;
	885	return EOF;
	886	}
	887	tok->buf = buf;
	888	tok->cur = tok->buf + oldlen;
	889	tok->line_start = tok->cur;
	890	strcpy(tok->buf + oldlen, newtok);
	891	PyMem_FREE(newtok);
	892	tok->inp = tok->buf + newlen;
	893	tok->end = tok->inp + 1;
	894	tok->start = tok->buf + start;
	895	}
	896	else {
	897	tok->lineno++;
	898	if (tok->buf != NULL)
	899	PyMem_FREE(tok->buf);
	900	tok->buf = newtok;
	901	tok->line_start = tok->buf;
	902	tok->cur = tok->buf;
	903	tok->line_start = tok->buf;
	904	tok->inp = strchr(tok->buf, '\0');
	905	tok->end = tok->inp + 1;
	906	}
	907	}
	908	else {
	909	int done = 0;
	910	Py_ssize_t cur = 0;
	911	char *pt;
	912	if (tok->start == NULL) {
	913	if (tok->buf == NULL) {
	914	tok->buf = (char *)
	915	PyMem_MALLOC(BUFSIZ);
	916	if (tok->buf == NULL) {
	917	tok->done = E_NOMEM;
	918	return EOF;
	919	}
	920	tok->end = tok->buf + BUFSIZ;
	921	}
	922	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
	923	tok) == NULL) {
	924	tok->done = E_EOF;
	925	done = 1;
	926	}
	927	else {
	928	tok->done = E_OK;
	929	tok->inp = strchr(tok->buf, '\0');
	930	done = tok->inp[-1] == '\n';
	931	}
	932	}
	933	else {
	934	cur = tok->cur - tok->buf;
	935	if (decoding_feof(tok)) {
	936	tok->done = E_EOF;
	937	done = 1;
	938	}
	939	else
	940	tok->done = E_OK;
	941	}
	942	tok->lineno++;
	943	/* Read until '\n' or EOF */
	944	while (!done) {
	945	Py_ssize_t curstart = tok->start == NULL ? -1 :
	946	tok->start - tok->buf;
	947	Py_ssize_t curvalid = tok->inp - tok->buf;
	948	Py_ssize_t newsize = curvalid + BUFSIZ;
	949	char *newbuf = tok->buf;
	950	newbuf = (char *)PyMem_REALLOC(newbuf,
	951	newsize);
	952	if (newbuf == NULL) {
	953	tok->done = E_NOMEM;
	954	tok->cur = tok->inp;
	955	return EOF;
	956	}
	957	tok->buf = newbuf;
	958	tok->inp = tok->buf + curvalid;
	959	tok->end = tok->buf + newsize;
	960	tok->start = curstart < 0 ? NULL :
	961	tok->buf + curstart;
	962	if (decoding_fgets(tok->inp,
	963	(int)(tok->end - tok->inp),
	964	tok) == NULL) {
	965	/* Break out early on decoding
	966	errors, as tok->buf will be NULL
	967	*/
	968	if (tok->decoding_erred)
	969	return EOF;
	970	/* Last line does not end in \n,
	971	fake one */
	972	strcpy(tok->inp, "\n");
	973	}
	974	tok->inp = strchr(tok->inp, '\0');
	975	done = tok->inp[-1] == '\n';
	976	}
	977	if (tok->buf != NULL) {
	978	tok->cur = tok->buf + cur;
	979	tok->line_start = tok->cur;
	980	/* replace "\r\n" with "\n" */
	981	/* For Mac leave the \r, giving a syntax error */
	982	pt = tok->inp - 2;
	983	if (pt >= tok->buf && *pt == '\r') {
	984	*pt++ = '\n';
	985	*pt = '\0';
	986	tok->inp = pt;
	987	}
	988	}
	989	}
	990	if (tok->done != E_OK) {
	991	if (tok->prompt != NULL)
	992	PySys_WriteStderr("\n");
	993	tok->cur = tok->inp;
	994	return EOF;
	995	}
	996	}
	997	/NOTREACHED/
[2]	998	}
	999
	1000
	1001	/* Back-up one character */
	1002
	1003	static void
	1004	tok_backup(register struct tok_state *tok, register int c)
	1005	{
[391]	1006	if (c != EOF) {
	1007	if (--tok->cur < tok->buf)
	1008	Py_FatalError("tok_backup: beginning of buffer");
	1009	if (*tok->cur != c)
	1010	*tok->cur = c;
	1011	}
[2]	1012	}
	1013
	1014
	1015	/* Return the token corresponding to a single character */
	1016
	1017	int
	1018	PyToken_OneChar(int c)
	1019	{
[391]	1020	switch (c) {
	1021	case '(': return LPAR;
	1022	case ')': return RPAR;
	1023	case '[': return LSQB;
	1024	case ']': return RSQB;
	1025	case ':': return COLON;
	1026	case ',': return COMMA;
	1027	case ';': return SEMI;
	1028	case '+': return PLUS;
	1029	case '-': return MINUS;
	1030	case '*': return STAR;
	1031	case '/': return SLASH;
	1032	case '\|': return VBAR;
	1033	case '&': return AMPER;
	1034	case '<': return LESS;
	1035	case '>': return GREATER;
	1036	case '=': return EQUAL;
	1037	case '.': return DOT;
	1038	case '%': return PERCENT;
	1039	case '`': return BACKQUOTE;
	1040	case '{': return LBRACE;
	1041	case '}': return RBRACE;
	1042	case '^': return CIRCUMFLEX;
	1043	case '~': return TILDE;
	1044	case '@': return AT;
	1045	default: return OP;
	1046	}
[2]	1047	}
	1048
	1049
	1050	int
	1051	PyToken_TwoChars(int c1, int c2)
	1052	{
[391]	1053	switch (c1) {
	1054	case '=':
	1055	switch (c2) {
	1056	case '=': return EQEQUAL;
	1057	}
	1058	break;
	1059	case '!':
	1060	switch (c2) {
	1061	case '=': return NOTEQUAL;
	1062	}
	1063	break;
	1064	case '<':
	1065	switch (c2) {
	1066	case '>': return NOTEQUAL;
	1067	case '=': return LESSEQUAL;
	1068	case '<': return LEFTSHIFT;
	1069	}
	1070	break;
	1071	case '>':
	1072	switch (c2) {
	1073	case '=': return GREATEREQUAL;
	1074	case '>': return RIGHTSHIFT;
	1075	}
	1076	break;
	1077	case '+':
	1078	switch (c2) {
	1079	case '=': return PLUSEQUAL;
	1080	}
	1081	break;
	1082	case '-':
	1083	switch (c2) {
	1084	case '=': return MINEQUAL;
	1085	}
	1086	break;
	1087	case '*':
	1088	switch (c2) {
	1089	case '*': return DOUBLESTAR;
	1090	case '=': return STAREQUAL;
	1091	}
	1092	break;
	1093	case '/':
	1094	switch (c2) {
	1095	case '/': return DOUBLESLASH;
	1096	case '=': return SLASHEQUAL;
	1097	}
	1098	break;
	1099	case '\|':
	1100	switch (c2) {
	1101	case '=': return VBAREQUAL;
	1102	}
	1103	break;
	1104	case '%':
	1105	switch (c2) {
	1106	case '=': return PERCENTEQUAL;
	1107	}
	1108	break;
	1109	case '&':
	1110	switch (c2) {
	1111	case '=': return AMPEREQUAL;
	1112	}
	1113	break;
	1114	case '^':
	1115	switch (c2) {
	1116	case '=': return CIRCUMFLEXEQUAL;
	1117	}
	1118	break;
	1119	}
	1120	return OP;
[2]	1121	}
	1122
	1123	int
	1124	PyToken_ThreeChars(int c1, int c2, int c3)
	1125	{
[391]	1126	switch (c1) {
	1127	case '<':
	1128	switch (c2) {
	1129	case '<':
	1130	switch (c3) {
	1131	case '=':
	1132	return LEFTSHIFTEQUAL;
	1133	}
	1134	break;
	1135	}
	1136	break;
	1137	case '>':
	1138	switch (c2) {
	1139	case '>':
	1140	switch (c3) {
	1141	case '=':
	1142	return RIGHTSHIFTEQUAL;
	1143	}
	1144	break;
	1145	}
	1146	break;
	1147	case '*':
	1148	switch (c2) {
	1149	case '*':
	1150	switch (c3) {
	1151	case '=':
	1152	return DOUBLESTAREQUAL;
	1153	}
	1154	break;
	1155	}
	1156	break;
	1157	case '/':
	1158	switch (c2) {
	1159	case '/':
	1160	switch (c3) {
	1161	case '=':
	1162	return DOUBLESLASHEQUAL;
	1163	}
	1164	break;
	1165	}
	1166	break;
	1167	}
	1168	return OP;
[2]	1169	}
	1170
	1171	static int
	1172	indenterror(struct tok_state *tok)
	1173	{
[391]	1174	if (tok->alterror) {
	1175	tok->done = E_TABSPACE;
	1176	tok->cur = tok->inp;
	1177	return 1;
	1178	}
	1179	if (tok->altwarning) {
	1180	PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
	1181	"in indentation\n", tok->filename);
	1182	tok->altwarning = 0;
	1183	}
	1184	return 0;
[2]	1185	}
	1186
	1187	/* Get next token, after space stripping etc. */
	1188
	1189	static int
	1190	tok_get(register struct tok_state tok, char p_start, char *p_end)
	1191	{
[391]	1192	register int c;
	1193	int blankline;
[2]	1194
[391]	1195	p_start = p_end = NULL;
[2]	1196	nextline:
[391]	1197	tok->start = NULL;
	1198	blankline = 0;
[2]	1199
[391]	1200	/* Get indentation level */
	1201	if (tok->atbol) {
	1202	register int col = 0;
	1203	register int altcol = 0;
	1204	tok->atbol = 0;
	1205	for (;;) {
	1206	c = tok_nextc(tok);
	1207	if (c == ' ')
	1208	col++, altcol++;
	1209	else if (c == '\t') {
	1210	col = (col/tok->tabsize + 1) * tok->tabsize;
	1211	altcol = (altcol/tok->alttabsize + 1)
	1212	* tok->alttabsize;
	1213	}
	1214	else if (c == '\014') /* Control-L (formfeed) */
	1215	col = altcol = 0; /* For Emacs users */
	1216	else
	1217	break;
	1218	}
	1219	tok_backup(tok, c);
	1220	if (c == '#' \|\| c == '\n') {
	1221	/* Lines with only whitespace and/or comments
	1222	shouldn't affect the indentation and are
	1223	not passed to the parser as NEWLINE tokens,
	1224	except totally empty lines in interactive
	1225	mode, which signal the end of a command group. */
	1226	if (col == 0 && c == '\n' && tok->prompt != NULL)
	1227	blankline = 0; /* Let it through */
	1228	else
	1229	blankline = 1; /* Ignore completely */
	1230	/* We can't jump back right here since we still
	1231	may need to skip to the end of a comment */
	1232	}
	1233	if (!blankline && tok->level == 0) {
	1234	if (col == tok->indstack[tok->indent]) {
	1235	/* No change */
	1236	if (altcol != tok->altindstack[tok->indent]) {
	1237	if (indenterror(tok))
	1238	return ERRORTOKEN;
	1239	}
	1240	}
	1241	else if (col > tok->indstack[tok->indent]) {
	1242	/* Indent -- always one */
	1243	if (tok->indent+1 >= MAXINDENT) {
	1244	tok->done = E_TOODEEP;
	1245	tok->cur = tok->inp;
	1246	return ERRORTOKEN;
	1247	}
	1248	if (altcol <= tok->altindstack[tok->indent]) {
	1249	if (indenterror(tok))
	1250	return ERRORTOKEN;
	1251	}
	1252	tok->pendin++;
	1253	tok->indstack[++tok->indent] = col;
	1254	tok->altindstack[tok->indent] = altcol;
	1255	}
	1256	else /* col < tok->indstack[tok->indent] */ {
	1257	/* Dedent -- any number, must be consistent */
	1258	while (tok->indent > 0 &&
	1259	col < tok->indstack[tok->indent]) {
	1260	tok->pendin--;
	1261	tok->indent--;
	1262	}
	1263	if (col != tok->indstack[tok->indent]) {
	1264	tok->done = E_DEDENT;
	1265	tok->cur = tok->inp;
	1266	return ERRORTOKEN;
	1267	}
	1268	if (altcol != tok->altindstack[tok->indent]) {
	1269	if (indenterror(tok))
	1270	return ERRORTOKEN;
	1271	}
	1272	}
	1273	}
	1274	}
[2]	1275
[391]	1276	tok->start = tok->cur;
[2]	1277
[391]	1278	/* Return pending indents/dedents */
	1279	if (tok->pendin != 0) {
	1280	if (tok->pendin < 0) {
	1281	tok->pendin++;
	1282	return DEDENT;
	1283	}
	1284	else {
	1285	tok->pendin--;
	1286	return INDENT;
	1287	}
	1288	}
[2]	1289
	1290	again:
[391]	1291	tok->start = NULL;
	1292	/* Skip spaces */
	1293	do {
	1294	c = tok_nextc(tok);
	1295	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
[2]	1296
[391]	1297	/* Set start of current token */
	1298	tok->start = tok->cur - 1;
[2]	1299
[391]	1300	/* Skip comment, while looking for tab-setting magic */
	1301	if (c == '#') {
	1302	static char *tabforms[] = {
	1303	"tab-width:", /* Emacs */
	1304	":tabstop=", /* vim, full form */
	1305	":ts=", /* vim, abbreviated form */
	1306	"set tabsize=", /* will vi never die? */
	1307	/* more templates can be added here to support other editors */
	1308	};
	1309	char cbuf[80];
	1310	char tp, *cp;
	1311	tp = cbuf;
	1312	do {
	1313	*tp++ = c = tok_nextc(tok);
	1314	} while (c != EOF && c != '\n' &&
	1315	(size_t)(tp - cbuf + 1) < sizeof(cbuf));
	1316	*tp = '\0';
	1317	for (cp = tabforms;
	1318	cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
	1319	cp++) {
	1320	if ((tp = strstr(cbuf, *cp))) {
	1321	int newsize = atoi(tp + strlen(*cp));
[2]	1322
[391]	1323	if (newsize >= 1 && newsize <= 40) {
	1324	tok->tabsize = newsize;
	1325	if (Py_VerboseFlag)
	1326	PySys_WriteStderr(
	1327	"Tab size set to %d\n",
	1328	newsize);
	1329	}
	1330	}
	1331	}
	1332	while (c != EOF && c != '\n')
	1333	c = tok_nextc(tok);
	1334	}
[2]	1335
[391]	1336	/* Check for EOF and errors now */
	1337	if (c == EOF) {
	1338	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
	1339	}
[2]	1340
[391]	1341	/* Identifier (most frequent token!) */
	1342	if (Py_ISALPHA(c) \|\| c == '_') {
	1343	/* Process r"", u"" and ur"" */
	1344	switch (c) {
	1345	case 'b':
	1346	case 'B':
	1347	c = tok_nextc(tok);
	1348	if (c == 'r' \|\| c == 'R')
	1349	c = tok_nextc(tok);
	1350	if (c == '"' \|\| c == '\'')
	1351	goto letter_quote;
	1352	break;
	1353	case 'r':
	1354	case 'R':
	1355	c = tok_nextc(tok);
	1356	if (c == '"' \|\| c == '\'')
	1357	goto letter_quote;
	1358	break;
	1359	case 'u':
	1360	case 'U':
	1361	c = tok_nextc(tok);
	1362	if (c == 'r' \|\| c == 'R')
	1363	c = tok_nextc(tok);
	1364	if (c == '"' \|\| c == '\'')
	1365	goto letter_quote;
	1366	break;
	1367	}
	1368	while (c != EOF && (Py_ISALNUM(c) \|\| c == '_')) {
	1369	c = tok_nextc(tok);
[105]	1370	#ifdef __KLIBC__
[391]	1371	if (c == EOF)
	1372	break;
[105]	1373	#endif
[391]	1374	}
	1375	tok_backup(tok, c);
	1376	*p_start = tok->start;
	1377	*p_end = tok->cur;
	1378	return NAME;
	1379	}
[2]	1380
[391]	1381	/* Newline */
	1382	if (c == '\n') {
	1383	tok->atbol = 1;
	1384	if (blankline \|\| tok->level > 0)
	1385	goto nextline;
	1386	*p_start = tok->start;
	1387	p_end = tok->cur - 1; / Leave '\n' out of the string */
	1388	tok->cont_line = 0;
	1389	return NEWLINE;
	1390	}
[2]	1391
[391]	1392	/* Period or number starting with period? */
	1393	if (c == '.') {
	1394	c = tok_nextc(tok);
	1395	if (isdigit(c)) {
	1396	goto fraction;
	1397	}
	1398	else {
	1399	tok_backup(tok, c);
	1400	*p_start = tok->start;
	1401	*p_end = tok->cur;
	1402	return DOT;
	1403	}
	1404	}
[2]	1405
[391]	1406	/* Number */
	1407	if (isdigit(c)) {
	1408	if (c == '0') {
	1409	/* Hex, octal or binary -- maybe. */
	1410	c = tok_nextc(tok);
	1411	if (c == '.')
	1412	goto fraction;
[2]	1413	#ifndef WITHOUT_COMPLEX
[391]	1414	if (c == 'j' \|\| c == 'J')
	1415	goto imaginary;
[2]	1416	#endif
[391]	1417	if (c == 'x' \|\| c == 'X') {
[2]	1418
[391]	1419	/* Hex */
	1420	c = tok_nextc(tok);
	1421	if (!isxdigit(c)) {
	1422	tok->done = E_TOKEN;
	1423	tok_backup(tok, c);
	1424	return ERRORTOKEN;
	1425	}
	1426	do {
	1427	c = tok_nextc(tok);
	1428	} while (isxdigit(c));
	1429	}
	1430	else if (c == 'o' \|\| c == 'O') {
	1431	/* Octal */
	1432	c = tok_nextc(tok);
	1433	if (c < '0' \|\| c >= '8') {
	1434	tok->done = E_TOKEN;
	1435	tok_backup(tok, c);
	1436	return ERRORTOKEN;
	1437	}
	1438	do {
	1439	c = tok_nextc(tok);
	1440	} while ('0' <= c && c < '8');
	1441	}
	1442	else if (c == 'b' \|\| c == 'B') {
	1443	/* Binary */
	1444	c = tok_nextc(tok);
	1445	if (c != '0' && c != '1') {
	1446	tok->done = E_TOKEN;
	1447	tok_backup(tok, c);
	1448	return ERRORTOKEN;
	1449	}
	1450	do {
	1451	c = tok_nextc(tok);
	1452	} while (c == '0' \|\| c == '1');
	1453	}
	1454	else {
	1455	int found_decimal = 0;
	1456	/* Octal; c is first char of it */
	1457	/* There's no 'isoctdigit' macro, sigh */
	1458	while ('0' <= c && c < '8') {
	1459	c = tok_nextc(tok);
	1460	}
	1461	if (isdigit(c)) {
	1462	found_decimal = 1;
	1463	do {
	1464	c = tok_nextc(tok);
	1465	} while (isdigit(c));
	1466	}
	1467	if (c == '.')
	1468	goto fraction;
	1469	else if (c == 'e' \|\| c == 'E')
	1470	goto exponent;
[2]	1471	#ifndef WITHOUT_COMPLEX
[391]	1472	else if (c == 'j' \|\| c == 'J')
	1473	goto imaginary;
[2]	1474	#endif
[391]	1475	else if (found_decimal) {
	1476	tok->done = E_TOKEN;
	1477	tok_backup(tok, c);
	1478	return ERRORTOKEN;
	1479	}
	1480	}
	1481	if (c == 'l' \|\| c == 'L')
	1482	c = tok_nextc(tok);
	1483	}
	1484	else {
	1485	/* Decimal */
	1486	do {
	1487	c = tok_nextc(tok);
	1488	} while (isdigit(c));
	1489	if (c == 'l' \|\| c == 'L')
	1490	c = tok_nextc(tok);
	1491	else {
	1492	/* Accept floating point numbers. */
	1493	if (c == '.') {
	1494	fraction:
	1495	/* Fraction */
	1496	do {
	1497	c = tok_nextc(tok);
	1498	} while (isdigit(c));
	1499	}
	1500	if (c == 'e' \|\| c == 'E') {
	1501	exponent:
	1502	/* Exponent part */
	1503	c = tok_nextc(tok);
	1504	if (c == '+' \|\| c == '-')
	1505	c = tok_nextc(tok);
	1506	if (!isdigit(c)) {
	1507	tok->done = E_TOKEN;
	1508	tok_backup(tok, c);
	1509	return ERRORTOKEN;
	1510	}
	1511	do {
	1512	c = tok_nextc(tok);
	1513	} while (isdigit(c));
	1514	}
[2]	1515	#ifndef WITHOUT_COMPLEX
[391]	1516	if (c == 'j' \|\| c == 'J')
	1517	/* Imaginary part */
	1518	imaginary:
	1519	c = tok_nextc(tok);
[2]	1520	#endif
[391]	1521	}
	1522	}
	1523	tok_backup(tok, c);
	1524	*p_start = tok->start;
	1525	*p_end = tok->cur;
	1526	return NUMBER;
	1527	}
[2]	1528
	1529	letter_quote:
[391]	1530	/* String */
	1531	if (c == '\'' \|\| c == '"') {
	1532	Py_ssize_t quote2 = tok->cur - tok->start + 1;
	1533	int quote = c;
	1534	int triple = 0;
	1535	int tripcount = 0;
	1536	for (;;) {
	1537	c = tok_nextc(tok);
	1538	if (c == '\n') {
	1539	if (!triple) {
	1540	tok->done = E_EOLS;
	1541	tok_backup(tok, c);
	1542	return ERRORTOKEN;
	1543	}
	1544	tripcount = 0;
	1545	tok->cont_line = 1; /* multiline string. */
	1546	}
	1547	else if (c == EOF) {
	1548	if (triple)
	1549	tok->done = E_EOFS;
	1550	else
	1551	tok->done = E_EOLS;
	1552	tok->cur = tok->inp;
	1553	return ERRORTOKEN;
	1554	}
	1555	else if (c == quote) {
	1556	tripcount++;
	1557	if (tok->cur - tok->start == quote2) {
	1558	c = tok_nextc(tok);
	1559	if (c == quote) {
	1560	triple = 1;
	1561	tripcount = 0;
	1562	continue;
	1563	}
	1564	tok_backup(tok, c);
	1565	}
	1566	if (!triple \|\| tripcount == 3)
	1567	break;
	1568	}
	1569	else if (c == '\\') {
	1570	tripcount = 0;
	1571	c = tok_nextc(tok);
	1572	if (c == EOF) {
	1573	tok->done = E_EOLS;
	1574	tok->cur = tok->inp;
	1575	return ERRORTOKEN;
	1576	}
	1577	}
	1578	else
	1579	tripcount = 0;
	1580	}
	1581	*p_start = tok->start;
	1582	*p_end = tok->cur;
	1583	return STRING;
	1584	}
[2]	1585
[391]	1586	/* Line continuation */
	1587	if (c == '\\') {
	1588	c = tok_nextc(tok);
	1589	if (c != '\n') {
	1590	tok->done = E_LINECONT;
	1591	tok->cur = tok->inp;
	1592	return ERRORTOKEN;
	1593	}
	1594	tok->cont_line = 1;
	1595	goto again; /* Read next line */
	1596	}
[2]	1597
[391]	1598	/* Check for two-character token */
	1599	{
	1600	int c2 = tok_nextc(tok);
	1601	int token = PyToken_TwoChars(c, c2);
[2]	1602	#ifndef PGEN
[391]	1603	if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
	1604	if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
	1605	"<> not supported in 3.x; use !=",
	1606	tok->filename, tok->lineno,
	1607	NULL, NULL)) {
	1608	return ERRORTOKEN;
	1609	}
	1610	}
[2]	1611	#endif
[391]	1612	if (token != OP) {
	1613	int c3 = tok_nextc(tok);
	1614	int token3 = PyToken_ThreeChars(c, c2, c3);
	1615	if (token3 != OP) {
	1616	token = token3;
	1617	} else {
	1618	tok_backup(tok, c3);
	1619	}
	1620	*p_start = tok->start;
	1621	*p_end = tok->cur;
	1622	return token;
	1623	}
	1624	tok_backup(tok, c2);
	1625	}
[2]	1626
[391]	1627	/* Keep track of parentheses nesting level */
	1628	switch (c) {
	1629	case '(':
	1630	case '[':
	1631	case '{':
	1632	tok->level++;
	1633	break;
	1634	case ')':
	1635	case ']':
	1636	case '}':
	1637	tok->level--;
	1638	break;
	1639	}
[2]	1640
[391]	1641	/* Punctuation character */
	1642	*p_start = tok->start;
	1643	*p_end = tok->cur;
	1644	return PyToken_OneChar(c);
[2]	1645	}
	1646
	1647	int
	1648	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
	1649	{
[391]	1650	int result = tok_get(tok, p_start, p_end);
	1651	if (tok->decoding_erred) {
	1652	result = ERRORTOKEN;
	1653	tok->done = E_DECODE;
	1654	}
	1655	return result;
[2]	1656	}
	1657
	1658	/* This function is only called from parsetok. However, it cannot live
	1659	there, as it must be empty for PGEN, and we can check for PGEN only
	1660	in this file. */
	1661
	1662	#if defined(PGEN) \|\| !defined(Py_USING_UNICODE)
	1663	char*
	1664	PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
	1665	{
[391]	1666	return NULL;
[2]	1667	}
	1668	#else
	1669	#ifdef Py_USING_UNICODE
	1670	static PyObject *
	1671	dec_utf8(const char enc, const char text, size_t len) {
[391]	1672	PyObject *ret = NULL;
	1673	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
	1674	if (unicode_text) {
	1675	ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
	1676	Py_DECREF(unicode_text);
	1677	}
	1678	if (!ret) {
	1679	PyErr_Clear();
	1680	}
	1681	return ret;
[2]	1682	}
	1683	char *
	1684	PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
	1685	{
[391]	1686	char *text = NULL;
	1687	if (tok->encoding) {
	1688	/* convert source to original encondig */
	1689	PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
	1690	if (lineobj != NULL) {
	1691	int linelen = PyString_Size(lineobj);
	1692	const char *line = PyString_AsString(lineobj);
	1693	text = PyObject_MALLOC(linelen + 1);
	1694	if (text != NULL && line != NULL) {
	1695	if (linelen)
	1696	strncpy(text, line, linelen);
	1697	text[linelen] = '\0';
	1698	}
	1699	Py_DECREF(lineobj);
[2]	1700
[391]	1701	/* adjust error offset */
	1702	if (*offset > 1) {
	1703	PyObject *offsetobj = dec_utf8(tok->encoding,
	1704	tok->buf, *offset-1);
	1705	if (offsetobj) {
	1706	*offset = PyString_Size(offsetobj) + 1;
	1707	Py_DECREF(offsetobj);
	1708	}
	1709	}
	1710
	1711	}
	1712	}
	1713	return text;
	1714
[2]	1715	}
	1716	#endif /* defined(Py_USING_UNICODE) */
	1717	#endif
	1718
	1719
	1720	#ifdef Py_DEBUG
	1721
	1722	void
	1723	tok_dump(int type, char start, char end)
	1724	{
[391]	1725	printf("%s", _PyParser_TokenNames[type]);
	1726	if (type == NAME \|\| type == NUMBER \|\| type == STRING \|\| type == OP)
	1727	printf("(%.*s)", (int)(end - start), start);
[2]	1728	}
	1729
	1730	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Parser/tokenizer.c

Download in other formats: