Context Navigation

tokenizer.c

Last change on this file was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 35.8 KB

Line
1
2	/* Tokenizer implementation */
3
4	#include "Python.h"
5	#include "pgenheaders.h"
6
7	#include <ctype.h>
8	#include <assert.h>
9
10	#include "tokenizer.h"
11	#include "errcode.h"
12
13	#ifndef PGEN
14	#include "unicodeobject.h"
15	#include "stringobject.h"
16	#include "fileobject.h"
17	#include "codecs.h"
18	#include "abstract.h"
19	#include "pydebug.h"
20	#endif /* PGEN */
21
/* Line reader used for interactive input (see tok_nextc).
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);
26
27	/* Don't ever change this -- it would break the portability of Python code */
28	#define TABSIZE 8
29
30	/* Forward */
31	static struct tok_state *tok_new(void);
32	static int tok_nextc(struct tok_state *tok);
33	static void tok_backup(struct tok_state *tok, int c);
34
/* Token names.
   The order of the entries is significant: see the note before "OP". */

char *_PyParser_TokenNames[] = {
    /* Structure */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Brackets and punctuation */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    /* Single-character operators */
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* Comparisons */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* More operators */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96	/* Create and initialize a new tok_state structure */
97
98	static struct tok_state *
99	tok_new(void)
100	{
101	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
102	sizeof(struct tok_state));
103	if (tok == NULL)
104	return NULL;
105	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106	tok->done = E_OK;
107	tok->fp = NULL;
108	tok->tabsize = TABSIZE;
109	tok->indent = 0;
110	tok->indstack[0] = 0;
111	tok->atbol = 1;
112	tok->pendin = 0;
113	tok->prompt = tok->nextprompt = NULL;
114	tok->lineno = 0;
115	tok->level = 0;
116	tok->filename = NULL;
117	tok->altwarning = 0;
118	tok->alterror = 0;
119	tok->alttabsize = 1;
120	tok->altindstack[0] = 0;
121	tok->decoding_state = 0;
122	tok->decoding_erred = 0;
123	tok->read_coding_spec = 0;
124	tok->encoding = NULL;
125	tok->cont_line = 0;
126	#ifndef PGEN
127	tok->decoding_readline = NULL;
128	tok->decoding_buffer = NULL;
129	#endif
130	return tok;
131	}
132
133	#ifdef PGEN
134
135	static char *
136	decoding_fgets(char s, int size, struct tok_state tok)
137	{
138	return fgets(s, size, tok->fp);
139	}
140
141	static int
142	decoding_feof(struct tok_state *tok)
143	{
144	return feof(tok->fp);
145	}
146
/* PGEN build: no decoding is performed; the input string is returned
   unchanged and tok is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}
152
153	#else /* PGEN */
154
155	static char *
156	error_ret(struct tok_state tok) / XXX */
157	{
158	tok->decoding_erred = 1;
159	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160	PyMem_FREE(tok->buf);
161	tok->buf = NULL;
162	return NULL; /* as if it were EOF */
163	}
164
165	static char *
166	new_string(const char *s, Py_ssize_t len)
167	{
168	char* result = (char *)PyMem_MALLOC(len + 1);
169	if (result != NULL) {
170	memcpy(result, s, len);
171	result[len] = '\0';
172	}
173	return result;
174	}
175
/* Normalize common spellings of utf-8 and latin-1.
   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   the prefix matches one of the known spellings (optionally followed by
   "-<suffix>"); otherwise returns S itself, unmodified. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* <ctype.h> functions require an unsigned char value (or EOF);
               a plain char may be negative for non-ASCII bytes. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
198
199	/* Return the coding spec in S, or NULL if none is found. */
200
201	static char *
202	get_coding_spec(const char *s, Py_ssize_t size)
203	{
204	Py_ssize_t i;
205	/* Coding spec must be in a comment, and that comment must be
206	* the only statement on the source code line. */
207	for (i = 0; i < size - 6; i++) {
208	if (s[i] == '#')
209	break;
210	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211	return NULL;
212	}
213	for (; i < size - 6; i++) { /* XXX inefficient search */
214	const char* t = s + i;
215	if (strncmp(t, "coding", 6) == 0) {
216	const char* begin = NULL;
217	t += 6;
218	if (t[0] != ':' && t[0] != '=')
219	continue;
220	do {
221	t++;
222	} while (t[0] == '\x20' \|\| t[0] == '\t');
223
224	begin = t;
225	while (isalnum(Py_CHARMASK(t[0])) \|\|
226	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
227	t++;
228
229	if (begin < t) {
230	char* r = new_string(begin, t - begin);
231	char* q = get_normal_name(r);
232	if (r != q) {
233	PyMem_FREE(r);
234	r = new_string(q, strlen(q));
235	}
236	return r;
237	}
238	}
239	}
240	return NULL;
241	}
242
243	/* Check whether the line contains a coding spec. If it does,
244	invoke the set_readline function for the new encoding.
245	This function receives the tok_state and the new encoding.
246	Return 1 on success, 0 on failure. */
247
248	static int
249	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
250	int set_readline(struct tok_state , const char ))
251	{
252	char * cs;
253	int r = 1;
254
255	if (tok->cont_line)
256	/* It's a continuation line, so it can't be a coding spec. */
257	return 1;
258	cs = get_coding_spec(line, size);
259	if (cs != NULL) {
260	tok->read_coding_spec = 1;
261	if (tok->encoding == NULL) {
262	assert(tok->decoding_state == 1); /* raw */
263	if (strcmp(cs, "utf-8") == 0 \|\|
264	strcmp(cs, "iso-8859-1") == 0) {
265	tok->encoding = cs;
266	} else {
267	#ifdef Py_USING_UNICODE
268	r = set_readline(tok, cs);
269	if (r) {
270	tok->encoding = cs;
271	tok->decoding_state = -1;
272	}
273	else
274	PyMem_FREE(cs);
275	#else
276	/* Without Unicode support, we cannot
277	process the coding spec. Since there
278	won't be any Unicode literals, that
279	won't matter. */
280	PyMem_FREE(cs);
281	#endif
282	}
283	} else { /* then, compare cs with BOM */
284	r = (strcmp(tok->encoding, cs) == 0);
285	PyMem_FREE(cs);
286	}
287	}
288	if (!r) {
289	cs = tok->encoding;
290	if (!cs)
291	cs = "with BOM";
292	PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
293	}
294	return r;
295	}
296
297	/* See whether the file starts with a BOM. If it does,
298	invoke the set_readline function with the new encoding.
299	Return 1 on success, 0 on failure. */
300
301	static int
302	check_bom(int get_char(struct tok_state *),
303	void unget_char(int, struct tok_state *),
304	int set_readline(struct tok_state , const char ),
305	struct tok_state *tok)
306	{
307	int ch = get_char(tok);
308	tok->decoding_state = 1;
309	if (ch == EOF) {
310	return 1;
311	} else if (ch == 0xEF) {
312	ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313	ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314	#if 0
315	/* Disable support for UTF-16 BOMs until a decision
316	is made whether this needs to be supported. */
317	} else if (ch == 0xFE) {
318	ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319	if (!set_readline(tok, "utf-16-be")) return 0;
320	tok->decoding_state = -1;
321	} else if (ch == 0xFF) {
322	ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323	if (!set_readline(tok, "utf-16-le")) return 0;
324	tok->decoding_state = -1;
325	#endif
326	} else {
327	unget_char(ch, tok);
328	return 1;
329	}
330	if (tok->encoding != NULL)
331	PyMem_FREE(tok->encoding);
332	tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333	return 1;
334	NON_BOM:
335	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336	unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337	return 1;
338	}
339
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.  At most SIZE-1 bytes are stored, followed
   by a terminating NUL.  NULL is also returned when the line read is empty
   (EOF).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Take over the reference held by tok->decoding_buffer */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Does not fit: stash the tail for the next call */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
404
405	/* Set the readline function for TOK to a StreamReader's
406	readline function. The StreamReader is named ENC.
407
408	This function is called from check_bom and check_coding_spec.
409
410	ENC is usually identical to the future value of tok->encoding,
411	except for the (currently unsupported) case of UTF-16.
412
413	Return 1 on success, 0 on failure. */
414
415	static int
416	fp_setreadl(struct tok_state tok, const char enc)
417	{
418	PyObject reader, stream, *readline;
419
420	/* XXX: constify filename argument. */
421	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
422	if (stream == NULL)
423	return 0;
424
425	reader = PyCodec_StreamReader(enc, stream, NULL);
426	Py_DECREF(stream);
427	if (reader == NULL)
428	return 0;
429
430	readline = PyObject_GetAttrString(reader, "readline");
431	Py_DECREF(reader);
432	if (readline == NULL)
433	return 0;
434
435	tok->decoding_readline = readline;
436	return 1;
437	}
438
439	/* Fetch the next byte from TOK. */
440
441	static int fp_getc(struct tok_state *tok) {
442	return getc(tok->fp);
443	}
444
445	/* Unfetch the last byte back into TOK. */
446
447	static void fp_ungetc(int c, struct tok_state *tok) {
448	ungetc(c, tok->fp);
449	}
450
451	/* Read a line of input from TOK. Determine encoding
452	if necessary. */
453
454	static char *
455	decoding_fgets(char s, int size, struct tok_state tok)
456	{
457	char *line = NULL;
458	int badchar = 0;
459	for (;;) {
460	if (tok->decoding_state < 0) {
461	/* We already have a codec associated with
462	this input. */
463	line = fp_readl(s, size, tok);
464	break;
465	} else if (tok->decoding_state > 0) {
466	/* We want a 'raw' read. */
467	line = Py_UniversalNewlineFgets(s, size,
468	tok->fp, NULL);
469	break;
470	} else {
471	/* We have not yet determined the encoding.
472	If an encoding is found, use the file-pointer
473	reader functions from now on. */
474	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475	return error_ret(tok);
476	assert(tok->decoding_state != 0);
477	}
478	}
479	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481	return error_ret(tok);
482	}
483	}
484	#ifndef PGEN
485	/* The default encoding is ASCII, so make sure we don't have any
486	non-ASCII bytes in it. */
487	if (line && !tok->encoding) {
488	unsigned char *c;
489	for (c = (unsigned char )line; c; c++)
490	if (*c > 127) {
491	badchar = *c;
492	break;
493	}
494	}
495	if (badchar) {
496	char buf[500];
497	/* Need to add 1 to the line number, since this line
498	has not been counted, yet. */
499	sprintf(buf,
500	"Non-ASCII character '\\x%.2x' "
501	"in file %.200s on line %i, "
502	"but no encoding declared; "
503	"see http://www.python.org/peps/pep-0263.html for details",
504	badchar, tok->filename, tok->lineno + 1);
505	PyErr_SetString(PyExc_SyntaxError, buf);
506	return error_ret(tok);
507	}
508	#endif
509	return line;
510	}
511
512	static int
513	decoding_feof(struct tok_state *tok)
514	{
515	if (tok->decoding_state >= 0) {
516	return feof(tok->fp);
517	} else {
518	PyObject* buf = tok->decoding_buffer;
519	if (buf == NULL) {
520	buf = PyObject_CallObject(tok->decoding_readline, NULL);
521	if (buf == NULL) {
522	error_ret(tok);
523	return 1;
524	} else {
525	tok->decoding_buffer = buf;
526	}
527	}
528	return PyObject_Length(buf) == 0;
529	}
530	}
531
532	/* Fetch a byte from TOK, using the string buffer. */
533
534	static int
535	buf_getc(struct tok_state *tok) {
536	return Py_CHARMASK(*tok->str++);
537	}
538
539	/* Unfetch a byte from TOK, using the string buffer. */
540
541	static void
542	buf_ungetc(int c, struct tok_state *tok) {
543	tok->str--;
544	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
545	}
546
547	/* Set the readline function for TOK to ENC. For the string-based
548	tokenizer, this means to just record the encoding. */
549
550	static int
551	buf_setreadl(struct tok_state tok, const char enc) {
552	tok->enc = enc;
553	return 1;
554	}
555
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns NULL if either the decode or the re-encode step fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *decoded;
    PyObject *encoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
571
572	/* Decode a byte string STR for use as the buffer of TOK.
573	Look for encoding declarations inside STR, and record them
574	inside TOK. */
575
576	static const char *
577	decode_str(const char str, struct tok_state tok)
578	{
579	PyObject* utf8 = NULL;
580	const char *s;
581	const char *newl[2] = {NULL, NULL};
582	int lineno = 0;
583	tok->enc = NULL;
584	tok->str = str;
585	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
586	return error_ret(tok);
587	str = tok->str; /* string after BOM if any */
588	assert(str);
589	#ifdef Py_USING_UNICODE
590	if (tok->enc != NULL) {
591	utf8 = translate_into_utf8(str, tok->enc);
592	if (utf8 == NULL)
593	return error_ret(tok);
594	str = PyString_AsString(utf8);
595	}
596	#endif
597	for (s = str;; s++) {
598	if (*s == '\0') break;
599	else if (*s == '\n') {
600	assert(lineno < 2);
601	newl[lineno] = s;
602	lineno++;
603	if (lineno == 2) break;
604	}
605	}
606	tok->enc = NULL;
607	/* need to check line 1 and 2 separately since check_coding_spec
608	assumes a single line as input */
609	if (newl[0]) {
610	if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611	return error_ret(tok);
612	if (tok->enc == NULL && newl[1]) {
613	if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614	tok, buf_setreadl))
615	return error_ret(tok);
616	}
617	}
618	#ifdef Py_USING_UNICODE
619	if (tok->enc != NULL) {
620	assert(utf8 == NULL);
621	utf8 = translate_into_utf8(str, tok->enc);
622	if (utf8 == NULL) {
623	PyErr_Format(PyExc_SyntaxError,
624	"unknown encoding: %s", tok->enc);
625	return error_ret(tok);
626	}
627	str = PyString_AsString(utf8);
628	}
629	#endif
630	assert(tok->decoding_buffer == NULL);
631	tok->decoding_buffer = utf8; /* CAUTION */
632	return str;
633	}
634
635	#endif /* PGEN */
636
637	/* Set up tokenizer for string */
638
639	struct tok_state *
640	PyTokenizer_FromString(const char *str)
641	{
642	struct tok_state *tok = tok_new();
643	if (tok == NULL)
644	return NULL;
645	str = (char *)decode_str(str, tok);
646	if (str == NULL) {
647	PyTokenizer_Free(tok);
648	return NULL;
649	}
650
651	/* XXX: constify members. */
652	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
653	return tok;
654	}
655
656
657	/* Set up tokenizer for file */
658
659	struct tok_state *
660	PyTokenizer_FromFile(FILE fp, char ps1, char *ps2)
661	{
662	struct tok_state *tok = tok_new();
663	if (tok == NULL)
664	return NULL;
665	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
666	PyTokenizer_Free(tok);
667	return NULL;
668	}
669	tok->cur = tok->inp = tok->buf;
670	tok->end = tok->buf + BUFSIZ;
671	tok->fp = fp;
672	tok->prompt = ps1;
673	tok->nextprompt = ps2;
674	return tok;
675	}
676
677
678	/* Free a tok_state structure */
679
680	void
681	PyTokenizer_Free(struct tok_state *tok)
682	{
683	if (tok->encoding != NULL)
684	PyMem_FREE(tok->encoding);
685	#ifndef PGEN
686	Py_XDECREF(tok->decoding_readline);
687	Py_XDECREF(tok->decoding_buffer);
688	#endif
689	if (tok->fp != NULL && tok->buf != NULL)
690	PyMem_FREE(tok->buf);
691	PyMem_FREE(tok);
692	}
693
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactively read line as UTF-8 using sys.stdin's encoding.
   *INP is a PyMem-allocated line; on a successful conversion it is freed and
   replaced by a new PyMem-allocated UTF-8 copy, and tok->encoding is set to
   a copy of the stdin encoding name.
   Returns 0 on success and also when no conversion is done (sys.stdin is not
   the C stdin, has no string encoding, or decoding/encoding failed, in which
   case the error is cleared).  Returns -1 with tok->done = E_NOMEM when
   memory runs out. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif
753
754	/* Get next char, updating state; error code goes into tok->done */
755
756	static int
757	tok_nextc(register struct tok_state *tok)
758	{
759	for (;;) {
760	if (tok->cur != tok->inp) {
761	return Py_CHARMASK(tok->cur++); / Fast path */
762	}
763	if (tok->done != E_OK)
764	return EOF;
765	if (tok->fp == NULL) {
766	char *end = strchr(tok->inp, '\n');
767	if (end != NULL)
768	end++;
769	else {
770	end = strchr(tok->inp, '\0');
771	if (end == tok->inp) {
772	tok->done = E_EOF;
773	return EOF;
774	}
775	}
776	if (tok->start == NULL)
777	tok->buf = tok->cur;
778	tok->line_start = tok->cur;
779	tok->lineno++;
780	tok->inp = end;
781	return Py_CHARMASK(*tok->cur++);
782	}
783	if (tok->prompt != NULL) {
784	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
785	if (tok->nextprompt != NULL)
786	tok->prompt = tok->nextprompt;
787	if (newtok == NULL)
788	tok->done = E_INTR;
789	else if (*newtok == '\0') {
790	PyMem_FREE(newtok);
791	tok->done = E_EOF;
792	}
793	#if !defined(PGEN) && defined(Py_USING_UNICODE)
794	else if (tok_stdin_decode(tok, &newtok) != 0)
795	PyMem_FREE(newtok);
796	#endif
797	else if (tok->start != NULL) {
798	size_t start = tok->start - tok->buf;
799	size_t oldlen = tok->cur - tok->buf;
800	size_t newlen = oldlen + strlen(newtok);
801	char *buf = tok->buf;
802	buf = (char *)PyMem_REALLOC(buf, newlen+1);
803	tok->lineno++;
804	if (buf == NULL) {
805	PyMem_FREE(tok->buf);
806	tok->buf = NULL;
807	PyMem_FREE(newtok);
808	tok->done = E_NOMEM;
809	return EOF;
810	}
811	tok->buf = buf;
812	tok->cur = tok->buf + oldlen;
813	tok->line_start = tok->cur;
814	strcpy(tok->buf + oldlen, newtok);
815	PyMem_FREE(newtok);
816	tok->inp = tok->buf + newlen;
817	tok->end = tok->inp + 1;
818	tok->start = tok->buf + start;
819	}
820	else {
821	tok->lineno++;
822	if (tok->buf != NULL)
823	PyMem_FREE(tok->buf);
824	tok->buf = newtok;
825	tok->line_start = tok->buf;
826	tok->cur = tok->buf;
827	tok->line_start = tok->buf;
828	tok->inp = strchr(tok->buf, '\0');
829	tok->end = tok->inp + 1;
830	}
831	}
832	else {
833	int done = 0;
834	Py_ssize_t cur = 0;
835	char *pt;
836	if (tok->start == NULL) {
837	if (tok->buf == NULL) {
838	tok->buf = (char *)
839	PyMem_MALLOC(BUFSIZ);
840	if (tok->buf == NULL) {
841	tok->done = E_NOMEM;
842	return EOF;
843	}
844	tok->end = tok->buf + BUFSIZ;
845	}
846	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
847	tok) == NULL) {
848	tok->done = E_EOF;
849	done = 1;
850	}
851	else {
852	tok->done = E_OK;
853	tok->inp = strchr(tok->buf, '\0');
854	done = tok->inp[-1] == '\n';
855	}
856	}
857	else {
858	cur = tok->cur - tok->buf;
859	if (decoding_feof(tok)) {
860	tok->done = E_EOF;
861	done = 1;
862	}
863	else
864	tok->done = E_OK;
865	}
866	tok->lineno++;
867	/* Read until '\n' or EOF */
868	while (!done) {
869	Py_ssize_t curstart = tok->start == NULL ? -1 :
870	tok->start - tok->buf;
871	Py_ssize_t curvalid = tok->inp - tok->buf;
872	Py_ssize_t newsize = curvalid + BUFSIZ;
873	char *newbuf = tok->buf;
874	newbuf = (char *)PyMem_REALLOC(newbuf,
875	newsize);
876	if (newbuf == NULL) {
877	tok->done = E_NOMEM;
878	tok->cur = tok->inp;
879	return EOF;
880	}
881	tok->buf = newbuf;
882	tok->inp = tok->buf + curvalid;
883	tok->end = tok->buf + newsize;
884	tok->start = curstart < 0 ? NULL :
885	tok->buf + curstart;
886	if (decoding_fgets(tok->inp,
887	(int)(tok->end - tok->inp),
888	tok) == NULL) {
889	/* Break out early on decoding
890	errors, as tok->buf will be NULL
891	*/
892	if (tok->decoding_erred)
893	return EOF;
894	/* Last line does not end in \n,
895	fake one */
896	strcpy(tok->inp, "\n");
897	}
898	tok->inp = strchr(tok->inp, '\0');
899	done = tok->inp[-1] == '\n';
900	}
901	if (tok->buf != NULL) {
902	tok->cur = tok->buf + cur;
903	tok->line_start = tok->cur;
904	/* replace "\r\n" with "\n" */
905	/* For Mac leave the \r, giving a syntax error */
906	pt = tok->inp - 2;
907	if (pt >= tok->buf && *pt == '\r') {
908	*pt++ = '\n';
909	*pt = '\0';
910	tok->inp = pt;
911	}
912	}
913	}
914	if (tok->done != E_OK) {
915	if (tok->prompt != NULL)
916	PySys_WriteStderr("\n");
917	tok->cur = tok->inp;
918	return EOF;
919	}
920	}
921	/NOTREACHED/
922	}
923
924
925	/* Back-up one character */
926
927	static void
928	tok_backup(register struct tok_state *tok, register int c)
929	{
930	if (c != EOF) {
931	if (--tok->cur < tok->buf)
932	Py_FatalError("tok_backup: begin of buffer");
933	if (*tok->cur != c)
934	*tok->cur = c;
935	}
936	}
937
938
939	/* Return the token corresponding to a single character */
940
941	int
942	PyToken_OneChar(int c)
943	{
944	switch (c) {
945	case '(': return LPAR;
946	case ')': return RPAR;
947	case '[': return LSQB;
948	case ']': return RSQB;
949	case ':': return COLON;
950	case ',': return COMMA;
951	case ';': return SEMI;
952	case '+': return PLUS;
953	case '-': return MINUS;
954	case '*': return STAR;
955	case '/': return SLASH;
956	case '\|': return VBAR;
957	case '&': return AMPER;
958	case '<': return LESS;
959	case '>': return GREATER;
960	case '=': return EQUAL;
961	case '.': return DOT;
962	case '%': return PERCENT;
963	case '`': return BACKQUOTE;
964	case '{': return LBRACE;
965	case '}': return RBRACE;
966	case '^': return CIRCUMFLEX;
967	case '~': return TILDE;
968	case '@': return AT;
969	default: return OP;
970	}
971	}
972
973
974	int
975	PyToken_TwoChars(int c1, int c2)
976	{
977	switch (c1) {
978	case '=':
979	switch (c2) {
980	case '=': return EQEQUAL;
981	}
982	break;
983	case '!':
984	switch (c2) {
985	case '=': return NOTEQUAL;
986	}
987	break;
988	case '<':
989	switch (c2) {
990	case '>': return NOTEQUAL;
991	case '=': return LESSEQUAL;
992	case '<': return LEFTSHIFT;
993	}
994	break;
995	case '>':
996	switch (c2) {
997	case '=': return GREATEREQUAL;
998	case '>': return RIGHTSHIFT;
999	}
1000	break;
1001	case '+':
1002	switch (c2) {
1003	case '=': return PLUSEQUAL;
1004	}
1005	break;
1006	case '-':
1007	switch (c2) {
1008	case '=': return MINEQUAL;
1009	}
1010	break;
1011	case '*':
1012	switch (c2) {
1013	case '*': return DOUBLESTAR;
1014	case '=': return STAREQUAL;
1015	}
1016	break;
1017	case '/':
1018	switch (c2) {
1019	case '/': return DOUBLESLASH;
1020	case '=': return SLASHEQUAL;
1021	}
1022	break;
1023	case '\|':
1024	switch (c2) {
1025	case '=': return VBAREQUAL;
1026	}
1027	break;
1028	case '%':
1029	switch (c2) {
1030	case '=': return PERCENTEQUAL;
1031	}
1032	break;
1033	case '&':
1034	switch (c2) {
1035	case '=': return AMPEREQUAL;
1036	}
1037	break;
1038	case '^':
1039	switch (c2) {
1040	case '=': return CIRCUMFLEXEQUAL;
1041	}
1042	break;
1043	}
1044	return OP;
1045	}
1046
1047	int
1048	PyToken_ThreeChars(int c1, int c2, int c3)
1049	{
1050	switch (c1) {
1051	case '<':
1052	switch (c2) {
1053	case '<':
1054	switch (c3) {
1055	case '=':
1056	return LEFTSHIFTEQUAL;
1057	}
1058	break;
1059	}
1060	break;
1061	case '>':
1062	switch (c2) {
1063	case '>':
1064	switch (c3) {
1065	case '=':
1066	return RIGHTSHIFTEQUAL;
1067	}
1068	break;
1069	}
1070	break;
1071	case '*':
1072	switch (c2) {
1073	case '*':
1074	switch (c3) {
1075	case '=':
1076	return DOUBLESTAREQUAL;
1077	}
1078	break;
1079	}
1080	break;
1081	case '/':
1082	switch (c2) {
1083	case '/':
1084	switch (c3) {
1085	case '=':
1086	return DOUBLESLASHEQUAL;
1087	}
1088	break;
1089	}
1090	break;
1091	}
1092	return OP;
1093	}
1094
1095	static int
1096	indenterror(struct tok_state *tok)
1097	{
1098	if (tok->alterror) {
1099	tok->done = E_TABSPACE;
1100	tok->cur = tok->inp;
1101	return 1;
1102	}
1103	if (tok->altwarning) {
1104	PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1105	"in indentation\n", tok->filename);
1106	tok->altwarning = 0;
1107	}
1108	return 0;
1109	}
1110
1111
1112	/* Get next token, after space stripping etc. */
1113
1114	static int
1115	tok_get(register struct tok_state tok, char p_start, char *p_end)
1116	{
1117	register int c;
1118	int blankline;
1119
1120	p_start = p_end = NULL;
1121	nextline:
1122	tok->start = NULL;
1123	blankline = 0;
1124
1125	/* Get indentation level */
1126	if (tok->atbol) {
1127	register int col = 0;
1128	register int altcol = 0;
1129	tok->atbol = 0;
1130	for (;;) {
1131	c = tok_nextc(tok);
1132	if (c == ' ')
1133	col++, altcol++;
1134	else if (c == '\t') {
1135	col = (col/tok->tabsize + 1) * tok->tabsize;
1136	altcol = (altcol/tok->alttabsize + 1)
1137	* tok->alttabsize;
1138	}
1139	else if (c == '\014') /* Control-L (formfeed) */
1140	col = altcol = 0; /* For Emacs users */
1141	else
1142	break;
1143	}
1144	tok_backup(tok, c);
1145	if (c == '#' \|\| c == '\n') {
1146	/* Lines with only whitespace and/or comments
1147	shouldn't affect the indentation and are
1148	not passed to the parser as NEWLINE tokens,
1149	except totally empty lines in interactive
1150	mode, which signal the end of a command group. */
1151	if (col == 0 && c == '\n' && tok->prompt != NULL)
1152	blankline = 0; /* Let it through */
1153	else
1154	blankline = 1; /* Ignore completely */
1155	/* We can't jump back right here since we still
1156	may need to skip to the end of a comment */
1157	}
1158	if (!blankline && tok->level == 0) {
1159	if (col == tok->indstack[tok->indent]) {
1160	/* No change */
1161	if (altcol != tok->altindstack[tok->indent]) {
1162	if (indenterror(tok))
1163	return ERRORTOKEN;
1164	}
1165	}
1166	else if (col > tok->indstack[tok->indent]) {
1167	/* Indent -- always one */
1168	if (tok->indent+1 >= MAXINDENT) {
1169	tok->done = E_TOODEEP;
1170	tok->cur = tok->inp;
1171	return ERRORTOKEN;
1172	}
1173	if (altcol <= tok->altindstack[tok->indent]) {
1174	if (indenterror(tok))
1175	return ERRORTOKEN;
1176	}
1177	tok->pendin++;
1178	tok->indstack[++tok->indent] = col;
1179	tok->altindstack[tok->indent] = altcol;
1180	}
1181	else /* col < tok->indstack[tok->indent] */ {
1182	/* Dedent -- any number, must be consistent */
1183	while (tok->indent > 0 &&
1184	col < tok->indstack[tok->indent]) {
1185	tok->pendin--;
1186	tok->indent--;
1187	}
1188	if (col != tok->indstack[tok->indent]) {
1189	tok->done = E_DEDENT;
1190	tok->cur = tok->inp;
1191	return ERRORTOKEN;
1192	}
1193	if (altcol != tok->altindstack[tok->indent]) {
1194	if (indenterror(tok))
1195	return ERRORTOKEN;
1196	}
1197	}
1198	}
1199	}
1200
1201	tok->start = tok->cur;
1202
1203	/* Return pending indents/dedents */
1204	if (tok->pendin != 0) {
1205	if (tok->pendin < 0) {
1206	tok->pendin++;
1207	return DEDENT;
1208	}
1209	else {
1210	tok->pendin--;
1211	return INDENT;
1212	}
1213	}
1214
1215	again:
1216	tok->start = NULL;
1217	/* Skip spaces */
1218	do {
1219	c = tok_nextc(tok);
1220	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
1221
1222	/* Set start of current token */
1223	tok->start = tok->cur - 1;
1224
1225	/* Skip comment, while looking for tab-setting magic */
1226	if (c == '#') {
1227	static char *tabforms[] = {
1228	"tab-width:", /* Emacs */
1229	":tabstop=", /* vim, full form */
1230	":ts=", /* vim, abbreviated form */
1231	"set tabsize=", /* will vi never die? */
1232	/* more templates can be added here to support other editors */
1233	};
1234	char cbuf[80];
1235	char tp, *cp;
1236	tp = cbuf;
1237	do {
1238	*tp++ = c = tok_nextc(tok);
1239	} while (c != EOF && c != '\n' &&
1240	(size_t)(tp - cbuf + 1) < sizeof(cbuf));
1241	*tp = '\0';
1242	for (cp = tabforms;
1243	cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1244	cp++) {
1245	if ((tp = strstr(cbuf, *cp))) {
1246	int newsize = atoi(tp + strlen(*cp));
1247
1248	if (newsize >= 1 && newsize <= 40) {
1249	tok->tabsize = newsize;
1250	if (Py_VerboseFlag)
1251	PySys_WriteStderr(
1252	"Tab size set to %d\n",
1253	newsize);
1254	}
1255	}
1256	}
1257	while (c != EOF && c != '\n')
1258	c = tok_nextc(tok);
1259	}
1260
1261	/* Check for EOF and errors now */
1262	if (c == EOF) {
1263	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1264	}
1265
1266	/* Identifier (most frequent token!) */
1267	if (isalpha(c) \|\| c == '_') {
1268	/* Process r"", u"" and ur"" */
1269	switch (c) {
1270	case 'b':
1271	case 'B':
1272	c = tok_nextc(tok);
1273	if (c == 'r' \|\| c == 'R')
1274	c = tok_nextc(tok);
1275	if (c == '"' \|\| c == '\'')
1276	goto letter_quote;
1277	break;
1278	case 'r':
1279	case 'R':
1280	c = tok_nextc(tok);
1281	if (c == '"' \|\| c == '\'')
1282	goto letter_quote;
1283	break;
1284	case 'u':
1285	case 'U':
1286	c = tok_nextc(tok);
1287	if (c == 'r' \|\| c == 'R')
1288	c = tok_nextc(tok);
1289	if (c == '"' \|\| c == '\'')
1290	goto letter_quote;
1291	break;
1292	}
1293	while (isalnum(c) \|\| c == '_') {
1294	c = tok_nextc(tok);
1295	}
1296	tok_backup(tok, c);
1297	*p_start = tok->start;
1298	*p_end = tok->cur;
1299	return NAME;
1300	}
1301
1302	/* Newline */
1303	if (c == '\n') {
1304	tok->atbol = 1;
1305	if (blankline \|\| tok->level > 0)
1306	goto nextline;
1307	*p_start = tok->start;
1308	p_end = tok->cur - 1; / Leave '\n' out of the string */
1309	tok->cont_line = 0;
1310	return NEWLINE;
1311	}
1312
1313	/* Period or number starting with period? */
1314	if (c == '.') {
1315	c = tok_nextc(tok);
1316	if (isdigit(c)) {
1317	goto fraction;
1318	}
1319	else {
1320	tok_backup(tok, c);
1321	*p_start = tok->start;
1322	*p_end = tok->cur;
1323	return DOT;
1324	}
1325	}
1326
1327	/* Number */
1328	if (isdigit(c)) {
1329	if (c == '0') {
1330	/* Hex, octal or binary -- maybe. */
1331	c = tok_nextc(tok);
1332	if (c == '.')
1333	goto fraction;
1334	#ifndef WITHOUT_COMPLEX
1335	if (c == 'j' \|\| c == 'J')
1336	goto imaginary;
1337	#endif
1338	if (c == 'x' \|\| c == 'X') {
1339
1340	/* Hex */
1341	c = tok_nextc(tok);
1342	if (!isxdigit(c)) {
1343	tok->done = E_TOKEN;
1344	tok_backup(tok, c);
1345	return ERRORTOKEN;
1346	}
1347	do {
1348	c = tok_nextc(tok);
1349	} while (isxdigit(c));
1350	}
1351	else if (c == 'o' \|\| c == 'O') {
1352	/* Octal */
1353	c = tok_nextc(tok);
1354	if (c < '0' \|\| c >= '8') {
1355	tok->done = E_TOKEN;
1356	tok_backup(tok, c);
1357	return ERRORTOKEN;
1358	}
1359	do {
1360	c = tok_nextc(tok);
1361	} while ('0' <= c && c < '8');
1362	}
1363	else if (c == 'b' \|\| c == 'B') {
1364	/* Binary */
1365	c = tok_nextc(tok);
1366	if (c != '0' && c != '1') {
1367	tok->done = E_TOKEN;
1368	tok_backup(tok, c);
1369	return ERRORTOKEN;
1370	}
1371	do {
1372	c = tok_nextc(tok);
1373	} while (c == '0' \|\| c == '1');
1374	}
1375	else {
1376	int found_decimal = 0;
1377	/* Octal; c is first char of it */
1378	/* There's no 'isoctdigit' macro, sigh */
1379	while ('0' <= c && c < '8') {
1380	c = tok_nextc(tok);
1381	}
1382	if (isdigit(c)) {
1383	found_decimal = 1;
1384	do {
1385	c = tok_nextc(tok);
1386	} while (isdigit(c));
1387	}
1388	if (c == '.')
1389	goto fraction;
1390	else if (c == 'e' \|\| c == 'E')
1391	goto exponent;
1392	#ifndef WITHOUT_COMPLEX
1393	else if (c == 'j' \|\| c == 'J')
1394	goto imaginary;
1395	#endif
1396	else if (found_decimal) {
1397	tok->done = E_TOKEN;
1398	tok_backup(tok, c);
1399	return ERRORTOKEN;
1400	}
1401	}
1402	if (c == 'l' \|\| c == 'L')
1403	c = tok_nextc(tok);
1404	}
1405	else {
1406	/* Decimal */
1407	do {
1408	c = tok_nextc(tok);
1409	} while (isdigit(c));
1410	if (c == 'l' \|\| c == 'L')
1411	c = tok_nextc(tok);
1412	else {
1413	/* Accept floating point numbers. */
1414	if (c == '.') {
1415	fraction:
1416	/* Fraction */
1417	do {
1418	c = tok_nextc(tok);
1419	} while (isdigit(c));
1420	}
1421	if (c == 'e' \|\| c == 'E') {
1422	exponent:
1423	/* Exponent part */
1424	c = tok_nextc(tok);
1425	if (c == '+' \|\| c == '-')
1426	c = tok_nextc(tok);
1427	if (!isdigit(c)) {
1428	tok->done = E_TOKEN;
1429	tok_backup(tok, c);
1430	return ERRORTOKEN;
1431	}
1432	do {
1433	c = tok_nextc(tok);
1434	} while (isdigit(c));
1435	}
1436	#ifndef WITHOUT_COMPLEX
1437	if (c == 'j' \|\| c == 'J')
1438	/* Imaginary part */
1439	imaginary:
1440	c = tok_nextc(tok);
1441	#endif
1442	}
1443	}
1444	tok_backup(tok, c);
1445	*p_start = tok->start;
1446	*p_end = tok->cur;
1447	return NUMBER;
1448	}
1449
1450	letter_quote:
1451	/* String */
1452	if (c == '\'' \|\| c == '"') {
1453	Py_ssize_t quote2 = tok->cur - tok->start + 1;
1454	int quote = c;
1455	int triple = 0;
1456	int tripcount = 0;
1457	for (;;) {
1458	c = tok_nextc(tok);
1459	if (c == '\n') {
1460	if (!triple) {
1461	tok->done = E_EOLS;
1462	tok_backup(tok, c);
1463	return ERRORTOKEN;
1464	}
1465	tripcount = 0;
1466	tok->cont_line = 1; /* multiline string. */
1467	}
1468	else if (c == EOF) {
1469	if (triple)
1470	tok->done = E_EOFS;
1471	else
1472	tok->done = E_EOLS;
1473	tok->cur = tok->inp;
1474	return ERRORTOKEN;
1475	}
1476	else if (c == quote) {
1477	tripcount++;
1478	if (tok->cur - tok->start == quote2) {
1479	c = tok_nextc(tok);
1480	if (c == quote) {
1481	triple = 1;
1482	tripcount = 0;
1483	continue;
1484	}
1485	tok_backup(tok, c);
1486	}
1487	if (!triple \|\| tripcount == 3)
1488	break;
1489	}
1490	else if (c == '\\') {
1491	tripcount = 0;
1492	c = tok_nextc(tok);
1493	if (c == EOF) {
1494	tok->done = E_EOLS;
1495	tok->cur = tok->inp;
1496	return ERRORTOKEN;
1497	}
1498	}
1499	else
1500	tripcount = 0;
1501	}
1502	*p_start = tok->start;
1503	*p_end = tok->cur;
1504	return STRING;
1505	}
1506
1507	/* Line continuation */
1508	if (c == '\\') {
1509	c = tok_nextc(tok);
1510	if (c != '\n') {
1511	tok->done = E_LINECONT;
1512	tok->cur = tok->inp;
1513	return ERRORTOKEN;
1514	}
1515	tok->cont_line = 1;
1516	goto again; /* Read next line */
1517	}
1518
1519	/* Check for two-character token */
1520	{
1521	int c2 = tok_nextc(tok);
1522	int token = PyToken_TwoChars(c, c2);
1523	#ifndef PGEN
1524	if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1525	if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1526	"<> not supported in 3.x; use !=",
1527	tok->filename, tok->lineno,
1528	NULL, NULL)) {
1529	return ERRORTOKEN;
1530	}
1531	}
1532	#endif
1533	if (token != OP) {
1534	int c3 = tok_nextc(tok);
1535	int token3 = PyToken_ThreeChars(c, c2, c3);
1536	if (token3 != OP) {
1537	token = token3;
1538	} else {
1539	tok_backup(tok, c3);
1540	}
1541	*p_start = tok->start;
1542	*p_end = tok->cur;
1543	return token;
1544	}
1545	tok_backup(tok, c2);
1546	}
1547
1548	/* Keep track of parentheses nesting level */
1549	switch (c) {
1550	case '(':
1551	case '[':
1552	case '{':
1553	tok->level++;
1554	break;
1555	case ')':
1556	case ']':
1557	case '}':
1558	tok->level--;
1559	break;
1560	}
1561
1562	/* Punctuation character */
1563	*p_start = tok->start;
1564	*p_end = tok->cur;
1565	return PyToken_OneChar(c);
1566	}
1567
1568	int
1569	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
1570	{
1571	int result = tok_get(tok, p_start, p_end);
1572	if (tok->decoding_erred) {
1573	result = ERRORTOKEN;
1574	tok->done = E_DECODE;
1575	}
1576	return result;
1577	}
1578
1579	/* This function is only called from parsetok. However, it cannot live
1580	there, as it must be empty for PGEN, and we can check for PGEN only
1581	in this file. */
1582
#if defined(PGEN) || !defined(Py_USING_UNICODE)

/* Stub for PGEN builds and for builds without Unicode support: no
 * re-encoding is attempted.  Always returns NULL and never touches
 * *offset.
 */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
	return NULL;
}

#else /* !defined(PGEN) && defined(Py_USING_UNICODE) */

/* Decode the first `len` bytes of `text` as UTF-8 and re-encode the result
 * in the encoding named `enc`.  Both steps use the "replace" error handler.
 *
 * Returns a new reference to the encoded string object, or NULL on
 * failure.  On failure the pending Python exception is cleared, so the
 * caller only has to check for NULL.
 */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len)
{
	PyObject *ret = NULL;
	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");

	if (unicode_text) {
		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
		Py_DECREF(unicode_text);
	}
	if (!ret) {
		PyErr_Clear();
	}
	return ret;
}

/* Convert the first `len` bytes of tok->buf (decoded as UTF-8) back to
 * the source encoding recorded in tok->encoding.
 *
 * tok     tokenizer state; nothing is done when tok->encoding is NULL.
 * len     number of bytes of tok->buf to convert.
 * offset  in/out error column.  When the conversion succeeds and
 *         *offset > 1, the first *offset - 1 bytes of tok->buf are
 *         converted as well and *offset is replaced by the length of
 *         that converted prefix plus one.
 *
 * Returns a NUL-terminated buffer obtained from PyObject_MALLOC holding
 * the converted text, or NULL when there is no encoding, the conversion
 * fails, or memory cannot be allocated.
 * NOTE(review): the caller presumably releases the buffer with
 * PyObject_FREE -- confirm against parsetok.c.
 */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
	char *text = NULL;
	PyObject *lineobj;
	PyObject *offsetobj;
	const char *line;
	Py_ssize_t linelen;

	if (!tok->encoding)
		return NULL;

	/* convert source to original encoding */
	lineobj = dec_utf8(tok->encoding, tok->buf, len);
	if (lineobj == NULL)
		return NULL;

	line = PyString_AsString(lineobj);
	linelen = PyString_Size(lineobj);
	if (line != NULL && linelen >= 0) {
		/* Allocate only once the encoded bytes are known to be
		   available, so an uninitialised buffer is never returned. */
		text = PyObject_MALLOC((size_t)linelen + 1);
		if (text != NULL) {
			if (linelen)
				memcpy(text, line, (size_t)linelen);
			text[linelen] = '\0';
		}
	}
	else {
		/* Best effort, like dec_utf8(): do not leave an exception
		   pending when we simply report "no text". */
		PyErr_Clear();
	}
	Py_DECREF(lineobj);

	/* adjust error offset */
	if (*offset > 1) {
		offsetobj = dec_utf8(tok->encoding, tok->buf, *offset - 1);
		if (offsetobj) {
			*offset = (int)PyString_Size(offsetobj) + 1;
			Py_DECREF(offsetobj);
		}
	}

	return text;
}

#endif /* defined(PGEN) || !defined(Py_USING_UNICODE) */
1639
1640
1641	#ifdef Py_DEBUG
1642
1643	void
1644	tok_dump(int type, char start, char end)
1645	{
1646	printf("%s", _PyParser_TokenNames[type]);
1647	if (type == NAME \|\| type == NUMBER \|\| type == STRING \|\| type == OP)
1648	printf("(%.*s)", (int)(end - start), start);
1649	}
1650
1651	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.6.5/Parser/tokenizer.c

Download in other formats: