source: python/vendor/Python-2.6.5/Parser/tokenizer.c

Last change on this file was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 35.8 KB
Line 
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#include "pydebug.h"
20#endif /* PGEN */
21
22extern char *PyOS_Readline(FILE *, FILE *, char *);
23/* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
26
27/* Don't ever change this -- it would break the portability of Python code */
28#define TABSIZE 8
29
30/* Forward */
31static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
34
35/* Token names */
36
/* Token names, indexed by token number.  The order and contents of this
   table must match the token #defines in token.h exactly, since token
   numbers are used as indices into it. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94
95
96/* Create and initialize a new tok_state structure */
97
/* Allocate a tok_state with PyMem_MALLOC and initialize every field to
   its neutral value.  Returns NULL on allocation failure; the caller
   owns the result and must release it with PyTokenizer_Free. */
static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    /* No input buffer yet; PyTokenizer_FromString/FromFile set these. */
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;              /* index of top of indentation stack */
    tok->indstack[0] = 0;         /* column 0 is the base indent level */
    tok->atbol = 1;               /* we start at the beginning of a line */
    tok->pendin = 0;              /* no pending INDENT/DEDENT tokens */
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;               /* parenthesis nesting level */
    tok->filename = NULL;
    /* Alternate tab-size bookkeeping, used to detect inconsistent
       tab/space indentation (see indenterror). */
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    /* Encoding detection state: 0 = undetermined, >0 = raw bytes,
       <0 = decoding via tok->decoding_readline. */
    tok->decoding_state = 0;
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;           /* not inside a continuation line */
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}
132
133#ifdef PGEN
134
/* pgen build: read raw bytes with fgets; no encoding support exists. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}
140
/* pgen build: end-of-file test is just feof on the raw stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}
146
/* pgen build: no decoding; the input string is used verbatim. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}
152
153#else /* PGEN */
154
/* Record that decoding failed and make the tokenizer behave as if EOF
   had been reached.  Frees tok->buf only when reading from a file,
   because that is the case in which the tokenizer owns the buffer
   (see PyTokenizer_Free).  Always returns NULL so callers can
   propagate the failure directly. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL; /* as if it were EOF */
}
164
165static char *
166new_string(const char *s, Py_ssize_t len)
167{
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
172 }
173 return result;
174}
175
/* Normalize an encoding name: map the common spellings of utf-8 and
   latin-1 (underscores for dashes, any case, optional suffix after a
   dash) onto the canonical "utf-8" / "iso-8859-1".  Any other name is
   returned unchanged.  S must be NUL-terminated; only its first 12
   characters are examined.  The result is either S itself or a string
   literal -- the caller must not free it. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast to unsigned char: passing a negative char (possible
               when plain char is signed and the byte is >= 0x80) to
               tolower() is undefined behavior per the C standard. */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
198
199/* Return the coding spec in S, or NULL if none is found. */
200
/* Scan the single source line S (of SIZE bytes) for a PEP 263 style
   "coding[:=] name" declaration inside a comment.  Returns a freshly
   allocated, normalized encoding name (caller frees with PyMem_FREE),
   or NULL when the line carries no coding spec or allocation fails. */
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        /* Anything other than whitespace before the '#' means the line
           holds real code, so it cannot declare an encoding. */
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* Skip spaces/tabs after the separator. */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            /* The encoding name: alphanumerics plus '-', '_', '.'. */
            begin = t;
            while (isalnum(Py_CHARMASK(t[0])) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q = get_normal_name(r);
                /* get_normal_name may return a literal; in that case
                   swap our copy for a copy of the canonical name. */
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}
242
243/* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
247
/* Check whether LINE contains a coding spec.  If it does, record it in
   TOK and, for encodings other than utf-8/iso-8859-1, install a codec
   readline via SET_READLINE.  Returns 1 on success, 0 on failure (a
   SyntaxError is set in the failure case). */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);   /* cs is owned by us if non-NULL */
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                /* These need no codec; keep reading raw bytes.
                   Ownership of cs transfers to tok->encoding. */
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;       /* ownership transferred */
                    tok->decoding_state = -1; /* decode from now on */
                }
                else
                    PyMem_FREE(cs);
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            /* A BOM already fixed the encoding; the spec must agree. */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}
296
297/* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
300
/* See whether the input starts with a UTF-8 BOM, consuming it if so and
   recording "utf-8" as the encoding.  GET_CHAR/UNGET_CHAR abstract the
   byte source (file or string).  Always returns 1 in this version;
   SET_READLINE is only used by the disabled UTF-16 paths. */
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = 1;    /* default: raw byte reading */
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        /* Possible UTF-8 BOM: EF BB BF. */
        ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
        ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported.  */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = -1;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = -1;
#endif
    } else {
        /* Ordinary first byte: push it back and proceed. */
        unget_char(ch, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
    return 1;
  NON_BOM:
    /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
    unget_char(0xFF, tok);      /* XXX this will cause a syntax error */
    return 1;
}
339
340/* Read a line of text from TOK into S, using the stream in TOK.
341 Return NULL on failure, else S.
342
343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
351 until the buffer ends with a '\n' (or until the end of the file is
352 reached): see tok_nextc and its calls to decoding_fgets.
353*/
354
/* Read one decoded line into S (of SIZE bytes, including room for the
   terminating NUL) via tok->decoding_readline; see the state comment
   above for how tok->decoding_buffer is used.  Returns S, or NULL on
   EOF/error (error_ret marks the tokenizer as erred). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* Case 1: fetch a fresh line (a unicode object). */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Cases 2 and 3: consume the stashed buffer.  If it is a str,
           it is leftover UTF-8 from a previous overflow (case 3). */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        /* buf is a unicode object; encode it (consumes our reference
           to buf in both branches below). */
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Line longer than the caller's buffer: stash the overflow for
           the next call (case 3 above). */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0) return NULL; /* EOF */
    return s;
#endif
}
404
405/* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
407
408 This function is called from check_bom and check_coding_spec.
409
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
412
413 Return 1 on success, 0 on failure. */
414
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    /* Wrap the already-open FILE* in a Python file object so the codec
       machinery can read from it. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);          /* reader holds its own reference */
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);          /* bound method keeps the reader alive */
    if (readline == NULL)
        return 0;

    /* NOTE(review): an existing tok->decoding_readline would be
       overwritten here without a DECREF -- presumably this is only
       ever called once per tokenizer; verify against callers. */
    tok->decoding_readline = readline;
    return 1;
}
438
439/* Fetch the next byte from TOK. */
440
441static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
443}
444
445/* Unfetch the last byte back into TOK. */
446
447static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
449}
450
451/* Read a line of input from TOK. Determine encoding
452 if necessary. */
453
/* Read one line into S (SIZE bytes).  Dispatches on decoding_state:
   <0 reads through the installed codec, >0 reads raw bytes, 0 means
   the encoding is still undetermined and a BOM check runs first.
   Also performs the one-time coding-spec scan on lines 1-2 and the
   default-ASCII enforcement.  Returns S or NULL on EOF/error. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
            /* Loop again: decoding_state now selects a branch above. */
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        /* PEP 263: a coding spec may appear on line 1 or 2 only. */
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    /* (This whole function is already compiled only when PGEN is not
       defined, so this inner #ifndef appears redundant -- kept as is.) */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        sprintf(buf,
                "Non-ASCII character '\\x%.2x' "
                "in file %.200s on line %i, "
                "but no encoding declared; "
                "see http://www.python.org/peps/pep-0263.html for details",
                badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
511
/* Return non-zero when TOK's input is exhausted.  For raw reading this
   is feof(); when a codec readline is active we must read ahead one
   line, stashing it in tok->decoding_buffer so that a later fp_readl
   call consumes it (see the fp_readl state comment). */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                /* Decoding failure: report as EOF after flagging it. */
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        /* An empty line object signals end of input. */
        return PyObject_Length(buf) == 0;
    }
}
531
532/* Fetch a byte from TOK, using the string buffer. */
533
534static int
535buf_getc(struct tok_state *tok) {
536 return Py_CHARMASK(*tok->str++);
537}
538
539/* Unfetch a byte from TOK, using the string buffer. */
540
541static void
542buf_ungetc(int c, struct tok_state *tok) {
543 tok->str--;
544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
545}
546
547/* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
549
550static int
551buf_setreadl(struct tok_state *tok, const char* enc) {
552 tok->enc = enc;
553 return 1;
554}
555
556/* Return a UTF-8 encoding Python string object from the
557 C byte string STR, which is encoded with ENC. */
558
#ifdef Py_USING_UNICODE
/* Decode the NUL-terminated byte string STR from encoding ENC and
   re-encode it as a UTF-8 string object.  Returns a new reference, or
   NULL (with an exception set) on decode/encode failure. */
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *decoded, *result;
    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;
    result = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return result;
}
#endif
571
572/* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
575
/* Prepare the byte string STR for tokenizing: strip a UTF-8 BOM, find
   a PEP 263 coding spec on line 1 or 2, and if one names a non-UTF-8
   encoding, translate the whole string to UTF-8.  Returns the string
   to tokenize (STR itself, or the buffer of a new UTF-8 object kept
   alive through tok->decoding_buffer), or NULL on error. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    const char *newl[2] = {NULL, NULL};  /* positions of first two '\n' */
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A BOM fixed the encoding: recode to UTF-8 now.  str then
           aliases utf8's internal buffer. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        /* A coding spec named an encoding needing translation; a BOM
           cannot also have been present (it would have set tok->encoding
           instead), hence the assert. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL) {
            PyErr_Format(PyExc_SyntaxError,
                         "unknown encoding: %s", tok->enc);
            return error_ret(tok);
        }
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    /* Park the UTF-8 object here so the returned pointer stays valid;
       PyTokenizer_Free releases it. */
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
634
635#endif /* PGEN */
636
637/* Set up tokenizer for string */
638
/* Create a tokenizer that reads from the NUL-terminated string STR.
   Returns NULL on allocation or decoding failure.  The caller frees
   the result with PyTokenizer_Free. */
struct tok_state *
PyTokenizer_FromString(const char *str)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    /* Handle BOM / coding spec; may return a pointer into a UTF-8
       object kept alive via tok->decoding_buffer. */
    str = (char *)decode_str(str, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}
655
656
657/* Set up tokenizer for file */
658
/* Create a tokenizer that reads from the open stream FP.  PS1/PS2 are
   the primary and continuation prompts for interactive input (NULL for
   non-interactive use).  Returns NULL on allocation failure; the
   caller frees the result with PyTokenizer_Free. */
struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;     /* buffer starts out empty */
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}
676
677
678/* Free a tok_state structure */
679
/* Release a tok_state and everything it owns.  Note: tok->buf is owned
   by the tokenizer only in the file-reading case (for strings it
   aliases caller-owned or decoding_buffer memory), hence the fp check
   -- this mirrors the test in error_ret. */
void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}
693
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive input line *INP from sys.stdin's encoding to
   UTF-8, replacing *INP (the old buffer is freed) and recording the
   stdin encoding in tok->encoding.  Returns 0 on success or when no
   conversion applies (not really stdin, no usable encoding, or a
   decode failure -- the line is then used as-is); returns -1 only on
   memory exhaustion, with tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only convert when we are genuinely reading the real stdin. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);     /* keep the encoding object alive while we use it */

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    /* Swap the caller's buffer for the converted one. */
    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif
753
754/* Get next char, updating state; error code goes into tok->done */
755
static int
tok_nextc(register struct tok_state *tok)
{
    /* Return the next input character and advance tok->cur, refilling
       the buffer from whichever source applies: an in-memory string
       (tok->fp == NULL), an interactive prompt (tok->prompt != NULL),
       or a file.  On error or end of input, returns EOF with tok->done
       holding the error code. */
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: extend [cur, inp) to cover the next line. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;          /* include the newline */
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                /* No token in progress: tok->buf tracks the line start
                   for error reporting. */
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input: read one line via PyOS_Readline. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;  /* switch to ps2 */
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
            else if (tok->start != NULL) {
                /* A token is in progress (spans lines): append the new
                   line to the existing buffer, preserving offsets. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;  /* rebase after realloc */
            }
            else {
                /* No token in progress: the new line replaces the old
                   buffer entirely. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;  /* NOTE(review): duplicate
                                                assignment; harmless */
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* Non-interactive file input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                /* No token in progress: reuse the buffer from the top. */
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                                   tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';  /* got a full line? */
                }
            }
            else {
                /* Token in progress: keep existing data, remember where
                   the read cursor was so we can restore it. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                /* Grow the buffer by BUFSIZ and read more; all saved
                   offsets must be rebased after PyMem_REALLOC. */
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                               (int)(tok->end - tok->inp),
                               tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                    */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
923
924
925/* Back-up one character */
926
927static void
928tok_backup(register struct tok_state *tok, register int c)
929{
930 if (c != EOF) {
931 if (--tok->cur < tok->buf)
932 Py_FatalError("tok_backup: begin of buffer");
933 if (*tok->cur != c)
934 *tok->cur = c;
935 }
936}
937
938
939/* Return the token corresponding to a single character */
940
941int
942PyToken_OneChar(int c)
943{
944 switch (c) {
945 case '(': return LPAR;
946 case ')': return RPAR;
947 case '[': return LSQB;
948 case ']': return RSQB;
949 case ':': return COLON;
950 case ',': return COMMA;
951 case ';': return SEMI;
952 case '+': return PLUS;
953 case '-': return MINUS;
954 case '*': return STAR;
955 case '/': return SLASH;
956 case '|': return VBAR;
957 case '&': return AMPER;
958 case '<': return LESS;
959 case '>': return GREATER;
960 case '=': return EQUAL;
961 case '.': return DOT;
962 case '%': return PERCENT;
963 case '`': return BACKQUOTE;
964 case '{': return LBRACE;
965 case '}': return RBRACE;
966 case '^': return CIRCUMFLEX;
967 case '~': return TILDE;
968 case '@': return AT;
969 default: return OP;
970 }
971}
972
973
974int
975PyToken_TwoChars(int c1, int c2)
976{
977 switch (c1) {
978 case '=':
979 switch (c2) {
980 case '=': return EQEQUAL;
981 }
982 break;
983 case '!':
984 switch (c2) {
985 case '=': return NOTEQUAL;
986 }
987 break;
988 case '<':
989 switch (c2) {
990 case '>': return NOTEQUAL;
991 case '=': return LESSEQUAL;
992 case '<': return LEFTSHIFT;
993 }
994 break;
995 case '>':
996 switch (c2) {
997 case '=': return GREATEREQUAL;
998 case '>': return RIGHTSHIFT;
999 }
1000 break;
1001 case '+':
1002 switch (c2) {
1003 case '=': return PLUSEQUAL;
1004 }
1005 break;
1006 case '-':
1007 switch (c2) {
1008 case '=': return MINEQUAL;
1009 }
1010 break;
1011 case '*':
1012 switch (c2) {
1013 case '*': return DOUBLESTAR;
1014 case '=': return STAREQUAL;
1015 }
1016 break;
1017 case '/':
1018 switch (c2) {
1019 case '/': return DOUBLESLASH;
1020 case '=': return SLASHEQUAL;
1021 }
1022 break;
1023 case '|':
1024 switch (c2) {
1025 case '=': return VBAREQUAL;
1026 }
1027 break;
1028 case '%':
1029 switch (c2) {
1030 case '=': return PERCENTEQUAL;
1031 }
1032 break;
1033 case '&':
1034 switch (c2) {
1035 case '=': return AMPEREQUAL;
1036 }
1037 break;
1038 case '^':
1039 switch (c2) {
1040 case '=': return CIRCUMFLEXEQUAL;
1041 }
1042 break;
1043 }
1044 return OP;
1045}
1046
1047int
1048PyToken_ThreeChars(int c1, int c2, int c3)
1049{
1050 switch (c1) {
1051 case '<':
1052 switch (c2) {
1053 case '<':
1054 switch (c3) {
1055 case '=':
1056 return LEFTSHIFTEQUAL;
1057 }
1058 break;
1059 }
1060 break;
1061 case '>':
1062 switch (c2) {
1063 case '>':
1064 switch (c3) {
1065 case '=':
1066 return RIGHTSHIFTEQUAL;
1067 }
1068 break;
1069 }
1070 break;
1071 case '*':
1072 switch (c2) {
1073 case '*':
1074 switch (c3) {
1075 case '=':
1076 return DOUBLESTAREQUAL;
1077 }
1078 break;
1079 }
1080 break;
1081 case '/':
1082 switch (c2) {
1083 case '/':
1084 switch (c3) {
1085 case '=':
1086 return DOUBLESLASHEQUAL;
1087 }
1088 break;
1089 }
1090 break;
1091 }
1092 return OP;
1093}
1094
/* Handle inconsistent use of tabs and spaces in indentation.  When
   tok->alterror is set this is a hard error: set E_TABSPACE and return
   1 so the caller produces ERRORTOKEN.  Otherwise, if tok->altwarning
   is set, print a warning (once per file) and return 0 to continue. */
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;    /* skip the rest of the line */
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;    /* warn only once */
    }
    return 0;
}
1110
1111
1112/* Get next token, after space stripping etc. */
1113
1114static int
1115tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1116{
1117 register int c;
1118 int blankline;
1119
1120 *p_start = *p_end = NULL;
1121 nextline:
1122 tok->start = NULL;
1123 blankline = 0;
1124
1125 /* Get indentation level */
1126 if (tok->atbol) {
1127 register int col = 0;
1128 register int altcol = 0;
1129 tok->atbol = 0;
1130 for (;;) {
1131 c = tok_nextc(tok);
1132 if (c == ' ')
1133 col++, altcol++;
1134 else if (c == '\t') {
1135 col = (col/tok->tabsize + 1) * tok->tabsize;
1136 altcol = (altcol/tok->alttabsize + 1)
1137 * tok->alttabsize;
1138 }
1139 else if (c == '\014') /* Control-L (formfeed) */
1140 col = altcol = 0; /* For Emacs users */
1141 else
1142 break;
1143 }
1144 tok_backup(tok, c);
1145 if (c == '#' || c == '\n') {
1146 /* Lines with only whitespace and/or comments
1147 shouldn't affect the indentation and are
1148 not passed to the parser as NEWLINE tokens,
1149 except *totally* empty lines in interactive
1150 mode, which signal the end of a command group. */
1151 if (col == 0 && c == '\n' && tok->prompt != NULL)
1152 blankline = 0; /* Let it through */
1153 else
1154 blankline = 1; /* Ignore completely */
1155 /* We can't jump back right here since we still
1156 may need to skip to the end of a comment */
1157 }
1158 if (!blankline && tok->level == 0) {
1159 if (col == tok->indstack[tok->indent]) {
1160 /* No change */
1161 if (altcol != tok->altindstack[tok->indent]) {
1162 if (indenterror(tok))
1163 return ERRORTOKEN;
1164 }
1165 }
1166 else if (col > tok->indstack[tok->indent]) {
1167 /* Indent -- always one */
1168 if (tok->indent+1 >= MAXINDENT) {
1169 tok->done = E_TOODEEP;
1170 tok->cur = tok->inp;
1171 return ERRORTOKEN;
1172 }
1173 if (altcol <= tok->altindstack[tok->indent]) {
1174 if (indenterror(tok))
1175 return ERRORTOKEN;
1176 }
1177 tok->pendin++;
1178 tok->indstack[++tok->indent] = col;
1179 tok->altindstack[tok->indent] = altcol;
1180 }
1181 else /* col < tok->indstack[tok->indent] */ {
1182 /* Dedent -- any number, must be consistent */
1183 while (tok->indent > 0 &&
1184 col < tok->indstack[tok->indent]) {
1185 tok->pendin--;
1186 tok->indent--;
1187 }
1188 if (col != tok->indstack[tok->indent]) {
1189 tok->done = E_DEDENT;
1190 tok->cur = tok->inp;
1191 return ERRORTOKEN;
1192 }
1193 if (altcol != tok->altindstack[tok->indent]) {
1194 if (indenterror(tok))
1195 return ERRORTOKEN;
1196 }
1197 }
1198 }
1199 }
1200
1201 tok->start = tok->cur;
1202
1203 /* Return pending indents/dedents */
1204 if (tok->pendin != 0) {
1205 if (tok->pendin < 0) {
1206 tok->pendin++;
1207 return DEDENT;
1208 }
1209 else {
1210 tok->pendin--;
1211 return INDENT;
1212 }
1213 }
1214
1215 again:
1216 tok->start = NULL;
1217 /* Skip spaces */
1218 do {
1219 c = tok_nextc(tok);
1220 } while (c == ' ' || c == '\t' || c == '\014');
1221
1222 /* Set start of current token */
1223 tok->start = tok->cur - 1;
1224
1225 /* Skip comment, while looking for tab-setting magic */
1226 if (c == '#') {
1227 static char *tabforms[] = {
1228 "tab-width:", /* Emacs */
1229 ":tabstop=", /* vim, full form */
1230 ":ts=", /* vim, abbreviated form */
1231 "set tabsize=", /* will vi never die? */
1232 /* more templates can be added here to support other editors */
1233 };
1234 char cbuf[80];
1235 char *tp, **cp;
1236 tp = cbuf;
1237 do {
1238 *tp++ = c = tok_nextc(tok);
1239 } while (c != EOF && c != '\n' &&
1240 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1241 *tp = '\0';
1242 for (cp = tabforms;
1243 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1244 cp++) {
1245 if ((tp = strstr(cbuf, *cp))) {
1246 int newsize = atoi(tp + strlen(*cp));
1247
1248 if (newsize >= 1 && newsize <= 40) {
1249 tok->tabsize = newsize;
1250 if (Py_VerboseFlag)
1251 PySys_WriteStderr(
1252 "Tab size set to %d\n",
1253 newsize);
1254 }
1255 }
1256 }
1257 while (c != EOF && c != '\n')
1258 c = tok_nextc(tok);
1259 }
1260
1261 /* Check for EOF and errors now */
1262 if (c == EOF) {
1263 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1264 }
1265
1266 /* Identifier (most frequent token!) */
1267 if (isalpha(c) || c == '_') {
1268 /* Process r"", u"" and ur"" */
1269 switch (c) {
1270 case 'b':
1271 case 'B':
1272 c = tok_nextc(tok);
1273 if (c == 'r' || c == 'R')
1274 c = tok_nextc(tok);
1275 if (c == '"' || c == '\'')
1276 goto letter_quote;
1277 break;
1278 case 'r':
1279 case 'R':
1280 c = tok_nextc(tok);
1281 if (c == '"' || c == '\'')
1282 goto letter_quote;
1283 break;
1284 case 'u':
1285 case 'U':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
1292 }
1293 while (isalnum(c) || c == '_') {
1294 c = tok_nextc(tok);
1295 }
1296 tok_backup(tok, c);
1297 *p_start = tok->start;
1298 *p_end = tok->cur;
1299 return NAME;
1300 }
1301
1302 /* Newline */
1303 if (c == '\n') {
1304 tok->atbol = 1;
1305 if (blankline || tok->level > 0)
1306 goto nextline;
1307 *p_start = tok->start;
1308 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1309 tok->cont_line = 0;
1310 return NEWLINE;
1311 }
1312
1313 /* Period or number starting with period? */
1314 if (c == '.') {
1315 c = tok_nextc(tok);
1316 if (isdigit(c)) {
1317 goto fraction;
1318 }
1319 else {
1320 tok_backup(tok, c);
1321 *p_start = tok->start;
1322 *p_end = tok->cur;
1323 return DOT;
1324 }
1325 }
1326
1327 /* Number */
1328 if (isdigit(c)) {
1329 if (c == '0') {
1330 /* Hex, octal or binary -- maybe. */
1331 c = tok_nextc(tok);
1332 if (c == '.')
1333 goto fraction;
1334#ifndef WITHOUT_COMPLEX
1335 if (c == 'j' || c == 'J')
1336 goto imaginary;
1337#endif
1338 if (c == 'x' || c == 'X') {
1339
1340 /* Hex */
1341 c = tok_nextc(tok);
1342 if (!isxdigit(c)) {
1343 tok->done = E_TOKEN;
1344 tok_backup(tok, c);
1345 return ERRORTOKEN;
1346 }
1347 do {
1348 c = tok_nextc(tok);
1349 } while (isxdigit(c));
1350 }
1351 else if (c == 'o' || c == 'O') {
1352 /* Octal */
1353 c = tok_nextc(tok);
1354 if (c < '0' || c >= '8') {
1355 tok->done = E_TOKEN;
1356 tok_backup(tok, c);
1357 return ERRORTOKEN;
1358 }
1359 do {
1360 c = tok_nextc(tok);
1361 } while ('0' <= c && c < '8');
1362 }
1363 else if (c == 'b' || c == 'B') {
1364 /* Binary */
1365 c = tok_nextc(tok);
1366 if (c != '0' && c != '1') {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1370 }
1371 do {
1372 c = tok_nextc(tok);
1373 } while (c == '0' || c == '1');
1374 }
1375 else {
1376 int found_decimal = 0;
1377 /* Octal; c is first char of it */
1378 /* There's no 'isoctdigit' macro, sigh */
1379 while ('0' <= c && c < '8') {
1380 c = tok_nextc(tok);
1381 }
1382 if (isdigit(c)) {
1383 found_decimal = 1;
1384 do {
1385 c = tok_nextc(tok);
1386 } while (isdigit(c));
1387 }
1388 if (c == '.')
1389 goto fraction;
1390 else if (c == 'e' || c == 'E')
1391 goto exponent;
1392#ifndef WITHOUT_COMPLEX
1393 else if (c == 'j' || c == 'J')
1394 goto imaginary;
1395#endif
1396 else if (found_decimal) {
1397 tok->done = E_TOKEN;
1398 tok_backup(tok, c);
1399 return ERRORTOKEN;
1400 }
1401 }
1402 if (c == 'l' || c == 'L')
1403 c = tok_nextc(tok);
1404 }
1405 else {
1406 /* Decimal */
1407 do {
1408 c = tok_nextc(tok);
1409 } while (isdigit(c));
1410 if (c == 'l' || c == 'L')
1411 c = tok_nextc(tok);
1412 else {
1413 /* Accept floating point numbers. */
1414 if (c == '.') {
1415 fraction:
1416 /* Fraction */
1417 do {
1418 c = tok_nextc(tok);
1419 } while (isdigit(c));
1420 }
1421 if (c == 'e' || c == 'E') {
1422 exponent:
1423 /* Exponent part */
1424 c = tok_nextc(tok);
1425 if (c == '+' || c == '-')
1426 c = tok_nextc(tok);
1427 if (!isdigit(c)) {
1428 tok->done = E_TOKEN;
1429 tok_backup(tok, c);
1430 return ERRORTOKEN;
1431 }
1432 do {
1433 c = tok_nextc(tok);
1434 } while (isdigit(c));
1435 }
1436#ifndef WITHOUT_COMPLEX
1437 if (c == 'j' || c == 'J')
1438 /* Imaginary part */
1439 imaginary:
1440 c = tok_nextc(tok);
1441#endif
1442 }
1443 }
1444 tok_backup(tok, c);
1445 *p_start = tok->start;
1446 *p_end = tok->cur;
1447 return NUMBER;
1448 }
1449
1450 letter_quote:
1451 /* String */
1452 if (c == '\'' || c == '"') {
1453 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1454 int quote = c;
1455 int triple = 0;
1456 int tripcount = 0;
1457 for (;;) {
1458 c = tok_nextc(tok);
1459 if (c == '\n') {
1460 if (!triple) {
1461 tok->done = E_EOLS;
1462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1464 }
1465 tripcount = 0;
1466 tok->cont_line = 1; /* multiline string. */
1467 }
1468 else if (c == EOF) {
1469 if (triple)
1470 tok->done = E_EOFS;
1471 else
1472 tok->done = E_EOLS;
1473 tok->cur = tok->inp;
1474 return ERRORTOKEN;
1475 }
1476 else if (c == quote) {
1477 tripcount++;
1478 if (tok->cur - tok->start == quote2) {
1479 c = tok_nextc(tok);
1480 if (c == quote) {
1481 triple = 1;
1482 tripcount = 0;
1483 continue;
1484 }
1485 tok_backup(tok, c);
1486 }
1487 if (!triple || tripcount == 3)
1488 break;
1489 }
1490 else if (c == '\\') {
1491 tripcount = 0;
1492 c = tok_nextc(tok);
1493 if (c == EOF) {
1494 tok->done = E_EOLS;
1495 tok->cur = tok->inp;
1496 return ERRORTOKEN;
1497 }
1498 }
1499 else
1500 tripcount = 0;
1501 }
1502 *p_start = tok->start;
1503 *p_end = tok->cur;
1504 return STRING;
1505 }
1506
1507 /* Line continuation */
1508 if (c == '\\') {
1509 c = tok_nextc(tok);
1510 if (c != '\n') {
1511 tok->done = E_LINECONT;
1512 tok->cur = tok->inp;
1513 return ERRORTOKEN;
1514 }
1515 tok->cont_line = 1;
1516 goto again; /* Read next line */
1517 }
1518
1519 /* Check for two-character token */
1520 {
1521 int c2 = tok_nextc(tok);
1522 int token = PyToken_TwoChars(c, c2);
1523#ifndef PGEN
1524 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1525 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1526 "<> not supported in 3.x; use !=",
1527 tok->filename, tok->lineno,
1528 NULL, NULL)) {
1529 return ERRORTOKEN;
1530 }
1531 }
1532#endif
1533 if (token != OP) {
1534 int c3 = tok_nextc(tok);
1535 int token3 = PyToken_ThreeChars(c, c2, c3);
1536 if (token3 != OP) {
1537 token = token3;
1538 } else {
1539 tok_backup(tok, c3);
1540 }
1541 *p_start = tok->start;
1542 *p_end = tok->cur;
1543 return token;
1544 }
1545 tok_backup(tok, c2);
1546 }
1547
1548 /* Keep track of parentheses nesting level */
1549 switch (c) {
1550 case '(':
1551 case '[':
1552 case '{':
1553 tok->level++;
1554 break;
1555 case ')':
1556 case ']':
1557 case '}':
1558 tok->level--;
1559 break;
1560 }
1561
1562 /* Punctuation character */
1563 *p_start = tok->start;
1564 *p_end = tok->cur;
1565 return PyToken_OneChar(c);
1566}
1567
1568int
1569PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1570{
1571 int result = tok_get(tok, p_start, p_end);
1572 if (tok->decoding_erred) {
1573 result = ERRORTOKEN;
1574 tok->done = E_DECODE;
1575 }
1576 return result;
1577}
1578
1579/* This function is only called from parsetok. However, it cannot live
1580 there, as it must be empty for PGEN, and we can check for PGEN only
1581 in this file. */
1582
1583#if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub for PGEN (or Unicode-less) builds: no source re-encoding is ever
   performed there, so there is no original-encoding text to restore.
   Parameters are kept only for signature compatibility with the full
   implementation compiled in the #else branch. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
	return NULL;
}
1589#else
1590#ifdef Py_USING_UNICODE
1591static PyObject *
1592dec_utf8(const char *enc, const char *text, size_t len) {
1593 PyObject *ret = NULL;
1594 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1595 if (unicode_text) {
1596 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1597 Py_DECREF(unicode_text);
1598 }
1599 if (!ret) {
1600 PyErr_Clear();
1601 }
1602 return ret;
1603}
1604char *
1605PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1606{
1607 char *text = NULL;
1608 if (tok->encoding) {
1609 /* convert source to original encondig */
1610 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1611 if (lineobj != NULL) {
1612 int linelen = PyString_Size(lineobj);
1613 const char *line = PyString_AsString(lineobj);
1614 text = PyObject_MALLOC(linelen + 1);
1615 if (text != NULL && line != NULL) {
1616 if (linelen)
1617 strncpy(text, line, linelen);
1618 text[linelen] = '\0';
1619 }
1620 Py_DECREF(lineobj);
1621
1622 /* adjust error offset */
1623 if (*offset > 1) {
1624 PyObject *offsetobj = dec_utf8(tok->encoding,
1625 tok->buf, *offset-1);
1626 if (offsetobj) {
1627 *offset = PyString_Size(offsetobj) + 1;
1628 Py_DECREF(offsetobj);
1629 }
1630 }
1631
1632 }
1633 }
1634 return text;
1635
1636}
1637#endif /* defined(Py_USING_UNICODE) */
1638#endif
1639
1640
1641#ifdef Py_DEBUG
1642
1643void
1644tok_dump(int type, char *start, char *end)
1645{
1646 printf("%s", _PyParser_TokenNames[type]);
1647 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1648 printf("(%.*s)", (int)(end - start), start);
1649}
1650
1651#endif
Note: See TracBrowser for help on using the repository browser.