source: vendor/python/2.5/Parser/tokenizer.c

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 33.1 KB
Line 
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)	(c)
#else
#define Py_CHARMASK(c)	((c) & 0xff)
#endif

/* Forward */
static struct tok_state *tok_new(void);         /* allocate a fresh tokenizer state */
static int tok_nextc(struct tok_state *tok);    /* fetch next input character */
static void tok_backup(struct tok_state *tok, int c); /* push a character back */
41
42/* Token names */
43
/* Printable names for the token codes, indexed by token number.
   The order here must match the #define values in token.h exactly;
   do not reorder or insert entries without updating token.h. */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
106tok_new(void)
107{
108 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
109 sizeof(struct tok_state));
110 if (tok == NULL)
111 return NULL;
112 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
113 tok->done = E_OK;
114 tok->fp = NULL;
115 tok->tabsize = TABSIZE;
116 tok->indent = 0;
117 tok->indstack[0] = 0;
118 tok->atbol = 1;
119 tok->pendin = 0;
120 tok->prompt = tok->nextprompt = NULL;
121 tok->lineno = 0;
122 tok->level = 0;
123 tok->filename = NULL;
124 tok->altwarning = 0;
125 tok->alterror = 0;
126 tok->alttabsize = 1;
127 tok->altindstack[0] = 0;
128 tok->decoding_state = 0;
129 tok->decoding_erred = 0;
130 tok->read_coding_spec = 0;
131 tok->encoding = NULL;
132 tok->cont_line = 0;
133#ifndef PGEN
134 tok->decoding_readline = NULL;
135 tok->decoding_buffer = NULL;
136#endif
137 return tok;
138}
139
140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145 return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151 return feof(tok->fp);
152}
153
/* PGEN build: source strings are used verbatim; no decoding happens,
   so the input pointer is handed straight back. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	(void)tok;	/* unused in the PGEN build */
	return str;
}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165 tok->decoding_erred = 1;
166 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167 PyMem_FREE(tok->buf);
168 tok->buf = NULL;
169 return NULL; /* as if it were EOF */
170}
171
172static char *
173new_string(const char *s, Py_ssize_t len)
174{
175 char* result = (char *)PyMem_MALLOC(len + 1);
176 if (result != NULL) {
177 memcpy(result, s, len);
178 result[len] = '\0';
179 }
180 return result;
181}
182
/* Normalize the spelling of a source-encoding name for the encodings
   the tokenizer special-cases: variants of "utf-8" collapse to the
   static string "utf-8", variants of latin-1 to "iso-8859-1"; any
   other name is returned unchanged (i.e. S itself).  Only the first
   12 characters are examined; underscores map to hyphens and upper
   case folds to lower.  The result is either a string literal or S,
   so the caller must not free it. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		/* Read the byte as unsigned char: passing a negative
		   value (other than EOF) to tolower() is undefined
		   behavior on platforms where plain char is signed
		   (CERT STR37-C). */
		int c = (unsigned char)s[i];
		if (c == '\0') break;
		else if (c == '_') buf[i] = '-';
		else buf[i] = tolower(c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
	else return s;
}
205
206/* Return the coding spec in S, or NULL if none is found. */
207
/* Scan line S (of SIZE bytes) for a PEP 263 coding declaration of the
   form "coding[:=] <name>".  Returns a freshly allocated, normalized
   encoding name (caller frees with PyMem_FREE), or NULL when the line
   carries no spec.  NOTE(review): NULL is also returned if new_string
   runs out of memory — the caller cannot tell the two cases apart. */
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
	Py_ssize_t i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		/* Anything but whitespace before the '#' disqualifies
		   the line (\014 is formfeed). */
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			/* Skip blanks between the separator and the name. */
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			/* The encoding name: alphanumerics plus -, _, . */
			begin = t;
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q = get_normal_name(r);
				/* Normalization returned a different
				   (static) string: re-copy it so the
				   caller always owns the result. */
				if (r != q) {
					PyMem_FREE(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
249
250/* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
254
/* Look for a PEP 263 coding spec on LINE (SIZE bytes).  If one is
   found and no encoding is set yet, install it: the trivial encodings
   "utf-8"/"iso-8859-1" are just recorded, anything else activates the
   decoding path via SET_READLINE.  If an encoding was already set by a
   BOM, the spec must agree with it.  Returns 1 on success, 0 on
   failure (with a SyntaxError set). */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				/* Trivial encodings: tok takes ownership
				   of cs, no decoder needed. */
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		/* cs is reused here purely for the error message. */
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}
303
304/* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
307
/* Peek at the start of the input (via GET_CHAR/UNGET_CHAR) for a
   UTF-8 byte order mark.  On a full BOM, record "utf-8" as the
   encoding; on no BOM, push the byte back untouched.  Always leaves
   tok->decoding_state == 1 ("raw") unless a decoder is installed.
   Returns 1 on success, 0 on failure. */
static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: EF BB BF. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported. */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* No BOM at all: leave the stream as we found it. */
		unget_char(ch, tok);
		return 1;
	}
	/* Full UTF-8 BOM consumed: record the resulting encoding. */
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}
346
347/* Read a line of text from TOK into S, using the stream in TOK.
348 Return NULL on failure, else S.
349
350 On entry, tok->decoding_buffer will be one of:
351 1) NULL: need to call tok->decoding_readline to get a new line
352 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
353 stored the result in tok->decoding_buffer
354 3) PyStringObject *: previous call to fp_readl did not have enough room
355 (in the s buffer) to copy entire contents of the line read
356 by tok->decoding_readline. tok->decoding_buffer has the overflow.
357 In this case, fp_readl is called in a loop (with an expanded buffer)
358 until the buffer ends with a '\n' (or until the end of the file is
359 reached): see tok_nextc and its calls to decoding_fgets.
360*/
361
/* Read one decoded line into S (at most SIZE-1 bytes plus NUL) using
   tok->decoding_readline; see the overview comment above for the
   meaning of tok->decoding_buffer.  Returns S, or NULL on EOF/error
   (error_ret() marks the error on TOK). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* No pending data: fetch a fresh (unicode) line. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Consume the stored buffer; a str object here is
		   overflow left behind by a previous fp_readl call. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		/* buf is unicode; encode it (this consumes buf). */
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Line longer than S: stash the overflow for the next
		   call and hand back only what fits. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}
411
412/* Set the readline function for TOK to a StreamReader's
413 readline function. The StreamReader is named ENC.
414
415 This function is called from check_bom and check_coding_spec.
416
417 ENC is usually identical to the future value of tok->encoding,
418 except for the (currently unsupported) case of UTF-16.
419
420 Return 1 on success, 0 on failure. */
421
/* Install a codec StreamReader's readline method as TOK's decoding
   readline, wrapping tok->fp in a "rb" file object.  Returns 1 on
   success, 0 on failure (a Python error will be set).
   NOTE(review): each intermediate's reference is dropped as soon as
   the next object is obtained; only `readline` is kept. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* XXX: constify filename argument. */
	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	tok->decoding_readline = readline;
	return 1;
}
445
446/* Fetch the next byte from TOK. */
447
448static int fp_getc(struct tok_state *tok) {
449 return getc(tok->fp);
450}
451
452/* Unfetch the last byte back into TOK. */
453
454static void fp_ungetc(int c, struct tok_state *tok) {
455 ungetc(c, tok->fp);
456}
457
458/* Read a line of input from TOK. Determine encoding
459 if necessary. */
460
/* Read one line into S (SIZE bytes) from TOK's file, decoding it if a
   codec is active.  On the first reads this also detects a BOM and,
   within the first two lines, a PEP 263 coding spec.  Lines must be
   pure ASCII until an encoding is declared.  Returns S, or NULL on
   EOF/error (error_ret marks the error on TOK). */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	/* A coding spec may only appear on the first two lines. */
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
	   non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet. */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}
518
/* Report whether TOK's file input is exhausted.  For raw reads this is
   plain feof(); with a codec active, a line is read ahead into
   tok->decoding_buffer and EOF is an empty result. */
static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			/* Read ahead; the line is kept for fp_readl. */
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				/* Decoding failure counts as EOF. */
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}
538
539/* Fetch a byte from TOK, using the string buffer. */
540
541static int
542buf_getc(struct tok_state *tok) {
543 return Py_CHARMASK(*tok->str++);
544}
545
546/* Unfetch a byte from TOK, using the string buffer. */
547
548static void
549buf_ungetc(int c, struct tok_state *tok) {
550 tok->str--;
551 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
552}
553
554/* Set the readline function for TOK to ENC. For the string-based
555 tokenizer, this means to just record the encoding. */
556
557static int
558buf_setreadl(struct tok_state *tok, const char* enc) {
559 tok->enc = enc;
560 return 1;
561}
562
563/* Return a UTF-8 encoding Python string object from the
564 C byte string STR, which is encoded with ENC. */
565
566#ifdef Py_USING_UNICODE
/* Decode the NUL-terminated byte string STR from encoding ENC and
   re-encode it as a new UTF-8 PyString.  Returns a new reference, or
   NULL with a Python error set on failure. */
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
577#endif
578
579/* Decode a byte string STR for use as the buffer of TOK.
580 Look for encoding declarations inside STR, and record them
581 inside TOK. */
582
/* Prepare the byte string STR for tokenization: strip a BOM, look at
   the first two lines for a coding spec, and translate to UTF-8 when
   an encoding other than plain ASCII/UTF-8 is declared.  Returns the
   (possibly re-encoded) string, or NULL on error.  When translation
   happens, the returned pointer aliases the PyString stored in
   tok->decoding_buffer, which keeps it alive. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return error_ret(tok);
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A (disabled) UTF-16 BOM set an encoding: translate. */
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return error_ret(tok);
		str = PyString_AsString(utf8);
	}
#endif
	/* Find the end of the second line (coding specs may only
	   appear on the first two lines, per PEP 263). */
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return error_ret(tok);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A coding spec was found: translate the whole source. */
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL) {
			PyErr_Format(PyExc_SyntaxError,
				"unknown encoding: %s", tok->enc);
			return error_ret(tok);
		}
		str = PyString_AsString(utf8);
	}
#endif
	/* The PyString owning str (if any) is parked here so it stays
	   alive; PyTokenizer_Free drops it. */
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}
629
630#endif /* PGEN */
631
632/* Set up tokenizer for string */
633
634struct tok_state *
635PyTokenizer_FromString(const char *str)
636{
637 struct tok_state *tok = tok_new();
638 if (tok == NULL)
639 return NULL;
640 str = (char *)decode_str(str, tok);
641 if (str == NULL) {
642 PyTokenizer_Free(tok);
643 return NULL;
644 }
645
646 /* XXX: constify members. */
647 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
648 return tok;
649}
650
651
652/* Set up tokenizer for file */
653
654struct tok_state *
655PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
656{
657 struct tok_state *tok = tok_new();
658 if (tok == NULL)
659 return NULL;
660 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
661 PyTokenizer_Free(tok);
662 return NULL;
663 }
664 tok->cur = tok->inp = tok->buf;
665 tok->end = tok->buf + BUFSIZ;
666 tok->fp = fp;
667 tok->prompt = ps1;
668 tok->nextprompt = ps2;
669 return tok;
670}
671
672
673/* Free a tok_state structure */
674
/* Release a tok_state and everything it owns: the encoding string, the
   decoding objects, the input buffer (only for file input — for string
   input tok->buf aliases caller-owned memory), and the struct itself. */
void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	/* tok->buf is heap-owned only when reading from a file. */
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_FREE(tok->buf);
	PyMem_FREE(tok);
}
688
689#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* If interactive input really comes from sys.stdin and that file has
   an encoding, re-encode the readline result *INP from that encoding
   to UTF-8, replacing *INP in place, and record the encoding on TOK.
   Returns 0 on success or when no conversion applies (decode errors
   fall back silently to the original bytes), -1 on memory error with
   tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	/* Only act when the C-level stdin is actually sys.stdin. */
	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	/* Swap the converted text in for the original line. */
	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
747#endif
748
749/* Get next char, updating state; error code goes into tok->done */
750
/* Get the next character from TOK's input, advancing the read
   position.  Returns the character as a nonnegative int, or EOF on
   end of input or error (the specific code goes into tok->done).
   Input comes from a string (tok->fp == NULL), from interactive
   prompts (tok->prompt != NULL), or from a file. */
static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			/* String input: expose the next line by moving
			   tok->inp to just past its '\n'. */
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->line_start = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			/* Interactive input: fetch a line via readline. */
			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (newtok == NULL)
				tok->done = E_INTR;
			else if (*newtok == '\0') {
				PyMem_FREE(newtok);
				tok->done = E_EOF;
			}
#if !defined(PGEN) && defined(Py_USING_UNICODE)
			else if (tok_stdin_decode(tok, &newtok) != 0)
				PyMem_FREE(newtok);
#endif
			else if (tok->start != NULL) {
				/* A token straddles the line boundary:
				   append the new line to the existing
				   buffer so tok->start stays valid. */
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(newtok);
				char *buf = tok->buf;
				buf = (char *)PyMem_REALLOC(buf, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_FREE(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(newtok);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				tok->line_start = tok->cur;
				strcpy(tok->buf + oldlen, newtok);
				PyMem_FREE(newtok);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				/* Fresh line: the readline result simply
				   becomes the buffer. */
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_FREE(tok->buf);
				tok->buf = newtok;
				tok->line_start = tok->buf;
				tok->cur = tok->buf;
				tok->line_start = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			/* File input (possibly through a codec). */
			int done = 0;
			Py_ssize_t cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = (char *)
						PyMem_MALLOC(BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				/* Keep the current token: remember the
				   read offset so it can be restored after
				   any reallocation below. */
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				Py_ssize_t curstart = tok->start == NULL ? -1 :
					          tok->start - tok->buf;
				Py_ssize_t curvalid = tok->inp - tok->buf;
				Py_ssize_t newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				newbuf = (char *)PyMem_REALLOC(newbuf,
							       newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL
					 */
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}
918
919
920/* Back-up one character */
921
922static void
923tok_backup(register struct tok_state *tok, register int c)
924{
925 if (c != EOF) {
926 if (--tok->cur < tok->buf)
927 Py_FatalError("tok_backup: begin of buffer");
928 if (*tok->cur != c)
929 *tok->cur = c;
930 }
931}
932
933
934/* Return the token corresponding to a single character */
935
/* Map a single operator/delimiter character to its token code, or OP
   for any character with no one-character token.  The cases mirror the
   #defines in token.h. */
int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':       return AT;
	default:	return OP;
	}
}
967
968
969int
970PyToken_TwoChars(int c1, int c2)
971{
972 switch (c1) {
973 case '=':
974 switch (c2) {
975 case '=': return EQEQUAL;
976 }
977 break;
978 case '!':
979 switch (c2) {
980 case '=': return NOTEQUAL;
981 }
982 break;
983 case '<':
984 switch (c2) {
985 case '>': return NOTEQUAL;
986 case '=': return LESSEQUAL;
987 case '<': return LEFTSHIFT;
988 }
989 break;
990 case '>':
991 switch (c2) {
992 case '=': return GREATEREQUAL;
993 case '>': return RIGHTSHIFT;
994 }
995 break;
996 case '+':
997 switch (c2) {
998 case '=': return PLUSEQUAL;
999 }
1000 break;
1001 case '-':
1002 switch (c2) {
1003 case '=': return MINEQUAL;
1004 }
1005 break;
1006 case '*':
1007 switch (c2) {
1008 case '*': return DOUBLESTAR;
1009 case '=': return STAREQUAL;
1010 }
1011 break;
1012 case '/':
1013 switch (c2) {
1014 case '/': return DOUBLESLASH;
1015 case '=': return SLASHEQUAL;
1016 }
1017 break;
1018 case '|':
1019 switch (c2) {
1020 case '=': return VBAREQUAL;
1021 }
1022 break;
1023 case '%':
1024 switch (c2) {
1025 case '=': return PERCENTEQUAL;
1026 }
1027 break;
1028 case '&':
1029 switch (c2) {
1030 case '=': return AMPEREQUAL;
1031 }
1032 break;
1033 case '^':
1034 switch (c2) {
1035 case '=': return CIRCUMFLEXEQUAL;
1036 }
1037 break;
1038 }
1039 return OP;
1040}
1041
1042int
1043PyToken_ThreeChars(int c1, int c2, int c3)
1044{
1045 switch (c1) {
1046 case '<':
1047 switch (c2) {
1048 case '<':
1049 switch (c3) {
1050 case '=':
1051 return LEFTSHIFTEQUAL;
1052 }
1053 break;
1054 }
1055 break;
1056 case '>':
1057 switch (c2) {
1058 case '>':
1059 switch (c3) {
1060 case '=':
1061 return RIGHTSHIFTEQUAL;
1062 }
1063 break;
1064 }
1065 break;
1066 case '*':
1067 switch (c2) {
1068 case '*':
1069 switch (c3) {
1070 case '=':
1071 return DOUBLESTAREQUAL;
1072 }
1073 break;
1074 }
1075 break;
1076 case '/':
1077 switch (c2) {
1078 case '/':
1079 switch (c3) {
1080 case '=':
1081 return DOUBLESLASHEQUAL;
1082 }
1083 break;
1084 }
1085 break;
1086 }
1087 return OP;
1088}
1089
/* Handle an inconsistent tab/space indentation, according to the -t/-tt
   flags recorded on TOK.  With alterror set (-tt) it is fatal: tok->done
   becomes E_TABSPACE and 1 is returned so the caller emits ERRORTOKEN.
   With only altwarning set (-t) a warning is printed once per file and
   0 is returned (tokenization continues). */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                                  "in indentation\n", tok->filename);
		tok->altwarning = 0;	/* warn only once per file */
	}
	return 0;
}
1105
1106
1107/* Get next token, after space stripping etc. */
1108
1109static int
1110tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1111{
1112 register int c;
1113 int blankline;
1114
1115 *p_start = *p_end = NULL;
1116 nextline:
1117 tok->start = NULL;
1118 blankline = 0;
1119
1120 /* Get indentation level */
1121 if (tok->atbol) {
1122 register int col = 0;
1123 register int altcol = 0;
1124 tok->atbol = 0;
1125 for (;;) {
1126 c = tok_nextc(tok);
1127 if (c == ' ')
1128 col++, altcol++;
1129 else if (c == '\t') {
1130 col = (col/tok->tabsize + 1) * tok->tabsize;
1131 altcol = (altcol/tok->alttabsize + 1)
1132 * tok->alttabsize;
1133 }
1134 else if (c == '\014') /* Control-L (formfeed) */
1135 col = altcol = 0; /* For Emacs users */
1136 else
1137 break;
1138 }
1139 tok_backup(tok, c);
1140 if (c == '#' || c == '\n') {
1141 /* Lines with only whitespace and/or comments
1142 shouldn't affect the indentation and are
1143 not passed to the parser as NEWLINE tokens,
1144 except *totally* empty lines in interactive
1145 mode, which signal the end of a command group. */
1146 if (col == 0 && c == '\n' && tok->prompt != NULL)
1147 blankline = 0; /* Let it through */
1148 else
1149 blankline = 1; /* Ignore completely */
1150 /* We can't jump back right here since we still
1151 may need to skip to the end of a comment */
1152 }
1153 if (!blankline && tok->level == 0) {
1154 if (col == tok->indstack[tok->indent]) {
1155 /* No change */
1156 if (altcol != tok->altindstack[tok->indent]) {
1157 if (indenterror(tok))
1158 return ERRORTOKEN;
1159 }
1160 }
1161 else if (col > tok->indstack[tok->indent]) {
1162 /* Indent -- always one */
1163 if (tok->indent+1 >= MAXINDENT) {
1164 tok->done = E_TOODEEP;
1165 tok->cur = tok->inp;
1166 return ERRORTOKEN;
1167 }
1168 if (altcol <= tok->altindstack[tok->indent]) {
1169 if (indenterror(tok))
1170 return ERRORTOKEN;
1171 }
1172 tok->pendin++;
1173 tok->indstack[++tok->indent] = col;
1174 tok->altindstack[tok->indent] = altcol;
1175 }
1176 else /* col < tok->indstack[tok->indent] */ {
1177 /* Dedent -- any number, must be consistent */
1178 while (tok->indent > 0 &&
1179 col < tok->indstack[tok->indent]) {
1180 tok->pendin--;
1181 tok->indent--;
1182 }
1183 if (col != tok->indstack[tok->indent]) {
1184 tok->done = E_DEDENT;
1185 tok->cur = tok->inp;
1186 return ERRORTOKEN;
1187 }
1188 if (altcol != tok->altindstack[tok->indent]) {
1189 if (indenterror(tok))
1190 return ERRORTOKEN;
1191 }
1192 }
1193 }
1194 }
1195
1196 tok->start = tok->cur;
1197
1198 /* Return pending indents/dedents */
1199 if (tok->pendin != 0) {
1200 if (tok->pendin < 0) {
1201 tok->pendin++;
1202 return DEDENT;
1203 }
1204 else {
1205 tok->pendin--;
1206 return INDENT;
1207 }
1208 }
1209
1210 again:
1211 tok->start = NULL;
1212 /* Skip spaces */
1213 do {
1214 c = tok_nextc(tok);
1215 } while (c == ' ' || c == '\t' || c == '\014');
1216
1217 /* Set start of current token */
1218 tok->start = tok->cur - 1;
1219
1220 /* Skip comment, while looking for tab-setting magic */
1221 if (c == '#') {
1222 static char *tabforms[] = {
1223 "tab-width:", /* Emacs */
1224 ":tabstop=", /* vim, full form */
1225 ":ts=", /* vim, abbreviated form */
1226 "set tabsize=", /* will vi never die? */
1227 /* more templates can be added here to support other editors */
1228 };
1229 char cbuf[80];
1230 char *tp, **cp;
1231 tp = cbuf;
1232 do {
1233 *tp++ = c = tok_nextc(tok);
1234 } while (c != EOF && c != '\n' &&
1235 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1236 *tp = '\0';
1237 for (cp = tabforms;
1238 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1239 cp++) {
1240 if ((tp = strstr(cbuf, *cp))) {
1241 int newsize = atoi(tp + strlen(*cp));
1242
1243 if (newsize >= 1 && newsize <= 40) {
1244 tok->tabsize = newsize;
1245 if (Py_VerboseFlag)
1246 PySys_WriteStderr(
1247 "Tab size set to %d\n",
1248 newsize);
1249 }
1250 }
1251 }
1252 while (c != EOF && c != '\n')
1253 c = tok_nextc(tok);
1254 }
1255
1256 /* Check for EOF and errors now */
1257 if (c == EOF) {
1258 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1259 }
1260
1261 /* Identifier (most frequent token!) */
1262 if (isalpha(c) || c == '_') {
1263 /* Process r"", u"" and ur"" */
1264 switch (c) {
1265 case 'r':
1266 case 'R':
1267 c = tok_nextc(tok);
1268 if (c == '"' || c == '\'')
1269 goto letter_quote;
1270 break;
1271 case 'u':
1272 case 'U':
1273 c = tok_nextc(tok);
1274 if (c == 'r' || c == 'R')
1275 c = tok_nextc(tok);
1276 if (c == '"' || c == '\'')
1277 goto letter_quote;
1278 break;
1279 }
1280 while (isalnum(c) || c == '_') {
1281 c = tok_nextc(tok);
1282 }
1283 tok_backup(tok, c);
1284 *p_start = tok->start;
1285 *p_end = tok->cur;
1286 return NAME;
1287 }
1288
1289 /* Newline */
1290 if (c == '\n') {
1291 tok->atbol = 1;
1292 if (blankline || tok->level > 0)
1293 goto nextline;
1294 *p_start = tok->start;
1295 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1296 tok->cont_line = 0;
1297 return NEWLINE;
1298 }
1299
1300 /* Period or number starting with period? */
1301 if (c == '.') {
1302 c = tok_nextc(tok);
1303 if (isdigit(c)) {
1304 goto fraction;
1305 }
1306 else {
1307 tok_backup(tok, c);
1308 *p_start = tok->start;
1309 *p_end = tok->cur;
1310 return DOT;
1311 }
1312 }
1313
1314 /* Number */
1315 if (isdigit(c)) {
1316 if (c == '0') {
1317 /* Hex or octal -- maybe. */
1318 c = tok_nextc(tok);
1319 if (c == '.')
1320 goto fraction;
1321#ifndef WITHOUT_COMPLEX
1322 if (c == 'j' || c == 'J')
1323 goto imaginary;
1324#endif
1325 if (c == 'x' || c == 'X') {
1326 /* Hex */
1327 do {
1328 c = tok_nextc(tok);
1329 } while (isxdigit(c));
1330 }
1331 else {
1332 int found_decimal = 0;
1333 /* Octal; c is first char of it */
1334 /* There's no 'isoctdigit' macro, sigh */
1335 while ('0' <= c && c < '8') {
1336 c = tok_nextc(tok);
1337 }
1338 if (isdigit(c)) {
1339 found_decimal = 1;
1340 do {
1341 c = tok_nextc(tok);
1342 } while (isdigit(c));
1343 }
1344 if (c == '.')
1345 goto fraction;
1346 else if (c == 'e' || c == 'E')
1347 goto exponent;
1348#ifndef WITHOUT_COMPLEX
1349 else if (c == 'j' || c == 'J')
1350 goto imaginary;
1351#endif
1352 else if (found_decimal) {
1353 tok->done = E_TOKEN;
1354 tok_backup(tok, c);
1355 return ERRORTOKEN;
1356 }
1357 }
1358 if (c == 'l' || c == 'L')
1359 c = tok_nextc(tok);
1360 }
1361 else {
1362 /* Decimal */
1363 do {
1364 c = tok_nextc(tok);
1365 } while (isdigit(c));
1366 if (c == 'l' || c == 'L')
1367 c = tok_nextc(tok);
1368 else {
1369 /* Accept floating point numbers. */
1370 if (c == '.') {
1371 fraction:
1372 /* Fraction */
1373 do {
1374 c = tok_nextc(tok);
1375 } while (isdigit(c));
1376 }
1377 if (c == 'e' || c == 'E') {
1378 exponent:
1379 /* Exponent part */
1380 c = tok_nextc(tok);
1381 if (c == '+' || c == '-')
1382 c = tok_nextc(tok);
1383 if (!isdigit(c)) {
1384 tok->done = E_TOKEN;
1385 tok_backup(tok, c);
1386 return ERRORTOKEN;
1387 }
1388 do {
1389 c = tok_nextc(tok);
1390 } while (isdigit(c));
1391 }
1392#ifndef WITHOUT_COMPLEX
1393 if (c == 'j' || c == 'J')
1394 /* Imaginary part */
1395 imaginary:
1396 c = tok_nextc(tok);
1397#endif
1398 }
1399 }
1400 tok_backup(tok, c);
1401 *p_start = tok->start;
1402 *p_end = tok->cur;
1403 return NUMBER;
1404 }
1405
1406 letter_quote:
1407 /* String */
1408 if (c == '\'' || c == '"') {
1409 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1410 int quote = c;
1411 int triple = 0;
1412 int tripcount = 0;
1413 for (;;) {
1414 c = tok_nextc(tok);
1415 if (c == '\n') {
1416 if (!triple) {
1417 tok->done = E_EOLS;
1418 tok_backup(tok, c);
1419 return ERRORTOKEN;
1420 }
1421 tripcount = 0;
1422 tok->cont_line = 1; /* multiline string. */
1423 }
1424 else if (c == EOF) {
1425 if (triple)
1426 tok->done = E_EOFS;
1427 else
1428 tok->done = E_EOLS;
1429 tok->cur = tok->inp;
1430 return ERRORTOKEN;
1431 }
1432 else if (c == quote) {
1433 tripcount++;
1434 if (tok->cur - tok->start == quote2) {
1435 c = tok_nextc(tok);
1436 if (c == quote) {
1437 triple = 1;
1438 tripcount = 0;
1439 continue;
1440 }
1441 tok_backup(tok, c);
1442 }
1443 if (!triple || tripcount == 3)
1444 break;
1445 }
1446 else if (c == '\\') {
1447 tripcount = 0;
1448 c = tok_nextc(tok);
1449 if (c == EOF) {
1450 tok->done = E_EOLS;
1451 tok->cur = tok->inp;
1452 return ERRORTOKEN;
1453 }
1454 }
1455 else
1456 tripcount = 0;
1457 }
1458 *p_start = tok->start;
1459 *p_end = tok->cur;
1460 return STRING;
1461 }
1462
1463 /* Line continuation */
1464 if (c == '\\') {
1465 c = tok_nextc(tok);
1466 if (c != '\n') {
1467 tok->done = E_LINECONT;
1468 tok->cur = tok->inp;
1469 return ERRORTOKEN;
1470 }
1471 tok->cont_line = 1;
1472 goto again; /* Read next line */
1473 }
1474
1475 /* Check for two-character token */
1476 {
1477 int c2 = tok_nextc(tok);
1478 int token = PyToken_TwoChars(c, c2);
1479 if (token != OP) {
1480 int c3 = tok_nextc(tok);
1481 int token3 = PyToken_ThreeChars(c, c2, c3);
1482 if (token3 != OP) {
1483 token = token3;
1484 } else {
1485 tok_backup(tok, c3);
1486 }
1487 *p_start = tok->start;
1488 *p_end = tok->cur;
1489 return token;
1490 }
1491 tok_backup(tok, c2);
1492 }
1493
1494 /* Keep track of parentheses nesting level */
1495 switch (c) {
1496 case '(':
1497 case '[':
1498 case '{':
1499 tok->level++;
1500 break;
1501 case ')':
1502 case ']':
1503 case '}':
1504 tok->level--;
1505 break;
1506 }
1507
1508 /* Punctuation character */
1509 *p_start = tok->start;
1510 *p_end = tok->cur;
1511 return PyToken_OneChar(c);
1512}
1513
1514int
1515PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1516{
1517 int result = tok_get(tok, p_start, p_end);
1518 if (tok->decoding_erred) {
1519 result = ERRORTOKEN;
1520 tok->done = E_DECODE;
1521 }
1522 return result;
1523}
1524
1525#ifdef Py_DEBUG
1526
1527void
1528tok_dump(int type, char *start, char *end)
1529{
1530 printf("%s", _PyParser_TokenNames[type]);
1531 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1532 printf("(%.*s)", (int)(end - start), start);
1533}
1534
1535#endif
Note: See TracBrowser for help on using the repository browser.