Changeset 391 for python/trunk/Parser/tokenizer.c
- Timestamp:
- Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location:
- python/trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
python/trunk
-
Property svn:mergeinfo
set to
/python/vendor/Python-2.7.6 merged eligible /python/vendor/current merged eligible
-
Property svn:mergeinfo
set to
-
python/trunk/Parser/tokenizer.c
r105 r391 36 36 37 37 char *_PyParser_TokenNames[] = { 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 38 "ENDMARKER", 39 "NAME", 40 "NUMBER", 41 "STRING", 42 "NEWLINE", 43 "INDENT", 44 "DEDENT", 45 "LPAR", 46 "RPAR", 47 "LSQB", 48 "RSQB", 49 "COLON", 50 "COMMA", 51 "SEMI", 52 "PLUS", 53 "MINUS", 54 "STAR", 55 "SLASH", 56 "VBAR", 57 "AMPER", 58 "LESS", 59 "GREATER", 60 "EQUAL", 61 "DOT", 62 "PERCENT", 63 "BACKQUOTE", 64 "LBRACE", 65 "RBRACE", 66 "EQEQUAL", 67 "NOTEQUAL", 68 "LESSEQUAL", 69 "GREATEREQUAL", 70 "TILDE", 71 "CIRCUMFLEX", 72 "LEFTSHIFT", 73 "RIGHTSHIFT", 74 "DOUBLESTAR", 75 "PLUSEQUAL", 76 "MINEQUAL", 77 "STAREQUAL", 78 "SLASHEQUAL", 79 "PERCENTEQUAL", 80 "AMPEREQUAL", 81 "VBAREQUAL", 82 "CIRCUMFLEXEQUAL", 83 "LEFTSHIFTEQUAL", 84 "RIGHTSHIFTEQUAL", 85 "DOUBLESTAREQUAL", 86 "DOUBLESLASH", 87 "DOUBLESLASHEQUAL", 88 "AT", 89 /* This table must match the #defines in token.h! */ 90 "OP", 91 "<ERRORTOKEN>", 92 "<N_TOKENS>" 93 93 }; 94 95 94 96 95 /* Create and initialize a new tok_state structure */ … … 99 98 tok_new(void) 100 99 { 101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 102 sizeof(struct tok_state)); 103 if (tok == NULL) 104 return NULL; 105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 106 tok->done = E_OK; 107 tok->fp = NULL; 108 tok->tabsize = TABSIZE; 109 tok->indent = 0; 110 tok->indstack[0] = 0; 111 tok->atbol = 1; 112 tok->pendin = 0; 113 tok->prompt = tok->nextprompt = NULL; 114 tok->lineno = 0; 115 tok->level = 0; 116 tok->filename = NULL; 117 tok->altwarning = 0; 118 tok->alterror = 0; 119 tok->alttabsize = 1; 120 tok->altindstack[0] = 0; 121 tok->decoding_state = 0; 122 tok->decoding_erred = 0; 123 tok->read_coding_spec = 0; 124 tok->encoding = NULL; 125 tok->cont_line = 0; 100 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC( 101 sizeof(struct tok_state)); 102 if (tok == NULL) 103 return NULL; 104 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; 105 tok->done = E_OK; 106 tok->fp = NULL; 107 tok->input = NULL; 108 tok->tabsize = TABSIZE; 109 tok->indent = 0; 110 tok->indstack[0] = 0; 111 tok->atbol = 1; 112 tok->pendin = 0; 113 tok->prompt = tok->nextprompt = NULL; 114 tok->lineno = 0; 115 tok->level = 0; 116 tok->filename = NULL; 117 tok->altwarning = 0; 118 tok->alterror = 0; 119 tok->alttabsize = 1; 120 tok->altindstack[0] = 0; 121 tok->decoding_state = 0; 122 tok->decoding_erred = 0; 123 tok->read_coding_spec = 0; 124 tok->encoding = NULL; 125 tok->cont_line = 0; 126 126 #ifndef PGEN 127 128 127 tok->decoding_readline = NULL; 128 tok->decoding_buffer = NULL; 129 129 #endif 130 return tok; 130 return tok; 131 } 132 133 static char * 134 new_string(const char *s, Py_ssize_t len) 135 { 136 char* result = (char *)PyMem_MALLOC(len + 1); 137 if (result != NULL) { 138 memcpy(result, s, len); 139 result[len] = '\0'; 140 } 141 return result; 131 142 } 132 143 … … 136 147 decoding_fgets(char *s, int size, struct tok_state *tok) 137 148 { 138 149 return fgets(s, size, tok->fp); 139 150 } 140 151 … … 142 153 decoding_feof(struct tok_state *tok) 143 154 { 144 145 } 146 147 static c onst char *148 decode_str(const char *str, struct tok_state *tok)149 { 150 return str;155 return feof(tok->fp); 156 } 157 158 static char * 159 decode_str(const char *str, int exec_input, struct tok_state *tok) 160 { 161 return new_string(str, strlen(str)); 151 162 } 152 163 … … 156 167 error_ret(struct tok_state *tok) /* XXX */ 157 168 { 158 tok->decoding_erred = 1; 159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 160 PyMem_FREE(tok->buf); 161 tok->buf = NULL; 162 return NULL; /* as if it were EOF */ 163 } 169 tok->decoding_erred = 1; 170 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ 171 PyMem_FREE(tok->buf); 172 tok->buf = NULL; 173 return NULL; /* as if it were EOF */ 174 } 175 164 176 165 177 static char * 166 new_string(const char *s, Py_ssize_t len) 167 { 168 char* result = (char *)PyMem_MALLOC(len + 1); 169 if (result != NULL) { 170 memcpy(result, s, len); 171 result[len] = '\0'; 172 } 173 return result; 174 } 175 176 static char * 177 get_normal_name(char *s) /* for utf-8 and latin-1 */ 178 { 179 char buf[13]; 180 int i; 181 for (i = 0; i < 12; i++) { 182 int c = s[i]; 183 if (c == '\0') break; 184 else if (c == '_') buf[i] = '-'; 185 else buf[i] = tolower(c); 186 } 187 buf[i] = '\0'; 188 if (strcmp(buf, "utf-8") == 0 || 189 strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; 190 else if (strcmp(buf, "latin-1") == 0 || 191 strcmp(buf, "iso-8859-1") == 0 || 192 strcmp(buf, "iso-latin-1") == 0 || 193 strncmp(buf, "latin-1-", 8) == 0 || 194 strncmp(buf, "iso-8859-1-", 11) == 0 || 195 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; 196 else return s; 178 get_normal_name(char *s) /* for utf-8 and latin-1 */ 179 { 180 char buf[13]; 181 int i; 182 for (i = 0; i < 12; i++) { 183 int c = s[i]; 184 if (c == '\0') 185 break; 186 else if (c == '_') 187 buf[i] = '-'; 188 else 189 buf[i] = tolower(c); 190 } 191 buf[i] = '\0'; 192 if (strcmp(buf, "utf-8") == 0 || 193 strncmp(buf, "utf-8-", 6) == 0) 194 return "utf-8"; 195 else if (strcmp(buf, "latin-1") == 0 || 196 strcmp(buf, "iso-8859-1") == 0 || 197 strcmp(buf, "iso-latin-1") == 0 || 198 strncmp(buf, "latin-1-", 8) == 0 || 199 strncmp(buf, "iso-8859-1-", 11) == 0 || 200 strncmp(buf, "iso-latin-1-", 12) == 0) 201 return "iso-8859-1"; 202 else 203 return s; 197 204 } 198 205 … … 202 209 get_coding_spec(const char *s, Py_ssize_t size) 203 210 { 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 while (isalnum(Py_CHARMASK(t[0])) ||226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 211 Py_ssize_t i; 212 /* Coding spec must be in a comment, and that comment must be 213 * the only statement on the source code line. */ 214 for (i = 0; i < size - 6; i++) { 215 if (s[i] == '#') 216 break; 217 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 218 return NULL; 219 } 220 for (; i < size - 6; i++) { /* XXX inefficient search */ 221 const char* t = s + i; 222 if (strncmp(t, "coding", 6) == 0) { 223 const char* begin = NULL; 224 t += 6; 225 if (t[0] != ':' && t[0] != '=') 226 continue; 227 do { 228 t++; 229 } while (t[0] == '\x20' || t[0] == '\t'); 230 231 begin = t; 232 while (Py_ISALNUM(t[0]) || 233 t[0] == '-' || t[0] == '_' || t[0] == '.') 234 t++; 235 236 if (begin < t) { 237 char* r = new_string(begin, t - begin); 238 char* q = get_normal_name(r); 239 if (r != q) { 240 PyMem_FREE(r); 241 r = new_string(q, strlen(q)); 242 } 243 return r; 244 } 245 } 246 } 247 return NULL; 241 248 } 242 249 … … 248 255 static int 249 256 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 250 251 { 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 257 int set_readline(struct tok_state *, const char *)) 258 { 259 char * cs; 260 int r = 1; 261 262 if (tok->cont_line) 263 /* It's a continuation line, so it can't be a coding spec. */ 264 return 1; 265 cs = get_coding_spec(line, size); 266 if (cs != NULL) { 267 tok->read_coding_spec = 1; 268 if (tok->encoding == NULL) { 269 assert(tok->decoding_state == 1); /* raw */ 270 if (strcmp(cs, "utf-8") == 0 || 271 strcmp(cs, "iso-8859-1") == 0) { 272 tok->encoding = cs; 273 } else { 267 274 #ifdef Py_USING_UNICODE 268 r = set_readline(tok, cs); 269 if (r) { 270 tok->encoding = cs; 271 tok->decoding_state = -1; 272 } 273 else 274 PyMem_FREE(cs); 275 r = set_readline(tok, cs); 276 if (r) { 277 tok->encoding = cs; 278 tok->decoding_state = -1; 279 } 280 else { 281 PyErr_Format(PyExc_SyntaxError, 282 "encoding problem: %s", cs); 283 PyMem_FREE(cs); 284 } 275 285 #else 276 277 278 279 280 286 /* Without Unicode support, we cannot 287 process the coding spec. Since there 288 won't be any Unicode literals, that 289 won't matter. */ 290 PyMem_FREE(cs); 281 291 #endif 282 } 283 } else { /* then, compare cs with BOM */ 284 r = (strcmp(tok->encoding, cs) == 0); 285 PyMem_FREE(cs); 286 } 287 } 288 if (!r) { 289 cs = tok->encoding; 290 if (!cs) 291 cs = "with BOM"; 292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); 293 } 294 return r; 292 } 293 } else { /* then, compare cs with BOM */ 294 r = (strcmp(tok->encoding, cs) == 0); 295 if (!r) 296 PyErr_Format(PyExc_SyntaxError, 297 "encoding problem: %s with BOM", cs); 298 PyMem_FREE(cs); 299 } 300 } 301 return r; 295 302 } 296 303 … … 301 308 static int 302 309 check_bom(int get_char(struct tok_state *), 303 void unget_char(int, struct tok_state *), 304 int set_readline(struct tok_state *, const char *), 305 struct tok_state *tok) 306 { 307 int ch = get_char(tok); 308 tok->decoding_state = 1; 309 if (ch == EOF) { 310 return 1; 311 } else if (ch == 0xEF) { 312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; 313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; 310 void unget_char(int, struct tok_state *), 311 int set_readline(struct tok_state *, const char *), 312 struct tok_state *tok) 313 { 314 int ch1, ch2, ch3; 315 ch1 = get_char(tok); 316 tok->decoding_state = 1; 317 if (ch1 == EOF) { 318 return 1; 319 } else if (ch1 == 0xEF) { 320 ch2 = get_char(tok); 321 if (ch2 != 0xBB) { 322 unget_char(ch2, tok); 323 unget_char(ch1, tok); 324 return 1; 325 } 326 ch3 = get_char(tok); 327 if (ch3 != 0xBF) { 328 unget_char(ch3, tok); 329 unget_char(ch2, tok); 330 unget_char(ch1, tok); 331 return 1; 332 } 314 333 #if 0 315 /* Disable support for UTF-16 BOMs until a decision 316 is made whether this needs to be supported. */ 317 } else if (ch == 0xFE) { 318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; 319 if (!set_readline(tok, "utf-16-be")) return 0; 320 tok->decoding_state = -1; 321 } else if (ch == 0xFF) { 322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; 323 if (!set_readline(tok, "utf-16-le")) return 0; 324 tok->decoding_state = -1; 334 /* Disable support for UTF-16 BOMs until a decision 335 is made whether this needs to be supported. */ 336 } else if (ch1 == 0xFE) { 337 ch2 = get_char(tok); 338 if (ch2 != 0xFF) { 339 unget_char(ch2, tok); 340 unget_char(ch1, tok); 341 return 1; 342 } 343 if (!set_readline(tok, "utf-16-be")) 344 return 0; 345 tok->decoding_state = -1; 346 } else if (ch1 == 0xFF) { 347 ch2 = get_char(tok); 348 if (ch2 != 0xFE) { 349 unget_char(ch2, tok); 350 unget_char(ch1, tok); 351 return 1; 352 } 353 if (!set_readline(tok, "utf-16-le")) 354 return 0; 355 tok->decoding_state = -1; 325 356 #endif 326 } else { 327 unget_char(ch, tok); 328 return 1; 329 } 330 if (tok->encoding != NULL) 331 PyMem_FREE(tok->encoding); 332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 333 return 1; 334 NON_BOM: 335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ 336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */ 337 return 1; 357 } else { 358 unget_char(ch1, tok); 359 return 1; 360 } 361 if (tok->encoding != NULL) 362 PyMem_FREE(tok->encoding); 363 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ 364 return 1; 338 365 } 339 366 … … 344 371 1) NULL: need to call tok->decoding_readline to get a new line 345 372 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 346 373 stored the result in tok->decoding_buffer 347 374 3) PyStringObject *: previous call to fp_readl did not have enough room 348 349 350 351 352 375 (in the s buffer) to copy entire contents of the line read 376 by tok->decoding_readline. tok->decoding_buffer has the overflow. 377 In this case, fp_readl is called in a loop (with an expanded buffer) 378 until the buffer ends with a '\n' (or until the end of the file is 379 reached): see tok_nextc and its calls to decoding_fgets. 353 380 */ 354 381 … … 357 384 { 358 385 #ifndef Py_USING_UNICODE 359 360 361 386 /* In a non-Unicode built, this should never be called. */ 387 Py_FatalError("fp_readl should not be called in this build."); 388 return NULL; /* Keep compiler happy (not reachable) */ 362 389 #else 363 PyObject* utf8 = NULL; 364 PyObject* buf = tok->decoding_buffer; 365 char *str; 366 Py_ssize_t utf8len; 367 368 /* Ask for one less byte so we can terminate it */ 369 assert(size > 0); 370 size--; 371 372 if (buf == NULL) { 373 buf = PyObject_CallObject(tok->decoding_readline, NULL); 374 if (buf == NULL) 375 return error_ret(tok); 376 } else { 377 tok->decoding_buffer = NULL; 378 if (PyString_CheckExact(buf)) 379 utf8 = buf; 380 } 381 if (utf8 == NULL) { 382 utf8 = PyUnicode_AsUTF8String(buf); 383 Py_DECREF(buf); 384 if (utf8 == NULL) 385 return error_ret(tok); 386 } 387 str = PyString_AsString(utf8); 388 utf8len = PyString_GET_SIZE(utf8); 389 if (utf8len > size) { 390 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 391 if (tok->decoding_buffer == NULL) { 392 Py_DECREF(utf8); 393 return error_ret(tok); 394 } 395 utf8len = size; 396 } 397 memcpy(s, str, utf8len); 398 s[utf8len] = '\0'; 399 Py_DECREF(utf8); 400 if (utf8len == 0) return NULL; /* EOF */ 401 return s; 390 PyObject* utf8 = NULL; 391 PyObject* buf = tok->decoding_buffer; 392 char *str; 393 Py_ssize_t utf8len; 394 395 /* Ask for one less byte so we can terminate it */ 396 assert(size > 0); 397 size--; 398 399 if (buf == NULL) { 400 buf = PyObject_CallObject(tok->decoding_readline, NULL); 401 if (buf == NULL) 402 return error_ret(tok); 403 } else { 404 tok->decoding_buffer = NULL; 405 if (PyString_CheckExact(buf)) 406 utf8 = buf; 407 } 408 if (utf8 == NULL) { 409 utf8 = PyUnicode_AsUTF8String(buf); 410 Py_DECREF(buf); 411 if (utf8 == NULL) 412 return error_ret(tok); 413 } 414 str = PyString_AsString(utf8); 415 utf8len = PyString_GET_SIZE(utf8); 416 if (utf8len > size) { 417 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); 418 if (tok->decoding_buffer == NULL) { 419 Py_DECREF(utf8); 420 return error_ret(tok); 421 } 422 utf8len = size; 423 } 424 memcpy(s, str, utf8len); 425 s[utf8len] = '\0'; 426 Py_DECREF(utf8); 427 if (utf8len == 0) 428 return NULL; /* EOF */ 429 return s; 402 430 #endif 403 431 } … … 416 444 fp_setreadl(struct tok_state *tok, const char* enc) 417 445 { 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 446 PyObject *reader, *stream, *readline; 447 448 /* XXX: constify filename argument. */ 449 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); 450 if (stream == NULL) 451 return 0; 452 453 reader = PyCodec_StreamReader(enc, stream, NULL); 454 Py_DECREF(stream); 455 if (reader == NULL) 456 return 0; 457 458 readline = PyObject_GetAttrString(reader, "readline"); 459 Py_DECREF(reader); 460 if (readline == NULL) 461 return 0; 462 463 tok->decoding_readline = readline; 464 return 1; 437 465 } 438 466 … … 440 468 441 469 static int fp_getc(struct tok_state *tok) { 442 470 return getc(tok->fp); 443 471 } 444 472 … … 446 474 447 475 static void fp_ungetc(int c, struct tok_state *tok) { 448 476 ungetc(c, tok->fp); 449 477 } 450 478 … … 455 483 decoding_fgets(char *s, int size, struct tok_state *tok) 456 484 { 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 485 char *line = NULL; 486 int badchar = 0; 487 for (;;) { 488 if (tok->decoding_state < 0) { 489 /* We already have a codec associated with 490 this input. */ 491 line = fp_readl(s, size, tok); 492 break; 493 } else if (tok->decoding_state > 0) { 494 /* We want a 'raw' read. */ 495 line = Py_UniversalNewlineFgets(s, size, 496 tok->fp, NULL); 497 break; 498 } else { 499 /* We have not yet determined the encoding. 500 If an encoding is found, use the file-pointer 501 reader functions from now on. */ 502 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) 503 return error_ret(tok); 504 assert(tok->decoding_state != 0); 505 } 506 } 507 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { 508 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { 509 return error_ret(tok); 510 } 511 } 484 512 #ifndef PGEN 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 513 /* The default encoding is ASCII, so make sure we don't have any 514 non-ASCII bytes in it. */ 515 if (line && !tok->encoding) { 516 unsigned char *c; 517 for (c = (unsigned char *)line; *c; c++) 518 if (*c > 127) { 519 badchar = *c; 520 break; 521 } 522 } 523 if (badchar) { 524 char buf[500]; 525 /* Need to add 1 to the line number, since this line 526 has not been counted, yet. */ 527 sprintf(buf, 528 "Non-ASCII character '\\x%.2x' " 529 "in file %.200s on line %i, " 530 "but no encoding declared; " 531 "see http://www.python.org/peps/pep-0263.html for details", 532 badchar, tok->filename, tok->lineno + 1); 533 PyErr_SetString(PyExc_SyntaxError, buf); 534 return error_ret(tok); 535 } 508 536 #endif 509 537 return line; 510 538 } 511 539 … … 513 541 decoding_feof(struct tok_state *tok) 514 542 { 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 543 if (tok->decoding_state >= 0) { 544 return feof(tok->fp); 545 } else { 546 PyObject* buf = tok->decoding_buffer; 547 if (buf == NULL) { 548 buf = PyObject_CallObject(tok->decoding_readline, NULL); 549 if (buf == NULL) { 550 error_ret(tok); 551 return 1; 552 } else { 553 tok->decoding_buffer = buf; 554 } 555 } 556 return PyObject_Length(buf) == 0; 557 } 530 558 } 531 559 … … 534 562 static int 535 563 buf_getc(struct tok_state *tok) { 536 564 return Py_CHARMASK(*tok->str++); 537 565 } 538 566 … … 541 569 static void 542 570 buf_ungetc(int c, struct tok_state *tok) { 543 544 assert(Py_CHARMASK(*tok->str) == c);/* tok->cur may point to read-only segment */571 tok->str--; 572 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 545 573 } 546 574 … … 550 578 static int 551 579 buf_setreadl(struct tok_state *tok, const char* enc) { 552 553 580 tok->enc = enc; 581 return 1; 554 582 } 555 583 … … 560 588 static PyObject * 561 589 translate_into_utf8(const char* str, const char* enc) { 562 563 564 565 566 567 568 590 PyObject *utf8; 591 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 592 if (buf == NULL) 593 return NULL; 594 utf8 = PyUnicode_AsUTF8String(buf); 595 Py_DECREF(buf); 596 return utf8; 569 597 } 570 598 #endif 599 600 601 static char * 602 translate_newlines(const char *s, int exec_input, struct tok_state *tok) { 603 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; 604 char *buf, *current; 605 char c = '\0'; 606 buf = PyMem_MALLOC(needed_length); 607 if (buf == NULL) { 608 tok->done = E_NOMEM; 609 return NULL; 610 } 611 for (current = buf; *s; s++, current++) { 612 c = *s; 613 if (skip_next_lf) { 614 skip_next_lf = 0; 615 if (c == '\n') { 616 c = *++s; 617 if (!c) 618 break; 619 } 620 } 621 if (c == '\r') { 622 skip_next_lf = 1; 623 c = '\n'; 624 } 625 *current = c; 626 } 627 /* If this is exec input, add a newline to the end of the string if 628 there isn't one already. */ 629 if (exec_input && c != '\n') { 630 *current = '\n'; 631 current++; 632 } 633 *current = '\0'; 634 final_length = current - buf + 1; 635 if (final_length < needed_length && final_length) 636 /* should never fail */ 637 buf = PyMem_REALLOC(buf, final_length); 638 return buf; 639 } 571 640 572 641 /* Decode a byte string STR for use as the buffer of TOK. … … 575 644 576 645 static const char * 577 decode_str(const char *str, struct tok_state *tok) 578 { 579 PyObject* utf8 = NULL; 580 const char *s; 581 const char *newl[2] = {NULL, NULL}; 582 int lineno = 0; 583 tok->enc = NULL; 584 tok->str = str; 585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 586 return error_ret(tok); 587 str = tok->str; /* string after BOM if any */ 588 assert(str); 646 decode_str(const char *input, int single, struct tok_state *tok) 647 { 648 PyObject* utf8 = NULL; 649 const char *str; 650 const char *s; 651 const char *newl[2] = {NULL, NULL}; 652 int lineno = 0; 653 tok->input = str = translate_newlines(input, single, tok); 654 if (str == NULL) 655 return NULL; 656 tok->enc = NULL; 657 tok->str = str; 658 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 659 return error_ret(tok); 660 str = tok->str; /* string after BOM if any */ 661 assert(str); 589 662 #ifdef Py_USING_UNICODE 590 591 592 593 594 595 663 if (tok->enc != NULL) { 664 utf8 = translate_into_utf8(str, tok->enc); 665 if (utf8 == NULL) 666 return error_ret(tok); 667 str = PyString_AsString(utf8); 668 } 596 669 #endif 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 670 for (s = str;; s++) { 671 if (*s == '\0') break; 672 else if (*s == '\n') { 673 assert(lineno < 2); 674 newl[lineno] = s; 675 lineno++; 676 if (lineno == 2) break; 677 } 678 } 679 tok->enc = NULL; 680 /* need to check line 1 and 2 separately since check_coding_spec 681 assumes a single line as input */ 682 if (newl[0]) { 683 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) 684 return error_ret(tok); 685 if (tok->enc == NULL && newl[1]) { 686 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], 687 tok, buf_setreadl)) 688 return error_ret(tok); 689 } 690 } 618 691 #ifdef Py_USING_UNICODE 619 if (tok->enc != NULL) { 620 assert(utf8 == NULL); 621 utf8 = translate_into_utf8(str, tok->enc); 622 if (utf8 == NULL) { 623 PyErr_Format(PyExc_SyntaxError, 624 "unknown encoding: %s", tok->enc); 625 return error_ret(tok); 626 } 627 str = PyString_AsString(utf8); 628 } 692 if (tok->enc != NULL) { 693 assert(utf8 == NULL); 694 utf8 = translate_into_utf8(str, tok->enc); 695 if (utf8 == NULL) 696 return error_ret(tok); 697 str = PyString_AsString(utf8); 698 } 629 699 #endif 630 631 632 700 assert(tok->decoding_buffer == NULL); 701 tok->decoding_buffer = utf8; /* CAUTION */ 702 return str; 633 703 } 634 704 … … 638 708 639 709 struct tok_state * 640 PyTokenizer_FromString(const char *str )641 { 642 643 644 645 str = (char *)decode_str(str, tok);646 647 648 649 650 651 652 653 710 PyTokenizer_FromString(const char *str, int exec_input) 711 { 712 struct tok_state *tok = tok_new(); 713 if (tok == NULL) 714 return NULL; 715 str = (char *)decode_str(str, exec_input, tok); 716 if (str == NULL) { 717 PyTokenizer_Free(tok); 718 return NULL; 719 } 720 721 /* XXX: constify members. */ 722 tok->buf = tok->cur = tok->end = tok->inp = (char*)str; 723 return tok; 654 724 } 655 725 … … 660 730 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) 661 731 { 662 663 664 665 666 667 668 669 670 671 672 673 674 732 struct tok_state *tok = tok_new(); 733 if (tok == NULL) 734 return NULL; 735 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { 736 PyTokenizer_Free(tok); 737 return NULL; 738 } 739 tok->cur = tok->inp = tok->buf; 740 tok->end = tok->buf + BUFSIZ; 741 tok->fp = fp; 742 tok->prompt = ps1; 743 tok->nextprompt = ps2; 744 return tok; 675 745 } 676 746 … … 681 751 PyTokenizer_Free(struct tok_state *tok) 682 752 { 683 684 753 if (tok->encoding != NULL) 754 PyMem_FREE(tok->encoding); 685 755 #ifndef PGEN 686 687 756 Py_XDECREF(tok->decoding_readline); 757 Py_XDECREF(tok->decoding_buffer); 688 758 #endif 689 if (tok->fp != NULL && tok->buf != NULL) 690 PyMem_FREE(tok->buf); 691 PyMem_FREE(tok); 759 if (tok->fp != NULL && tok->buf != NULL) 760 PyMem_FREE(tok->buf); 761 if (tok->input) 762 PyMem_FREE((char *)tok->input); 763 PyMem_FREE(tok); 692 764 } 693 765 … … 696 768 tok_stdin_decode(struct tok_state *tok, char **inp) 697 769 { 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 770 PyObject *enc, *sysstdin, *decoded, *utf8; 771 const char *encoding; 772 char *converted; 773 774 if (PySys_GetFile((char *)"stdin", NULL) != stdin) 775 return 0; 776 sysstdin = PySys_GetObject("stdin"); 777 if (sysstdin == NULL || !PyFile_Check(sysstdin)) 778 return 0; 779 780 enc = ((PyFileObject *)sysstdin)->f_encoding; 781 if (enc == NULL || !PyString_Check(enc)) 782 return 0; 783 Py_INCREF(enc); 784 785 encoding = PyString_AsString(enc); 786 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); 787 if (decoded == NULL) 788 goto error_clear; 789 790 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); 791 Py_DECREF(decoded); 792 if (utf8 == NULL) 793 goto error_clear; 794 795 assert(PyString_Check(utf8)); 796 converted = new_string(PyString_AS_STRING(utf8), 797 PyString_GET_SIZE(utf8)); 798 Py_DECREF(utf8); 799 if (converted == NULL) 800 goto error_nomem; 801 802 PyMem_FREE(*inp); 803 *inp = converted; 804 if (tok->encoding != NULL) 805 PyMem_FREE(tok->encoding); 806 tok->encoding = new_string(encoding, strlen(encoding)); 807 if (tok->encoding == NULL) 808 goto error_nomem; 809 810 Py_DECREF(enc); 811 return 0; 740 812 741 813 error_nomem: 742 743 744 814 Py_DECREF(enc); 815 tok->done = E_NOMEM; 816 return -1; 745 817 746 818 error_clear: 747 /* Fallback to iso-8859-1: for backward compatibility */ 748 Py_DECREF(enc); 749 PyErr_Clear(); 750 return 0; 819 Py_DECREF(enc); 820 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 821 tok->done = E_ERROR; 822 return -1; 823 } 824 /* Fallback to iso-8859-1: for backward compatibility */ 825 PyErr_Clear(); 826 return 0; 751 827 } 752 828 #endif … … 757 833 tok_nextc(register struct tok_state *tok) 758 834 { 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 835 for (;;) { 836 if (tok->cur != tok->inp) { 837 return Py_CHARMASK(*tok->cur++); /* Fast path */ 838 } 839 if (tok->done != E_OK) 840 return EOF; 841 if (tok->fp == NULL) { 842 char *end = strchr(tok->inp, '\n'); 843 if (end != NULL) 844 end++; 845 else { 846 end = strchr(tok->inp, '\0'); 847 if (end == tok->inp) { 848 tok->done = E_EOF; 849 return EOF; 850 } 851 } 852 if (tok->start == NULL) 853 tok->buf = tok->cur; 854 tok->line_start = tok->cur; 855 tok->lineno++; 856 tok->inp = end; 857 return Py_CHARMASK(*tok->cur++); 858 } 859 if (tok->prompt != NULL) { 860 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); 861 if (tok->nextprompt != NULL) 862 tok->prompt = tok->nextprompt; 863 if (newtok == NULL) 864 tok->done = E_INTR; 865 else if (*newtok == '\0') { 866 PyMem_FREE(newtok); 867 tok->done = E_EOF; 868 } 793 869 #if !defined(PGEN) && defined(Py_USING_UNICODE) 794 795 870 else if (tok_stdin_decode(tok, &newtok) != 0) 871 PyMem_FREE(newtok); 796 872 #endif 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 873 else if (tok->start != NULL) { 874 size_t start = tok->start - tok->buf; 875 size_t oldlen = tok->cur - tok->buf; 876 size_t newlen = oldlen + strlen(newtok); 877 char *buf = tok->buf; 878 buf = (char *)PyMem_REALLOC(buf, newlen+1); 879 tok->lineno++; 880 if (buf == NULL) { 881 PyMem_FREE(tok->buf); 882 tok->buf = NULL; 883 PyMem_FREE(newtok); 884 tok->done = E_NOMEM; 885 return EOF; 886 } 887 tok->buf = buf; 888 tok->cur = tok->buf + oldlen; 889 tok->line_start = tok->cur; 890 strcpy(tok->buf + oldlen, newtok); 891 PyMem_FREE(newtok); 892 tok->inp = tok->buf + newlen; 893 tok->end = tok->inp + 1; 894 tok->start = tok->buf + start; 895 } 896 else { 897 tok->lineno++; 898 if (tok->buf != NULL) 899 PyMem_FREE(tok->buf); 900 tok->buf = newtok; 901 tok->line_start = tok->buf; 902 tok->cur = tok->buf; 903 tok->line_start = tok->buf; 904 tok->inp = strchr(tok->buf, '\0'); 905 tok->end = tok->inp + 1; 906 } 907 } 908 else { 909 int done = 0; 910 Py_ssize_t cur = 0; 911 char *pt; 912 if (tok->start == NULL) { 913 if (tok->buf == NULL) { 914 tok->buf = (char *) 915 PyMem_MALLOC(BUFSIZ); 916 if (tok->buf == NULL) { 917 tok->done = E_NOMEM; 918 return EOF; 919 } 920 tok->end = tok->buf + BUFSIZ; 921 } 922 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), 923 tok) == NULL) { 924 tok->done = E_EOF; 925 done = 1; 926 } 927 else { 928 tok->done = E_OK; 929 tok->inp = strchr(tok->buf, '\0'); 930 done = tok->inp[-1] == '\n'; 931 } 932 } 933 else { 934 cur = tok->cur - tok->buf; 935 if (decoding_feof(tok)) { 936 tok->done = E_EOF; 937 done = 1; 938 } 939 else 940 tok->done = E_OK; 941 } 942 tok->lineno++; 943 /* Read until '\n' or EOF */ 944 while (!done) { 945 Py_ssize_t curstart = tok->start == NULL ? -1 : 946 tok->start - tok->buf; 947 Py_ssize_t curvalid = tok->inp - tok->buf; 948 Py_ssize_t newsize = curvalid + BUFSIZ; 949 char *newbuf = tok->buf; 950 newbuf = (char *)PyMem_REALLOC(newbuf, 951 newsize); 952 if (newbuf == NULL) { 953 tok->done = E_NOMEM; 954 tok->cur = tok->inp; 955 return EOF; 956 } 957 tok->buf = newbuf; 958 tok->inp = tok->buf + curvalid; 959 tok->end = tok->buf + newsize; 960 tok->start = curstart < 0 ? NULL : 961 tok->buf + curstart; 962 if (decoding_fgets(tok->inp, 963 (int)(tok->end - tok->inp), 964 tok) == NULL) { 965 /* Break out early on decoding 966 errors, as tok->buf will be NULL 967 */ 968 if (tok->decoding_erred) 969 return EOF; 970 /* Last line does not end in \n, 971 fake one */ 972 strcpy(tok->inp, "\n"); 973 } 974 tok->inp = strchr(tok->inp, '\0'); 975 done = tok->inp[-1] == '\n'; 976 } 977 if (tok->buf != NULL) { 978 tok->cur = tok->buf + cur; 979 tok->line_start = tok->cur; 980 /* replace "\r\n" with "\n" */ 981 /* For Mac leave the \r, giving a syntax error */ 982 pt = tok->inp - 2; 983 if (pt >= tok->buf && *pt == '\r') { 984 *pt++ = '\n'; 985 *pt = '\0'; 986 tok->inp = pt; 987 } 988 } 989 } 990 if (tok->done != E_OK) { 991 if (tok->prompt != NULL) 992 PySys_WriteStderr("\n"); 993 tok->cur = tok->inp; 994 return EOF; 995 } 996 } 997 /*NOTREACHED*/ 922 998 } 923 999 … … 928 1004 tok_backup(register struct tok_state *tok, register int c) 929 1005 { 930 931 932 Py_FatalError("tok_backup: beginof buffer");933 934 935 1006 if (c != EOF) { 1007 if (--tok->cur < tok->buf) 1008 Py_FatalError("tok_backup: beginning of buffer"); 1009 if (*tok->cur != c) 1010 *tok->cur = c; 1011 } 936 1012 } 937 1013 … … 942 1018 PyToken_OneChar(int c) 943 1019 { 944 945 case '(':return LPAR;946 case ')':return RPAR;947 case '[':return LSQB;948 case ']':return RSQB;949 case ':':return COLON;950 case ',':return COMMA;951 case ';':return SEMI;952 case '+':return PLUS;953 case '-':return MINUS;954 case '*':return STAR;955 case '/':return SLASH;956 case '|':return VBAR;957 case '&':return AMPER;958 case '<':return LESS;959 case '>':return GREATER;960 case '=':return EQUAL;961 case '.':return DOT;962 case '%':return PERCENT;963 case '`':return BACKQUOTE;964 case '{':return LBRACE;965 case '}':return RBRACE;966 case '^':return CIRCUMFLEX;967 case '~':return TILDE;968 969 default:return OP;970 1020 switch (c) { 1021 case '(': return LPAR; 1022 case ')': return RPAR; 1023 case '[': return LSQB; 1024 case ']': return RSQB; 1025 case ':': return COLON; 1026 case ',': return COMMA; 1027 case ';': return SEMI; 1028 case '+': return PLUS; 1029 case '-': return MINUS; 1030 case '*': return STAR; 1031 case '/': return SLASH; 1032 case '|': return VBAR; 1033 case '&': return AMPER; 1034 case '<': return LESS; 1035 case '>': return GREATER; 1036 case '=': return EQUAL; 1037 case '.': return DOT; 1038 case '%': return PERCENT; 1039 case '`': return BACKQUOTE; 1040 case '{': return LBRACE; 1041 case '}': return RBRACE; 1042 case '^': return CIRCUMFLEX; 1043 case '~': return TILDE; 1044 case '@': return AT; 1045 default: return OP; 1046 } 971 1047 } 972 1048 … … 975 1051 PyToken_TwoChars(int c1, int c2) 976 1052 { 977 978 979 980 case '=':return EQEQUAL;981 982 983 984 985 case '=':return NOTEQUAL;986 987 988 989 990 case '>':return NOTEQUAL;991 case '=':return LESSEQUAL;992 case '<':return LEFTSHIFT;993 994 995 996 997 case '=':return GREATEREQUAL;998 case '>':return RIGHTSHIFT;999 1000 1001 1002 1003 case '=':return PLUSEQUAL;1004 1005 1006 1007 1008 case '=':return MINEQUAL;1009 1010 1011 1012 1013 case '*':return DOUBLESTAR;1014 case '=':return STAREQUAL;1015 1016 1017 1018 1019 case '/':return DOUBLESLASH;1020 case '=':return SLASHEQUAL;1021 1022 1023 1024 1025 case '=':return VBAREQUAL;1026 1027 1028 1029 1030 case '=':return PERCENTEQUAL;1031 1032 1033 1034 1035 case '=':return AMPEREQUAL;1036 1037 1038 1039 1040 case '=':return CIRCUMFLEXEQUAL;1041 1042 1043 1044 1053 switch (c1) { 1054 case '=': 1055 switch (c2) { 1056 case '=': return EQEQUAL; 1057 } 1058 break; 1059 case '!': 1060 switch (c2) { 1061 case '=': return NOTEQUAL; 1062 } 1063 break; 1064 case '<': 1065 switch (c2) { 1066 case '>': return NOTEQUAL; 1067 case '=': return LESSEQUAL; 1068 case '<': return LEFTSHIFT; 1069 } 1070 break; 1071 case '>': 1072 switch (c2) { 1073 case '=': return GREATEREQUAL; 1074 case '>': return RIGHTSHIFT; 1075 } 1076 break; 1077 case '+': 1078 switch (c2) { 1079 case '=': return PLUSEQUAL; 1080 } 1081 break; 1082 case '-': 1083 switch (c2) { 1084 case '=': return MINEQUAL; 1085 } 1086 break; 1087 case '*': 1088 switch (c2) { 1089 case '*': return DOUBLESTAR; 1090 case '=': return STAREQUAL; 1091 } 1092 break; 1093 case '/': 1094 switch (c2) { 1095 case '/': return DOUBLESLASH; 1096 case '=': return SLASHEQUAL; 1097 } 1098 break; 1099 case '|': 1100 switch (c2) { 1101 case '=': return VBAREQUAL; 1102 } 1103 break; 1104 case '%': 1105 switch (c2) { 1106 case '=': return PERCENTEQUAL; 1107 } 1108 break; 1109 case '&': 1110 switch (c2) { 1111 case '=': return AMPEREQUAL; 1112 } 1113 break; 1114 case '^': 1115 switch (c2) { 1116 case '=': return CIRCUMFLEXEQUAL; 1117 } 1118 break; 1119 } 1120 return OP; 1045 1121 } 1046 1122 … … 1048 1124 PyToken_ThreeChars(int c1, int c2, int c3) 1049 1125 { 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1126 switch (c1) { 1127 case '<': 1128 switch (c2) { 1129 case '<': 1130 switch (c3) { 1131 case '=': 1132 return LEFTSHIFTEQUAL; 1133 } 1134 break; 1135 } 1136 break; 1137 case '>': 1138 switch (c2) { 1139 case '>': 1140 switch (c3) { 1141 case '=': 1142 return RIGHTSHIFTEQUAL; 1143 } 1144 break; 1145 } 1146 break; 1147 case '*': 1148 switch (c2) { 1149 case '*': 1150 switch (c3) { 1151 case '=': 1152 return DOUBLESTAREQUAL; 1153 } 1154 break; 1155 } 1156 break; 1157 case '/': 1158 switch (c2) { 1159 case '/': 1160 switch (c3) { 1161 case '=': 1162 return DOUBLESLASHEQUAL; 1163 } 1164 break; 1165 } 1166 break; 1167 } 1168 return OP; 1093 1169 } 1094 1170 … … 1096 1172 indenterror(struct tok_state *tok) 1097 1173 { 1098 if (tok->alterror) { 1099 tok->done = E_TABSPACE; 1100 tok->cur = tok->inp; 1101 return 1; 1102 } 1103 if (tok->altwarning) { 1104 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1105 "in indentation\n", tok->filename); 1106 tok->altwarning = 0; 1107 } 1108 return 0; 1109 } 1110 1174 if (tok->alterror) { 1175 tok->done = E_TABSPACE; 1176 tok->cur = tok->inp; 1177 return 1; 1178 } 1179 if (tok->altwarning) { 1180 PySys_WriteStderr("%s: inconsistent use of tabs and spaces " 1181 "in indentation\n", tok->filename); 1182 tok->altwarning = 0; 1183 } 1184 return 0; 1185 } 1111 1186 1112 1187 /* Get next token, after space stripping etc. */ … … 1115 1190 tok_get(register struct tok_state *tok, char **p_start, char **p_end) 1116 1191 { 1117 1118 1119 1120 1192 register int c; 1193 int blankline; 1194 1195 *p_start = *p_end = NULL; 1121 1196 nextline: 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1197 tok->start = NULL; 1198 blankline = 0; 1199 1200 /* Get indentation level */ 1201 if (tok->atbol) { 1202 register int col = 0; 1203 register int altcol = 0; 1204 tok->atbol = 0; 1205 for (;;) { 1206 c = tok_nextc(tok); 1207 if (c == ' ') 1208 col++, altcol++; 1209 else if (c == '\t') { 1210 col = (col/tok->tabsize + 1) * tok->tabsize; 1211 altcol = (altcol/tok->alttabsize + 1) 1212 * tok->alttabsize; 1213 } 1214 else if (c == '\014') /* Control-L (formfeed) */ 1215 col = altcol = 0; /* For Emacs users */ 1216 else 1217 break; 1218 } 1219 tok_backup(tok, c); 1220 if (c == '#' || c == '\n') { 1221 /* Lines with only whitespace and/or comments 1222 shouldn't affect the indentation and are 1223 not passed to the parser as NEWLINE tokens, 1224 except *totally* empty lines in interactive 1225 mode, which signal the end of a command group. */ 1226 if (col == 0 && c == '\n' && tok->prompt != NULL) 1227 blankline = 0; /* Let it through */ 1228 else 1229 blankline = 1; /* Ignore completely */ 1230 /* We can't jump back right here since we still 1231 may need to skip to the end of a comment */ 1232 } 1233 if (!blankline && tok->level == 0) { 1234 if (col == tok->indstack[tok->indent]) { 1235 /* No change */ 1236 if (altcol != tok->altindstack[tok->indent]) { 1237 if (indenterror(tok)) 1238 return ERRORTOKEN; 1239 } 1240 } 1241 else if (col > tok->indstack[tok->indent]) { 1242 /* Indent -- always one */ 1243 if (tok->indent+1 >= MAXINDENT) { 1244 tok->done = E_TOODEEP; 1245 tok->cur = tok->inp; 1246 return ERRORTOKEN; 1247 } 1248 if (altcol <= tok->altindstack[tok->indent]) { 1249 if (indenterror(tok)) 1250 return ERRORTOKEN; 1251 } 1252 tok->pendin++; 1253 tok->indstack[++tok->indent] = col; 1254 tok->altindstack[tok->indent] = altcol; 1255 } 1256 else /* col < tok->indstack[tok->indent] */ { 1257 /* Dedent -- any number, must be consistent */ 1258 while (tok->indent > 0 && 1259 col < tok->indstack[tok->indent]) { 1260 tok->pendin--; 1261 tok->indent--; 1262 } 1263 if (col != tok->indstack[tok->indent]) { 1264 tok->done = E_DEDENT; 1265 tok->cur = tok->inp; 1266 return ERRORTOKEN; 1267 } 1268 if (altcol != tok->altindstack[tok->indent]) { 1269 if (indenterror(tok)) 1270 return ERRORTOKEN; 1271 } 1272 } 1273 } 1274 } 1275 1276 tok->start = tok->cur; 1277 1278 /* Return pending indents/dedents */ 1279 if (tok->pendin != 0) { 1280 if (tok->pendin < 0) { 1281 tok->pendin++; 1282 return DEDENT; 1283 } 1284 else { 1285 tok->pendin--; 1286 return INDENT; 1287 } 1288 } 1214 1289 1215 1290 again: 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 "tab-width:",/* Emacs */1229 ":tabstop=",/* vim, full form */1230 ":ts=",/* vim, abbreviated form */1231 "set tabsize=",/* will vi never die? */1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 if (isalpha(c) || c == '_') {1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 while (isalnum(c) || c == '_') {1294 1291 tok->start = NULL; 1292 /* Skip spaces */ 1293 do { 1294 c = tok_nextc(tok); 1295 } while (c == ' ' || c == '\t' || c == '\014'); 1296 1297 /* Set start of current token */ 1298 tok->start = tok->cur - 1; 1299 1300 /* Skip comment, while looking for tab-setting magic */ 1301 if (c == '#') { 1302 static char *tabforms[] = { 1303 "tab-width:", /* Emacs */ 1304 ":tabstop=", /* vim, full form */ 1305 ":ts=", /* vim, abbreviated form */ 1306 "set tabsize=", /* will vi never die? */ 1307 /* more templates can be added here to support other editors */ 1308 }; 1309 char cbuf[80]; 1310 char *tp, **cp; 1311 tp = cbuf; 1312 do { 1313 *tp++ = c = tok_nextc(tok); 1314 } while (c != EOF && c != '\n' && 1315 (size_t)(tp - cbuf + 1) < sizeof(cbuf)); 1316 *tp = '\0'; 1317 for (cp = tabforms; 1318 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); 1319 cp++) { 1320 if ((tp = strstr(cbuf, *cp))) { 1321 int newsize = atoi(tp + strlen(*cp)); 1322 1323 if (newsize >= 1 && newsize <= 40) { 1324 tok->tabsize = newsize; 1325 if (Py_VerboseFlag) 1326 PySys_WriteStderr( 1327 "Tab size set to %d\n", 1328 newsize); 1329 } 1330 } 1331 } 1332 while (c != EOF && c != '\n') 1333 c = tok_nextc(tok); 1334 } 1335 1336 /* Check for EOF and errors now */ 1337 if (c == EOF) { 1338 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 1339 } 1340 1341 /* Identifier (most frequent token!) */ 1342 if (Py_ISALPHA(c) || c == '_') { 1343 /* Process r"", u"" and ur"" */ 1344 switch (c) { 1345 case 'b': 1346 case 'B': 1347 c = tok_nextc(tok); 1348 if (c == 'r' || c == 'R') 1349 c = tok_nextc(tok); 1350 if (c == '"' || c == '\'') 1351 goto letter_quote; 1352 break; 1353 case 'r': 1354 case 'R': 1355 c = tok_nextc(tok); 1356 if (c == '"' || c == '\'') 1357 goto letter_quote; 1358 break; 1359 case 'u': 1360 case 'U': 1361 c = tok_nextc(tok); 1362 if (c == 'r' || c == 'R') 1363 c = tok_nextc(tok); 1364 if (c == '"' || c == '\'') 1365 goto letter_quote; 1366 break; 1367 } 1368 while (c != EOF && (Py_ISALNUM(c) || c == '_')) { 1369 c = tok_nextc(tok); 1295 1370 #ifdef __KLIBC__ 1296 1297 1371 if (c == EOF) 1372 break; 1298 1373 #endif 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1374 } 1375 tok_backup(tok, c); 1376 *p_start = tok->start; 1377 *p_end = tok->cur; 1378 return NAME; 1379 } 1380 1381 /* Newline */ 1382 if (c == '\n') { 1383 tok->atbol = 1; 1384 if (blankline || tok->level > 0) 1385 goto nextline; 1386 *p_start = tok->start; 1387 *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 1388 tok->cont_line = 0; 1389 return NEWLINE; 1390 } 1391 1392 /* Period or number starting with period? */ 1393 if (c == '.') { 1394 c = tok_nextc(tok); 1395 if (isdigit(c)) { 1396 goto fraction; 1397 } 1398 else { 1399 tok_backup(tok, c); 1400 *p_start = tok->start; 1401 *p_end = tok->cur; 1402 return DOT; 1403 } 1404 } 1405 1406 /* Number */ 1407 if (isdigit(c)) { 1408 if (c == '0') { 1409 /* Hex, octal or binary -- maybe. */ 1410 c = tok_nextc(tok); 1411 if (c == '.') 1412 goto fraction; 1338 1413 #ifndef WITHOUT_COMPLEX 1339 1340 1414 if (c == 'j' || c == 'J') 1415 goto imaginary; 1341 1416 #endif 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1417 if (c == 'x' || c == 'X') { 1418 1419 /* Hex */ 1420 c = tok_nextc(tok); 1421 if (!isxdigit(c)) { 1422 tok->done = E_TOKEN; 1423 tok_backup(tok, c); 1424 return ERRORTOKEN; 1425 } 1426 do { 1427 c = tok_nextc(tok); 1428 } while (isxdigit(c)); 1429 } 1430 else if (c == 'o' || c == 'O') { 1431 /* Octal */ 1432 c = tok_nextc(tok); 1433 if (c < '0' || c >= '8') { 1434 tok->done = E_TOKEN; 1435 tok_backup(tok, c); 1436 return ERRORTOKEN; 1437 } 1438 do { 1439 c = tok_nextc(tok); 1440 } while ('0' <= c && c < '8'); 1441 } 1442 else if (c == 'b' || c == 'B') { 1443 /* Binary */ 1444 c = tok_nextc(tok); 1445 if (c != '0' && c != '1') { 1446 tok->done = E_TOKEN; 1447 tok_backup(tok, c); 1448 return ERRORTOKEN; 1449 } 1450 do { 1451 c = tok_nextc(tok); 1452 } while (c == '0' || c == '1'); 1453 } 1454 else { 1455 int found_decimal = 0; 1456 /* Octal; c is first char of it */ 1457 /* There's no 'isoctdigit' macro, sigh */ 1458 while ('0' <= c && c < '8') { 1459 c = tok_nextc(tok); 1460 } 1461 if (isdigit(c)) { 1462 found_decimal = 1; 1463 do { 1464 c = tok_nextc(tok); 1465 } while (isdigit(c)); 1466 } 1467 if (c == '.') 1468 goto fraction; 1469 else if (c == 'e' || c == 'E') 1470 goto exponent; 1396 1471 #ifndef WITHOUT_COMPLEX 1397 1398 1472 else if (c == 'j' || c == 'J') 1473 goto imaginary; 1399 1474 #endif 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1475 else if (found_decimal) { 1476 tok->done = E_TOKEN; 1477 tok_backup(tok, c); 1478 return ERRORTOKEN; 1479 } 1480 } 1481 if (c == 'l' || c == 'L') 1482 c = tok_nextc(tok); 1483 } 1484 else { 1485 /* Decimal */ 1486 do { 1487 c = tok_nextc(tok); 1488 } while (isdigit(c)); 1489 if (c == 'l' || c == 'L') 1490 c = tok_nextc(tok); 1491 else { 1492 /* Accept floating point numbers. */ 1493 if (c == '.') { 1494 fraction: 1495 /* Fraction */ 1496 do { 1497 c = tok_nextc(tok); 1498 } while (isdigit(c)); 1499 } 1500 if (c == 'e' || c == 'E') { 1501 exponent: 1502 /* Exponent part */ 1503 c = tok_nextc(tok); 1504 if (c == '+' || c == '-') 1505 c = tok_nextc(tok); 1506 if (!isdigit(c)) { 1507 tok->done = E_TOKEN; 1508 tok_backup(tok, c); 1509 return ERRORTOKEN; 1510 } 1511 do { 1512 c = tok_nextc(tok); 1513 } while (isdigit(c)); 1514 } 1440 1515 #ifndef WITHOUT_COMPLEX 1441 1442 1443 1444 1516 if (c == 'j' || c == 'J') 1517 /* Imaginary part */ 1518 imaginary: 1519 c = tok_nextc(tok); 1445 1520 #endif 1446 1447 1448 1449 1450 1451 1452 1521 } 1522 } 1523 tok_backup(tok, c); 1524 *p_start = tok->start; 1525 *p_end = tok->cur; 1526 return NUMBER; 1527 } 1453 1528 1454 1529 letter_quote: 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1530 /* String */ 1531 if (c == '\'' || c == '"') { 1532 Py_ssize_t quote2 = tok->cur - tok->start + 1; 1533 int quote = c; 1534 int triple = 0; 1535 int tripcount = 0; 1536 for (;;) { 1537 c = tok_nextc(tok); 1538 if (c == '\n') { 1539 if (!triple) { 1540 tok->done = E_EOLS; 1541 tok_backup(tok, c); 1542 return ERRORTOKEN; 1543 } 1544 tripcount = 0; 1545 tok->cont_line = 1; /* multiline string. */ 1546 } 1547 else if (c == EOF) { 1548 if (triple) 1549 tok->done = E_EOFS; 1550 else 1551 tok->done = E_EOLS; 1552 tok->cur = tok->inp; 1553 return ERRORTOKEN; 1554 } 1555 else if (c == quote) { 1556 tripcount++; 1557 if (tok->cur - tok->start == quote2) { 1558 c = tok_nextc(tok); 1559 if (c == quote) { 1560 triple = 1; 1561 tripcount = 0; 1562 continue; 1563 } 1564 tok_backup(tok, c); 1565 } 1566 if (!triple || tripcount == 3) 1567 break; 1568 } 1569 else if (c == '\\') { 1570 tripcount = 0; 1571 c = tok_nextc(tok); 1572 if (c == EOF) { 1573 tok->done = E_EOLS; 1574 tok->cur = tok->inp; 1575 return ERRORTOKEN; 1576 } 1577 } 1578 else 1579 tripcount = 0; 1580 } 1581 *p_start = tok->start; 1582 *p_end = tok->cur; 1583 return STRING; 1584 } 1585 1586 /* Line continuation */ 1587 if (c == '\\') { 1588 c = tok_nextc(tok); 1589 if (c != '\n') { 1590 tok->done = E_LINECONT; 1591 tok->cur = tok->inp; 1592 return ERRORTOKEN; 1593 } 1594 tok->cont_line = 1; 1595 goto again; /* Read next line */ 1596 } 1597 1598 /* Check for two-character token */ 1599 { 1600 int c2 = tok_nextc(tok); 1601 int token = PyToken_TwoChars(c, c2); 1527 1602 #ifndef PGEN 1528 1529 1530 1531 1532 1533 1534 1535 1603 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { 1604 if (PyErr_WarnExplicit(PyExc_DeprecationWarning, 1605 "<> not supported in 3.x; use !=", 1606 tok->filename, tok->lineno, 1607 NULL, NULL)) { 1608 return ERRORTOKEN; 1609 } 1610 } 1536 1611 #endif 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1612 if (token != OP) { 1613 int c3 = tok_nextc(tok); 1614 int token3 = PyToken_ThreeChars(c, c2, c3); 1615 if (token3 != OP) { 1616 token = token3; 1617 } else { 1618 tok_backup(tok, c3); 1619 } 1620 *p_start = tok->start; 1621 *p_end = tok->cur; 1622 return token; 1623 } 1624 tok_backup(tok, c2); 1625 } 1626 1627 /* Keep track of parentheses nesting level */ 1628 switch (c) { 1629 case '(': 1630 case '[': 1631 case '{': 1632 tok->level++; 1633 break; 1634 case ')': 1635 case ']': 1636 case '}': 1637 tok->level--; 1638 break; 1639 } 1640 1641 /* Punctuation character */ 1642 *p_start = tok->start; 1643 *p_end = tok->cur; 1644 return PyToken_OneChar(c); 1570 1645 } 1571 1646 … … 1573 1648 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) 1574 1649 { 1575 1576 1577 1578 1579 1580 1650 int result = tok_get(tok, p_start, p_end); 1651 if (tok->decoding_erred) { 1652 result = ERRORTOKEN; 1653 tok->done = E_DECODE; 1654 } 1655 return result; 1581 1656 } 1582 1657 … … 1589 1664 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) 1590 1665 { 1591 1666 return NULL; 1592 1667 } 1593 1668 #else … … 1595 1670 static PyObject * 1596 1671 dec_utf8(const char *enc, const char *text, size_t len) { 1597 PyObject *ret = NULL; 1598 1599 1600 1601 1602 1603 1604 1605 1606 1672 PyObject *ret = NULL; 1673 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); 1674 if (unicode_text) { 1675 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); 1676 Py_DECREF(unicode_text); 1677 } 1678 if (!ret) { 1679 PyErr_Clear(); 1680 } 1681 return ret; 1607 1682 } 1608 1683 char * 1609 1684 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) 1610 1685 { 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 PyObject *offsetobj = dec_utf8(tok->encoding, 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1686 char *text = NULL; 1687 if (tok->encoding) { 1688 /* convert source to original encondig */ 1689 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); 1690 if (lineobj != NULL) { 1691 int linelen = PyString_Size(lineobj); 1692 const char *line = PyString_AsString(lineobj); 1693 text = PyObject_MALLOC(linelen + 1); 1694 if (text != NULL && line != NULL) { 1695 if (linelen) 1696 strncpy(text, line, linelen); 1697 text[linelen] = '\0'; 1698 } 1699 Py_DECREF(lineobj); 1700 1701 /* adjust error offset */ 1702 if (*offset > 1) { 1703 PyObject *offsetobj = dec_utf8(tok->encoding, 1704 tok->buf, *offset-1); 1705 if (offsetobj) { 1706 *offset = PyString_Size(offsetobj) + 1; 1707 Py_DECREF(offsetobj); 1708 } 1709 } 1710 1711 } 1712 } 1713 return text; 1639 1714 1640 1715 } … … 1648 1723 tok_dump(int type, char *start, char *end) 1649 1724 { 1650 1651 1652 1725 printf("%s", _PyParser_TokenNames[type]); 1726 if (type == NAME || type == NUMBER || type == STRING || type == OP) 1727 printf("(%.*s)", (int)(end - start), start); 1653 1728 } 1654 1729
Note:
See TracChangeset
for help on using the changeset viewer.