Notifications: You must be signed in to change notification settings
Fork 32k
Star 67.3k

Commit d4aa857

authored

gh-102856: Clean some of the PEP 701 tokenizer implementation (#103634)

1 parent 5f7d68e, commit d4aa857 (Copy full SHA for d4aa857)

File tree

2 files changed

+67

-74

lines changed

Parser
- tokenizer.c
- tokenizer.h

2 files changed

+67

-74

lines changed

`‎Parser/tokenizer.c`

Lines changed: 65 additions & 71 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,11 +11,6 @@`
`11`	`11`	`#include"tokenizer.h"`
`12`	`12`	`#include"errcode.h"`
`13`	`13`
`14`		`-#include"unicodeobject.h"`
`15`		`-#include"bytesobject.h"`
`16`		`-#include"fileobject.h"`
`17`		`-#include"abstract.h"`
`18`		`-`
`19`	`14`	`/* Alternate tab spacing */`
`20`	`15`	`#defineALTTABSIZE 1`
`21`	`16`
`@@ -43,6 +38,8 @@`
`43`	`38`	`tok->lineno++; \`
`44`	`39`	`tok->col_offset = 0;`
`45`	`40`
	`41`	`+#defineINSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0)`
	`42`	`+#defineINSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0)`
`46`	`43`	`#ifdefPy_DEBUG`
`47`	`44`	`staticinlinetokenizer_modeTOK_GET_MODE(structtok_statetok) {`
`48`	`45`	`assert(tok->tok_mode_stack_index >=0);`
`@@ -54,15 +51,9 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {`
`54`	`51`	`assert(tok->tok_mode_stack_index<MAXLEVEL);`
`55`	`52`	`return&(tok->tok_mode_stack[++tok->tok_mode_stack_index]);`
`56`	`53`	`}`
`57`		`-staticinlineintTOK_GET_BRACKET_MARK(tokenizer_modemode) {`
`58`		`-assert(mode->bracket_mark_index >=0);`
`59`		`-assert(mode->bracket_mark_index<MAX_EXPR_NESTING);`
`60`		`-return&(mode->bracket_mark[mode->bracket_mark_index]);`
`61`		`-}`
`62`	`54`	`#else`
`63`	`55`	`#defineTOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))`
`64`	`56`	`#defineTOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))`
`65`		`-#defineTOK_GET_BRACKET_MARK(mode) (&(mode->bracket_mark[mode->bracket_mark_index]))`
`66`	`57`	`#endif`
`67`	`58`
`68`	`59`	`/* Forward */`
`@@ -398,20 +389,7 @@ update_fstring_expr(struct tok_state *tok, char cur)`
`398`	`389`	`tokenizer_mode*tok_mode=TOK_GET_MODE(tok);`
`399`	`390`
`400`	`391`	`switch (cur) {`
`401`		`-case'{':`
`402`		`-if (tok_mode->last_expr_buffer!=NULL) {`
`403`		`-PyMem_Free(tok_mode->last_expr_buffer);`
`404`		`- }`
`405`		`-tok_mode->last_expr_buffer=PyMem_Malloc(size);`
`406`		`-if (tok_mode->last_expr_buffer==NULL) {`
`407`		`-tok->done=E_NOMEM;`
`408`		`-return0;`
`409`		`- }`
`410`		`-tok_mode->last_expr_size=size;`
`411`		`-tok_mode->last_expr_end=-1;`
`412`		`-strncpy(tok_mode->last_expr_buffer,tok->cur,size);`
`413`		`-break;`
`414`		`-case0:`
	`392`	`+case0:`
`415`	`393`	`if (!tok_mode->last_expr_buffer\|\|tok_mode->last_expr_end >=0) {`
`416`	`394`	`return1;`
`417`	`395`	`}`
`@@ -421,23 +399,38 @@ update_fstring_expr(struct tok_state *tok, char cur)`
`421`	`399`	`);`
`422`	`400`	`if (new_buffer==NULL) {`
`423`	`401`	`PyMem_Free(tok_mode->last_expr_buffer);`
`424`		`-tok->done=E_NOMEM;`
`425`		`-return0;`
	`402`	`+ gotoerror;`
`426`	`403`	`}`
`427`	`404`	`tok_mode->last_expr_buffer=new_buffer;`
`428`	`405`	`strncpy(tok_mode->last_expr_buffer+tok_mode->last_expr_size,tok->cur,size);`
`429`	`406`	`tok_mode->last_expr_size+=size;`
`430`	`407`	`break;`
	`408`	`+case'{':`
	`409`	`+if (tok_mode->last_expr_buffer!=NULL) {`
	`410`	`+PyMem_Free(tok_mode->last_expr_buffer);`
	`411`	`+ }`
	`412`	`+tok_mode->last_expr_buffer=PyMem_Malloc(size);`
	`413`	`+if (tok_mode->last_expr_buffer==NULL) {`
	`414`	`+ gotoerror;`
	`415`	`+ }`
	`416`	`+tok_mode->last_expr_size=size;`
	`417`	`+tok_mode->last_expr_end=-1;`
	`418`	`+strncpy(tok_mode->last_expr_buffer,tok->cur,size);`
	`419`	`+break;`
`431`	`420`	`case'}':`
`432`	`421`	`case'!':`
`433`	`422`	`case':':`
`434`	`423`	`if (tok_mode->last_expr_end==-1) {`
`435`	`424`	`tok_mode->last_expr_end=strlen(tok->start);`
`436`	`425`	`}`
`437`	`426`	`break;`
	`427`	`+default:`
	`428`	`+Py_UNREACHABLE();`
`438`	`429`	`}`
`439`		`-`
`440`	`430`	`return1;`
	`431`	`+error:`
	`432`	`+tok->done=E_NOMEM;`
	`433`	`+return0;`
`441`	`434`	`}`
`442`	`435`
`443`	`436`	`staticvoid`
`@@ -1766,7 +1759,7 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`1766`	`1759`	`/* Skip comment, unless it's a type comment */`
`1767`	`1760`	`if (c=='#') {`
`1768`	`1761`
`1769`		`-if (tok->tok_mode_stack_index>0) {`
	`1762`	`+if (INSIDE_FSTRING(tok)) {`
`1770`	`1763`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string expression part cannot include '#'"));`
`1771`	`1764`	`}`
`1772`	`1765`
`@@ -2208,32 +2201,31 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2208`	`2201`
`2209`	`2202`	`p_start=tok->start;`
`2210`	`2203`	`p_end=tok->cur;`
`2211`		`-tokenizer_mode*current_tok=TOK_NEXT_MODE(tok);`
`2212`		`-current_tok->kind=TOK_FSTRING_MODE;`
`2213`		`-current_tok->f_string_quote=quote;`
`2214`		`-current_tok->f_string_quote_size=quote_size;`
`2215`		`-current_tok->f_string_start=tok->start;`
`2216`		`-current_tok->f_string_multi_line_start=tok->line_start;`
`2217`		`-current_tok->last_expr_buffer=NULL;`
`2218`		`-current_tok->last_expr_size=0;`
`2219`		`-current_tok->last_expr_end=-1;`
	`2204`	`+tokenizer_mode*the_current_tok=TOK_NEXT_MODE(tok);`
	`2205`	`+the_current_tok->kind=TOK_FSTRING_MODE;`
	`2206`	`+the_current_tok->f_string_quote=quote;`
	`2207`	`+the_current_tok->f_string_quote_size=quote_size;`
	`2208`	`+the_current_tok->f_string_start=tok->start;`
	`2209`	`+the_current_tok->f_string_multi_line_start=tok->line_start;`
	`2210`	`+the_current_tok->last_expr_buffer=NULL;`
	`2211`	`+the_current_tok->last_expr_size=0;`
	`2212`	`+the_current_tok->last_expr_end=-1;`
`2220`	`2213`
`2221`	`2214`	`switch (*tok->start) {`
`2222`	`2215`	`case'F':`
`2223`	`2216`	`case'f':`
`2224`		`-current_tok->f_string_raw=tolower(*(tok->start+1))=='r';`
	`2217`	`+the_current_tok->f_string_raw=tolower(*(tok->start+1))=='r';`
`2225`	`2218`	`break;`
`2226`	`2219`	`case'R':`
`2227`	`2220`	`case'r':`
`2228`		`-current_tok->f_string_raw=1;`
	`2221`	`+the_current_tok->f_string_raw=1;`
`2229`	`2222`	`break;`
`2230`	`2223`	`default:`
`2231`	`2224`	`Py_UNREACHABLE();`
`2232`	`2225`	`}`
`2233`	`2226`
`2234`		`-current_tok->bracket_stack=0;`
`2235`		`-current_tok->bracket_mark[0]=0;`
`2236`		`-current_tok->bracket_mark_index=-1;`
	`2227`	`+the_current_tok->curly_bracket_depth=0;`
	`2228`	`+the_current_tok->curly_bracket_expr_start_depth=-1;`
`2237`	`2229`	`returnMAKE_TOKEN(FSTRING_START);`
`2238`	`2230`	`}`
`2239`	`2231`
`@@ -2282,15 +2274,15 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2282`	`2274`	`intstart=tok->lineno;`
`2283`	`2275`	`tok->lineno=tok->first_lineno;`
`2284`	`2276`
`2285`		`-if (tok->tok_mode_stack_index>0) {`
	`2277`	`+if (INSIDE_FSTRING(tok)) {`
`2286`	`2278`	`/* When we are in an f-string, before raising the`
`2287`	`2279`	`* unterminated string literal error, check whether`
`2288`	`2280`	`* does the initial quote matches with f-strings quotes`
`2289`	`2281`	`* and if it is, then this must be a missing '}' token`
`2290`	`2282`	`* so raise the proper error */`
`2291`		`-tokenizer_mode*current_tok=TOK_GET_MODE(tok);`
`2292`		`-if (current_tok->f_string_quote==quote&&`
`2293`		`-current_tok->f_string_quote_size==quote_size) {`
	`2283`	`+tokenizer_mode*the_current_tok=TOK_GET_MODE(tok);`
	`2284`	`+if (the_current_tok->f_string_quote==quote&&`
	`2285`	`+the_current_tok->f_string_quote_size==quote_size) {`
`2294`	`2286`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string: expecting '}'",start));`
`2295`	`2287`	`}`
`2296`	`2288`	`}`
`@@ -2339,18 +2331,17 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2339`	`2331`
`2340`	`2332`	`/* Punctuation character */`
`2341`	`2333`	`intis_punctuation= (c==':'\|\|c=='}'\|\|c=='!'\|\|c=='{');`
`2342`		`-if (is_punctuation&&tok->tok_mode_stack_index>0&&current_tok->bracket_mark_index >=0) {`
`2343`		`-intmark=*TOK_GET_BRACKET_MARK(current_tok);`
`2344`		`-/* This code block gets executed before the bracket_stack is incremented`
	`2334`	`+if (is_punctuation&&INSIDE_FSTRING(tok)&&INSIDE_FSTRING_EXPR(current_tok)) {`
	`2335`	`+/* This code block gets executed before the curly_bracket_depth is incremented`
`2345`	`2336`	``* by the `{` case, so for ensuring that we are on the 0th level, we need``
`2346`	`2337`	`* to adjust it manually */`
`2347`		`-intcursor=current_tok->bracket_stack- (c!='{');`
	`2338`	`+intcursor=current_tok->curly_bracket_depth- (c!='{');`
`2348`	`2339`
`2349`	`2340`	`if (cursor==0&& !update_fstring_expr(tok,c)) {`
`2350`	`2341`	`returnMAKE_TOKEN(ENDMARKER);`
`2351`	`2342`	`}`
`2352`	`2343`
`2353`		`-if (c==':'&&cursor==mark) {`
	`2344`	`+if (c==':'&&cursor==current_tok->curly_bracket_expr_start_depth) {`
`2354`	`2345`	`current_tok->kind=TOK_FSTRING_MODE;`
`2355`	`2346`	`p_start=tok->start;`
`2356`	`2347`	`p_end=tok->cur;`
`@@ -2390,16 +2381,15 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2390`	`2381`	`tok->parenlinenostack[tok->level]=tok->lineno;`
`2391`	`2382`	`tok->parencolstack[tok->level]= (int)(tok->start-tok->line_start);`
`2392`	`2383`	`tok->level++;`
`2393`		`-`
`2394`		`-if (tok->tok_mode_stack_index>0) {`
`2395`		`-current_tok->bracket_stack++;`
	`2384`	`+if (INSIDE_FSTRING(tok)) {`
	`2385`	`+current_tok->curly_bracket_depth++;`
`2396`	`2386`	`}`
`2397`	`2387`	`break;`
`2398`	`2388`	`case')':`
`2399`	`2389`	`case']':`
`2400`	`2390`	`case'}':`
`2401`	`2391`	`if (!tok->level) {`
`2402`		`-if (tok->tok_mode_stack_index>0&& !current_tok->bracket_stack&&c=='}') {`
	`2392`	`+if (INSIDE_FSTRING(tok)&& !current_tok->curly_bracket_depth&&c=='}') {`
`2403`	`2393`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string: single '}' is not allowed"));`
`2404`	`2394`	`}`
`2405`	`2395`	`returnMAKE_TOKEN(syntaxerror(tok,"unmatched '%c'",c));`
`@@ -2415,10 +2405,10 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2415`	`2405`	`nested expression, then instead of matching a different`
`2416`	`2406`	`syntactical construct with it; we'll throw an unmatched`
`2417`	`2407`	`parentheses error. */`
`2418`		`-if (tok->tok_mode_stack_index>0&&opening=='{') {`
`2419`		`-assert(current_tok->bracket_stack >=0);`
`2420`		`-intprevious_bracket=current_tok->bracket_stack-1;`
`2421`		`-if (previous_bracket==*TOK_GET_BRACKET_MARK(current_tok)) {`
	`2408`	`+if (INSIDE_FSTRING(tok)&&opening=='{') {`
	`2409`	`+assert(current_tok->curly_bracket_depth >=0);`
	`2410`	`+intprevious_bracket=current_tok->curly_bracket_depth-1;`
	`2411`	`+if (previous_bracket==current_tok->curly_bracket_expr_start_depth) {`
`2422`	`2412`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string: unmatched '%c'",c));`
`2423`	`2413`	`}`
`2424`	`2414`	`}`
`@@ -2436,14 +2426,16 @@ tok_get_normal_mode(struct tok_state tok, tokenizer_mode current_tok, struct t`
`2436`	`2426`	`}`
`2437`	`2427`	`}`
`2438`	`2428`
`2439`		`-if (tok->tok_mode_stack_index>0) {`
`2440`		`-current_tok->bracket_stack--;`
`2441`		`-if (c=='}'&&current_tok->bracket_stack==*TOK_GET_BRACKET_MARK(current_tok)) {`
`2442`		`-current_tok->bracket_mark_index--;`
	`2429`	`+if (INSIDE_FSTRING(tok)) {`
	`2430`	`+current_tok->curly_bracket_depth--;`
	`2431`	`+if (c=='}'&&current_tok->curly_bracket_depth==current_tok->curly_bracket_expr_start_depth) {`
	`2432`	`+current_tok->curly_bracket_expr_start_depth--;`
`2443`	`2433`	`current_tok->kind=TOK_FSTRING_MODE;`
`2444`	`2434`	`}`
`2445`	`2435`	`}`
`2446`	`2436`	`break;`
	`2437`	`+default:`
	`2438`	`+break;`
`2447`	`2439`	`}`
`2448`	`2440`
`2449`	`2441`	`if (!Py_UNICODE_ISPRINTABLE(c)) {`
`@@ -2479,11 +2471,10 @@ tok_get_fstring_mode(struct tok_state tok, tokenizer_mode current_tok, struct`
`2479`	`2471`
`2480`	`2472`	`if ((start_char=='{'&&peek1!='{')\|\| (start_char=='}'&&peek1!='}')) {`
`2481`	`2473`	`if (start_char=='{') {`
`2482`		`-current_tok->bracket_mark_index++;`
`2483`		`-if (current_tok->bracket_mark_index >=MAX_EXPR_NESTING) {`
	`2474`	`+current_tok->curly_bracket_expr_start_depth++;`
	`2475`	`+if (current_tok->curly_bracket_expr_start_depth >=MAX_EXPR_NESTING) {`
`2484`	`2476`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string: expressions nested too deeply"));`
`2485`	`2477`	`}`
`2486`		`-*TOK_GET_BRACKET_MARK(current_tok)=current_tok->bracket_stack;`
`2487`	`2478`	`}`
`2488`	`2479`	`TOK_GET_MODE(tok)->kind=TOK_REGULAR_MODE;`
`2489`	`2480`	`returntok_get_normal_mode(tok,current_tok,token);`
`@@ -2544,17 +2535,20 @@ tok_get_fstring_mode(struct tok_state tok, tokenizer_mode current_tok, struct`
`2544`	`2535`	`end_quote_size=0;`
`2545`	`2536`	`}`
`2546`	`2537`
`2547`		`-intin_format_spec=current_tok->last_expr_end!=-1&&current_tok->bracket_mark_index >=0;`
	`2538`	`+intin_format_spec= (`
	`2539`	`+current_tok->last_expr_end!=-1`
	`2540`	`+&&`
	`2541`	`+INSIDE_FSTRING_EXPR(current_tok)`
	`2542`	`+ );`
`2548`	`2543`	`if (c=='{') {`
`2549`	`2544`	`intpeek=tok_nextc(tok);`
`2550`	`2545`	`if (peek!='{'\|\|in_format_spec) {`
`2551`	`2546`	`tok_backup(tok,peek);`
`2552`	`2547`	`tok_backup(tok,c);`
`2553`		`-current_tok->bracket_mark_index++;`
`2554`		`-if (current_tok->bracket_mark_index >=MAX_EXPR_NESTING) {`
	`2548`	`+current_tok->curly_bracket_expr_start_depth++;`
	`2549`	`+if (current_tok->curly_bracket_expr_start_depth >=MAX_EXPR_NESTING) {`
`2555`	`2550`	`returnMAKE_TOKEN(syntaxerror(tok,"f-string: expressions nested too deeply"));`
`2556`	`2551`	`}`
`2557`		`-*TOK_GET_BRACKET_MARK(current_tok)=current_tok->bracket_stack;`
`2558`	`2552`	`TOK_GET_MODE(tok)->kind=TOK_REGULAR_MODE;`
`2559`	`2553`	`p_start=tok->start;`
`2560`	`2554`	`p_end=tok->cur;`

`‎Parser/tokenizer.h`

Lines changed: 2 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -43,9 +43,8 @@ enum tokenizer_mode_kind_t {`
`43`	`43`	`typedefstruct_tokenizer_mode {`
`44`	`44`	`enumtokenizer_mode_kind_tkind;`
`45`	`45`
`46`		`-intbracket_stack;`
`47`		`-intbracket_mark[MAX_EXPR_NESTING];`
`48`		`-intbracket_mark_index;`
	`46`	`+intcurly_bracket_depth;`
	`47`	`+intcurly_bracket_expr_start_depth;`
`49`	`48`
`50`	`49`	`charf_string_quote;`
`51`	`50`	`intf_string_quote_size;`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit d4aa857

File tree

2 files changed

2 files changed

`‎Parser/tokenizer.c`

`‎Parser/tokenizer.h`

0 commit comments