Commit b1fae75

Use tokenize from stdlib, detach completely from lib2to3 and fix some typos

1 parent: 3d593ef
5 files changed: 140 additions & 48 deletions

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
@@ -788,6 +788,7 @@ regen-grammar: regen-token
 	# from Grammar/Grammar using pgen
 	@$(MKDIR_P) Include
 	$(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
+		$(srcdir)/Grammar/Tokens \
 		$(srcdir)/Include/graminit.h.new \
 		$(srcdir)/Python/graminit.c.new
 	$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new
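
For orientation, this is roughly the command the updated regen-grammar recipe now runs. A minimal sketch in Python rather than make; the interpreter name and paths stand in for $(PYTHON_FOR_REGEN) and $(srcdir), so they are illustrative:

import subprocess

# Regenerate the parser tables the way the Makefile target does, assuming
# this runs from the root of a CPython source checkout.
subprocess.run([
    "python", "-m", "Parser.pgen",
    "Grammar/Grammar",          # grammar definition in EBNF format
    "Grammar/Tokens",           # token definitions (the newly added argument)
    "Include/graminit.h.new",   # non-terminals written as #defines
    "Python/graminit.c.new",    # grammar tables as initialized data
], check=True)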

Parser/pgen/__main__.py

Lines changed: 9 additions & 5 deletions
@@ -8,22 +8,26 @@ def main():
         "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
     parser.add_argument(
-        "gramminit_h",
+        "tokens", type=str, help="The file with the token definitions"
+    )
+    parser.add_argument(
+        "graminit_h",
         type=argparse.FileType('w'),
         help="The path to write the grammar's non-terminals as #defines",
     )
     parser.add_argument(
-        "gramminit_c",
+        "graminit_c",
         type=argparse.FileType('w'),
         help="The path to write the grammar as initialized data",
     )
+
     parser.add_argument("--verbose", "-v", action="count")
     args = parser.parse_args()

-    p = ParserGenerator(args.grammar, verbose=args.verbose)
+    p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
     grammar = p.make_grammar()
-    grammar.produce_graminit_h(args.gramminit_h.write)
-    grammar.produce_graminit_c(args.gramminit_c.write)
+    grammar.produce_graminit_h(args.graminit_h.write)
+    grammar.produce_graminit_c(args.graminit_c.write)


 if __name__ == "__main__":
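
The same pipeline can also be driven programmatically. A minimal sketch, assuming it runs from the root of a CPython checkout so the Parser.pgen package and both input files are available (the output paths are illustrative):

from Parser.pgen.pgen import ParserGenerator

p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens")
grammar = p.make_grammar()
with open("graminit.h", "w") as h, open("graminit.c", "w") as c:
    grammar.produce_graminit_h(h.write)   # non-terminal #defines
    grammar.produce_graminit_c(c.write)   # initialized grammar tables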

Parser/pgen/grammar.py

Lines changed: 65 additions & 2 deletions
@@ -1,6 +1,69 @@
-from lib2to3.pgen2 import grammar
+import collections

-class Grammar(grammar.Grammar):
+class Grammar:
+    """Pgen parsing tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py. The parsing engine
+    accesses the instance variables directly. The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses. The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass). The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers. Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states -- a list of DFAs, where each DFA is a list of
+              states, each state is a list of arcs, and each
+              arc is a (i, j) pair where i is a label and j is
+              a state number. The DFA number is the index into
+              this list. (This name is slightly confusing.)
+              Final states are represented by a special arc of
+              the form (0, j) where j is its own state number.
+
+    dfas -- a dict mapping symbol numbers to (DFA, first)
+            pairs, where DFA is an item from the states list
+            above, and first is a set of tokens that can
+            begin this grammar rule (represented by a dict
+            whose values are always 1).
+
+    labels -- a list of (x, y) pairs where x is either a token
+              number or a symbol number, and y is either None
+              or a string; the strings are keywords. The label
+              number is the index in this list; label numbers
+              are used to mark state transitions (arcs) in the
+              DFAs.
+
+    start -- the number of the grammar's start symbol.
+
+    keywords -- a dict mapping keyword strings to arc labels.
+
+    tokens -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = collections.OrderedDict()
+        self.number2symbol = collections.OrderedDict()
+        self.states = []
+        self.dfas = collections.OrderedDict()
+        self.labels = [(0, "EMPTY")]
+        self.keywords = collections.OrderedDict()
+        self.tokens = collections.OrderedDict()
+        self.symbol2label = collections.OrderedDict()
+        self.start = 256

     def produce_graminit_h(self, writer):
         writer("/* Generated by Parser/pgen */\n\n")

Parser/pgen/pgen.py

Lines changed: 25 additions & 41 deletions
@@ -1,40 +1,23 @@
-import os
-import sys
 import collections
-import importlib.machinery
+import tokenize  # from stdlib

-# Use Lib/token.py and Lib/tokenize.py to obtain the tokens. To maintain this
-# compatible with older versions of Python, we need to make sure that we only
-# import these two files (and not any of the dependencies of these files).
-
-CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
-LIB_LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION, '..', '..', 'Lib'))
-TOKEN_LOCATION = os.path.join(LIB_LOCATION, 'token.py')
-TOKENIZE_LOCATION = os.path.join(LIB_LOCATION, 'tokenize.py')
-
-token = importlib.machinery.SourceFileLoader('token',
-                                             TOKEN_LOCATION).load_module()
-# Add token to the module cache so tokenize.py uses that excact one instead of
-# the one in the stdlib of the interpreter executing this file.
-sys.modules['token'] = token
-tokenize = importlib.machinery.SourceFileLoader('tokenize',
-                                                TOKENIZE_LOCATION).load_module()
-
-from . import grammar
+from . import grammar, token

 class ParserGenerator(object):

-    def __init__(self, filename, stream=None, verbose=False):
+    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
         close_stream = None
         if stream is None:
-            stream = open(filename)
+            stream = open(grammar_file)
             close_stream = stream.close
-        self.tokens = token
-        self.opmap = token.EXACT_TOKEN_TYPES
+        with open(token_file) as tok_file:
+            token_lines = tok_file.readlines()
+        self.tokens = dict(token.generate_tokens(token_lines))
+        self.opmap = dict(token.generate_opmap(token_lines))
         # Manually add <> so it does not collide with !=
-        self.opmap['<>'] = self.tokens.NOTEQUAL
+        self.opmap['<>'] = "NOTEQUAL"
         self.verbose = verbose
-        self.filename = filename
+        self.filename = grammar_file
         self.stream = stream
         self.generator = tokenize.generate_tokens(stream.readline)
         self.gettoken()  # Initialize lookahead

@@ -108,9 +91,9 @@ def make_label(self, c, label):
                 return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
-                itoken = getattr(self.tokens, label, None)
+                itoken = self.tokens.get(label, None)
                 assert isinstance(itoken, int), label
-                assert itoken in self.tokens.tok_name, label
+                assert itoken in self.tokens.values(), label
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:

@@ -126,12 +109,13 @@ def make_label(self, c, label):
                 if value in c.keywords:
                     return c.keywords[value]
                 else:
-                    c.labels.append((self.tokens.NAME, value))
+                    c.labels.append((self.tokens["NAME"], value))
                     c.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
-                itoken = self.opmap[value]  # Fails if unknown token
+                tok_name = self.opmap[value]  # Fails if unknown token
+                itoken = self.tokens[tok_name]
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:

@@ -184,16 +168,16 @@ def parse(self):
         dfas = collections.OrderedDict()
         startsymbol = None
         # MSTART: (NEWLINE | RULE)* ENDMARKER
-        while self.type != self.tokens.ENDMARKER:
-            while self.type == self.tokens.NEWLINE:
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
                 self.gettoken()
             # RULE: NAME ':' RHS NEWLINE
-            name = self.expect(self.tokens.NAME)
+            name = self.expect(tokenize.NAME)
             if self.verbose:
                 print("Processing rule {dfa_name}".format(dfa_name=name))
-            self.expect(self.tokens.OP, ":")
+            self.expect(tokenize.OP, ":")
             a, z = self.parse_rhs()
-            self.expect(self.tokens.NEWLINE)
+            self.expect(tokenize.NEWLINE)
             if self.verbose:
                 self.dump_nfa(name, a, z)
             dfa = self.make_dfa(a, z)

@@ -309,7 +293,7 @@ def parse_alt(self):
         # ALT: ITEM+
         a, b = self.parse_item()
         while (self.value in ("(", "[") or
-               self.type in (self.tokens.NAME, self.tokens.STRING)):
+               self.type in (tokenize.NAME, tokenize.STRING)):
             c, d = self.parse_item()
             b.addarc(c)
             b = d

@@ -320,7 +304,7 @@ def parse_item(self):
         if self.value == "[":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, "]")
+            self.expect(tokenize.OP, "]")
             a.addarc(z)
             return a, z
         else:

@@ -340,9 +324,9 @@ def parse_atom(self):
         if self.value == "(":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, ")")
+            self.expect(tokenize.OP, ")")
             return a, z
-        elif self.type in (self.tokens.NAME, self.tokens.STRING):
+        elif self.type in (tokenize.NAME, tokenize.STRING):
             a = NFAState()
             z = NFAState()
             a.addarc(z, self.value)

@@ -365,7 +349,7 @@ def gettoken(self):
         while tup[0] in (tokenize.COMMENT, tokenize.NL):
             tup = next(self.generator)
         self.type, self.value, self.begin, self.end, self.line = tup
-        #print self.tokens['tok_name'][self.type], repr(self.value)
+        #print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))

     def raise_error(self, msg, *args):
         if args:
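
The core of this change is that token and operator lookups now go through plain dicts built from Grammar/Tokens instead of attributes of a hand-loaded token module. A self-contained sketch of the two-step operator resolution in make_label, with illustrative numeric ids:

tokens = {"ENDMARKER": 0, "NAME": 1, "NOTEQUAL": 54}  # as from generate_tokens
opmap = {"!=": "NOTEQUAL", "<>": "NOTEQUAL"}          # as from generate_opmap

tok_name = opmap["<>"]      # operator text -> token name: "NOTEQUAL"
itoken = tokens[tok_name]   # token name -> numeric token id: 54
assert itoken == tokens[opmap["!="]]  # '<>' and '!=' share one token id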

Parser/pgen/token.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+    numbers = itertools.count(0)
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        name = line.split()[0]
+        yield (name, next(numbers))
+
+    yield ('N_TOKENS', next(numbers))
+    yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        pieces = line.split()
+
+        if len(pieces) != 2:
+            continue
+
+        name, op = pieces
+        yield (op.strip("'"), name)
+
+    # Yield independently <>. This is needed so it does not collide
+    # with the token generation in "generate_tokens" because if this
+    # symbol is included in Grammar/Tokens, it will collide with !=
+    # as it has the same name (NOTEQUAL).
+    yield ('<>', 'NOTEQUAL')
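
A quick demonstration of both generators on a few lines in the Grammar/Tokens format (one token name per line, with operators followed by their quoted literal; the sample lines are illustrative):

sample = [
    "# comments and blank lines are skipped\n",
    "ENDMARKER\n",
    "NAME\n",
    "NOTEQUAL '!='\n",
]

print(dict(generate_tokens(sample)))
# {'ENDMARKER': 0, 'NAME': 1, 'NOTEQUAL': 2, 'N_TOKENS': 3, 'NT_OFFSET': 256}
print(dict(generate_opmap(sample)))
# {'!=': 'NOTEQUAL', '<>': 'NOTEQUAL'}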
