@@ -213,6 +213,14 @@ def untokenize(self, iterable):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            elif tok_type == FSTRING_MIDDLE:
                if '{' in token or '}' in token:
                    end_line, end_col = end
                    end = (end_line, end_col + token.count('{') + token.count('}'))
                    token = re.sub('{', '{{', token)
                    token = re.sub('}', '}}', token)
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end

@@ -255,6 +263,11 @@ def compat(self, token, iterable):
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            elif toknum == FSTRING_MIDDLE:
                if '{' in tokval or '}' in tokval:
                    tokval = re.sub('{', '{{', tokval)
                    tokval = re.sub('}', '}}', tokval)
            toks_append(tokval)
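For context, a rough sketch (not part of the diff) of why the brace escaping in these two hunks is needed, assuming Python 3.12+ behaviour where f-strings are split into FSTRING_START/FSTRING_MIDDLE/FSTRING_END tokens: literal braces reach FSTRING_MIDDLE with their {{/}} escapes already decoded, so untokenize() has to re-double them (and widen the end column by the brace count) to emit valid source again.

import io
from tokenize import generate_tokens, untokenize

# Illustrative round-trip only: the FSTRING_MIDDLE token for the literal
# text carries single braces ("escaped {braces} and a "), so without the
# re-escaping added above the reconstructed f-string would lose its
# doubled braces and change meaning.
source = 'f"escaped {{braces}} and a {field}"\n'
tokens = list(generate_tokens(io.StringIO(source).readline))
print(untokenize(tokens))   # expected to print source equivalent to the input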
@@ -404,36 +417,6 @@ def open(filename):
        buffer.close()
        raise

def tokenize2(readline):
    encoding, consumed = detect_encoding(readline)
    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    yield from _tokenize2(rl_gen, encoding)

def _tokenize2(rl_gen, encoding):
    source = b"".join(rl_gen)
    token = None
    for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True):
        # TODO: Marta -> clean this up
        if 6 < token.type <= 54:
            token = token._replace(type=OP)
        if token.type in {ASYNC, AWAIT}:
            token = token._replace(type=NAME)
        if token.type == NEWLINE:
            l_start, c_start = token.start
            l_end, c_end = token.end
            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

        yield token
    if token is not None:
        last_line, _ = token.start
        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which

@@ -454,194 +437,33 @@ def tokenize(readline):
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end], strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token, (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set. If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token. If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token. This is looking for the matching end
                        # regex for the correct type of quote
                        # character. So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos+1), line)
                pos += 1
    yield from _tokenize(rl_gen, encoding)

def _tokenize(rl_gen, encoding):
    source = b"".join(rl_gen).decode(encoding)
    token = None
    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
        # TODO: Marta -> clean this up
        if 6 < token.type <= 54:
            token = token._replace(type=OP)
        if token.type in {ASYNC, AWAIT}:
            token = token._replace(type=NAME)
        if token.type == NEWLINE:
            l_start, c_start = token.start
            l_end, c_end = token.end
            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
        yield token
    if token is not None:
        last_line, _ = token.start
        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
tokenize = tokenize2

def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.
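As a quick illustration (not part of the diff, only a sketch of behaviour the rewrite is expected to preserve), the public tokenize() contract described in the docstring above stays the same even though the work is now delegated to the C tokenizer: a bytes readline goes in, an ENCODING token comes out first, ENDMARKER comes last, and async/await surface as plain NAME tokens, matching the type remapping in the new _tokenize().

import io
import tokenize as t

code = b"async def f():\n    await g()\n"
tokens = list(t.tokenize(io.BytesIO(code).readline))

assert tokens[0].type == t.ENCODING        # encoding is always reported first
assert tokens[-1].type == t.ENDMARKER      # stream is explicitly terminated
assert all(tok.type == t.NAME              # 'async'/'await' are plain NAMEs
           for tok in tokens if tok.string in ("async", "await"))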
@@ -658,7 +480,7 @@ def _gen():
            if not line:
                return
            yield line.encode()
    return _tokenize2(_gen(), 'utf-8')
    return _tokenize(_gen(), 'utf-8')

def main():
    import argparse
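For completeness, a small usage sketch (illustrative only, not part of the diff) of the str-based entry point that this last hunk rewires onto the shared _tokenize() helper:

import io
from tokenize import generate_tokens

# generate_tokens() accepts a readline returning str; internally the lines
# are encoded to UTF-8 and handed to the same _tokenize() path shown above.
for tok in generate_tokens(io.StringIO("x = 1\n").readline):
    print(tok.type, repr(tok.string), tok.start, tok.end)
# Expected output shape: NAME 'x', OP '=', NUMBER '1', NEWLINE, ENDMARKER,
# each with (row, column) start/end positions.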