| #!/usr/bin/env python3 |
| # Copyright 2017 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import argparse |
| import colorsys |
| import difflib |
| import html |
import os
import random
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import textwrap |
| import webbrowser |
| |
| |
class TokenContext(object):
| """Metadata about a token. |
| |
| Attributes: |
| row: Row index of the token in the data file. |
| column: Column index of the token in the data file. |
| token: The token string. |
| commit: A Commit object that corresponds to the commit that added |
| this token. |
| """ |
| |
| def __init__(self, row, column, token, commit=None): |
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit
| |
| |
class Commit(object):
| """Commit data. |
| |
| Attributes: |
| hash: The commit hash. |
| author_name: The author's name. |
    author_email: The author's email.
| author_date: The date and time the author created this commit. |
| message: The commit message. |
| diff: The commit diff. |
| """ |
| |
| def __init__(self, hash, author_name, author_email, author_date, message, |
| diff): |
    self.hash = hash
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date
    self.message = message
    self.diff = diff
| |
| |
| def tokenize_data(data, tokenize_by_char, tokenize_whitespace): |
| """Tokenizes |data|. |
| |
| Args: |
| data: String to tokenize. |
| tokenize_by_char: If true, individual characters are treated as tokens. |
| Otherwise, tokens are either symbols or strings of both alphanumeric |
| characters and underscores. |
| tokenize_whitespace: Treat non-newline whitespace characters as tokens. |
| |
| Returns: |
| A list of lists of TokenContexts. Each list represents a line. |
| """ |
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
      in_identifier = False
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  contexts.append(line_contexts)
  return contexts
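# Illustrative sketch (comment only, not executed): for a hypothetical call
# like tokenize_data('int x;\n', False, False), the expected result is two
# line lists roughly equivalent to
#   [[TokenContext(0, 0, 'int'), TokenContext(0, 4, 'x'),
#     TokenContext(0, 5, ';')], []]
# i.e. the identifier is grouped, whitespace is skipped, and the trailing
# newline yields a final empty line list.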
| |
| |
| def compute_unified_diff(old_tokens, new_tokens): |
| """Computes the diff between |old_tokens| and |new_tokens|. |
| |
| Args: |
| old_tokens: Token strings corresponding to the old data. |
| new_tokens: Token strings corresponding to the new data. |
| |
| Returns: |
| The diff, in unified diff format. |
| """ |
| return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='') |
| |
| |
| def parse_chunk_header_file_range(file_range): |
| """Parses a chunk header file range. |
| |
| Diff chunk headers have the form: |
| @@ -<file-range> +<file-range> @@ |
| File ranges have the form: |
| <start line number>,<number of lines changed> |
| |
| Args: |
| file_range: A chunk header file range. |
| |
| Returns: |
| A tuple (range_start, range_end). The endpoints are adjusted such that |
| iterating over [range_start, range_end) will give the changed indices. |
| """ |
  if ',' in file_range:
    file_range_parts = file_range.split(',')
    start = int(file_range_parts[0])
    amount = int(file_range_parts[1])
    if amount == 0:
      return (start, start)
    return (start - 1, start + amount - 1)
  else:
    return (int(file_range) - 1, int(file_range))
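# Illustrative sketch (comment only, not executed): for hypothetical ranges,
#   parse_chunk_header_file_range('12,3')  ->  (11, 14)   # lines 12-14
#   parse_chunk_header_file_range('7')     ->  (6, 7)     # line 7 only
#   parse_chunk_header_file_range('5,0')   ->  (5, 5)     # empty range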
| |
| |
| def compute_changed_token_indices(previous_tokens, current_tokens): |
| """Computes changed and added tokens. |
| |
| Args: |
| previous_tokens: Tokens corresponding to the old file. |
| current_tokens: Tokens corresponding to the new file. |
| |
| Returns: |
| A tuple (added_tokens, changed_tokens). |
| added_tokens: A list of indices into |current_tokens|. |
| changed_tokens: A map of indices into |current_tokens| to |
| indices into |previous_tokens|. |
| """ |
  prev_file_chunk_end = 0
  prev_patched_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for line in compute_unified_diff(previous_tokens, current_tokens):
    if line.startswith('@@'):
      parts = line.split(' ')
      removed = parts[1].lstrip('-')
      removed_start, removed_end = parse_chunk_header_file_range(removed)
      added = parts[2].lstrip('+')
      added_start, added_end = parse_chunk_header_file_range(added)
      for i in range(added_start, added_end):
        added_tokens.append(i)
      for i in range(0, removed_start - prev_patched_chunk_end):
        changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
      prev_patched_chunk_end = removed_end
      prev_file_chunk_end = added_end
  for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
    changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
| return added_tokens, changed_tokens |
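# Illustrative sketch (comment only, not executed): for hypothetical token
# lists previous = ['a', 'b', 'c'] and current = ['a', 'x', 'c'], the expected
# result is added_tokens == [1] (the new 'x') and changed_tokens ==
# {0: 0, 2: 2}, mapping the surviving tokens back to their old indices.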
| |
| |
| def flatten_nested_list(l): |
| """Flattens a list and provides a mapping from elements in the list back |
| into the nested list. |
| |
| Args: |
| l: A list of lists. |
| |
| Returns: |
| A tuple (flattened, index_to_position): |
| flattened: The flattened list. |
      index_to_position: A dict mapping each index i into |flattened| to the
        pair (r, c) such that flattened[i] == l[r][c].
| """ |
  flattened = []
  index_to_position = {}
  r = 0
  c = 0
  for nested_list in l:
    for element in nested_list:
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
      c += 1
    r += 1
    c = 0
  return (flattened, index_to_position)
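# Illustrative sketch (comment only, not executed): for the hypothetical input
# [['a', 'b'], ['c']], the expected result is
#   flattened         == ['a', 'b', 'c']
#   index_to_position == {0: (0, 0), 1: (0, 1), 2: (1, 0)}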
| |
| |
| def compute_changed_token_positions(previous_tokens, current_tokens): |
| """Computes changed and added token positions. |
| |
| Args: |
| previous_tokens: A list of lists of token strings. Lines in the file |
| correspond to the nested lists. |
| current_tokens: A list of lists of token strings. Lines in the file |
| correspond to the nested lists. |
| |
| Returns: |
| A tuple (added_token_positions, changed_token_positions): |
| added_token_positions: A list of pairs that index into |current_tokens|. |
| changed_token_positions: A map from pairs that index into |
| |current_tokens| to pairs that index into |previous_tokens|. |
| """ |
  flat_previous_tokens, previous_index_to_position = flatten_nested_list(
      previous_tokens)
  flat_current_tokens, current_index_to_position = flatten_nested_list(
      current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous_tokens, flat_current_tokens)
  added_token_positions = [current_index_to_position[i] for i in added_indices]
  changed_token_positions = {
      current_index_to_position[current_i]:
      previous_index_to_position[changed_indices[current_i]]
      for current_i in changed_indices
  }
  return (added_token_positions, changed_token_positions)
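# Illustrative sketch (comment only, not executed): for hypothetical inputs
# previous_tokens = [['a', 'b'], ['c']] and current_tokens = [['a', 'x'], ['c']],
# the expected result is added_token_positions == [(0, 1)] and
# changed_token_positions == {(0, 0): (0, 0), (1, 0): (1, 0)}.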
| |
| |
| def parse_chunks_from_diff(diff): |
| """Returns a generator of chunk data from a diff. |
| |
| Args: |
| diff: A list of strings, with each string being a line from a diff |
| in unified diff format. |
| |
| Returns: |
| A generator of tuples (added_lines_start, added_lines_end, removed_lines) |
| """ |
  it = iter(diff)
  for line in it:
    while not line.startswith('@@'):
      try:
        line = next(it)
      except StopIteration:
        return
    parts = line.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    while previous_start < previous_end or current_start < current_end:
      line = next(it)
      firstchar = line[0]
      line = line[1:]
      if not in_delta and (firstchar == '-' or firstchar == '+'):
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
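# Illustrative sketch (comment only, not executed): for a hypothetical diff
#   ['@@ -1,2 +1,2 @@', ' keep', '-old', '+new']
# the generator is expected to yield a single chunk (1, 2, ['old']), i.e. the
# half-open 0-based line range [1, 2) in the new file plus the removed line
# 'old'.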
| |
| |
| def should_skip_commit(commit): |
| """Decides if |commit| should be skipped when computing the blame. |
| |
| Commit 5d4451e deleted all files in the repo except for DEPS. The |
| next commit, 1e7896, brought them back. This is a hack to skip |
| those commits (except for the files they modified). If we did not |
| do this, changes would be incorrectly attributed to 1e7896. |
| |
| Args: |
| commit: A Commit object. |
| |
| Returns: |
| A boolean indicating if this commit should be skipped. |
| """ |
  banned_commits = [
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  ]
  if commit.hash not in banned_commits:
    return False
  banned_commits_file_exceptions = [
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  ]
  for line in commit.diff:
    if line.startswith('---') or line.startswith('+++'):
      if line.split(' ')[1] in banned_commits_file_exceptions:
        return False
    elif line.startswith('@@'):
      return True
  assert False
| |
| |
| def generate_substrings(file): |
| """Generates substrings from a file stream, where substrings are |
| separated by '\0'. |
| |
| For example, the input: |
| 'a\0bc\0\0\0d\0' |
| would produce the output: |
| ['a', 'bc', 'd'] |
| |
| Args: |
| file: A readable file. |
| """ |
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  data = []
  while True:
    buf = file.read(BUF_SIZE)
    parts = buf.split(b'\0')
    data.append(parts[0])
    if len(parts) > 1:
      joined = b''.join(data)
      if joined != b'':
        yield joined.decode()
      for i in range(1, len(parts) - 1):
        if parts[i] != b'':
          yield parts[i].decode()
      data = [parts[-1]]
    if len(buf) < BUF_SIZE:
      joined = b''.join(data)
      if joined != b'':
        yield joined.decode()
      return
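# Illustrative sketch (comment only, not executed): with a hypothetical
# in-memory stream such as io.BytesIO(b'a\x00bc\x00\x00\x00d\x00') (io would
# need to be imported for a real test), the generator is expected to yield
# 'a', 'bc', 'd', matching the docstring above.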
| |
| |
| def generate_commits(git_log_stdout): |
| """Parses git log output into a stream of Commit objects. |
| """ |
  substring_generator = generate_substrings(git_log_stdout)
  try:
    while True:
      hash = next(substring_generator)
      author_name = next(substring_generator)
      author_email = next(substring_generator)
      author_date = next(substring_generator)
      message = next(substring_generator).rstrip('\n')
      diff = next(substring_generator).split('\n')[1:-1]
      yield Commit(hash, author_name, author_email, author_date, message, diff)
  except StopIteration:
| pass |
| |
| |
| def uberblame_aux(file_name, git_log_stdout, data, tokenization_method): |
| """Computes the uberblame of file |file_name|. |
| |
| Args: |
| file_name: File to uberblame. |
| git_log_stdout: A file object that represents the git log output. |
| data: A string containing the data of file |file_name|. |
| tokenization_method: A function that takes a string and returns a list of |
| TokenContexts. |
| |
| Returns: |
| A tuple (data, blame). |
| data: File contents. |
| blame: A list of TokenContexts. |
| """ |
  blame = tokenization_method(data)

  blamed_tokens = 0
  uber_blame = (data, blame[:])
| |
  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
        blamed_tokens += 1
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  assert blame == [] or blame == [[]]
| return uber_blame |
| |
| |
| def uberblame(file_name, revision, tokenization_method): |
| """Computes the uberblame of file |file_name|. |
| |
| Args: |
| file_name: File to uberblame. |
| revision: The revision to start the uberblame at. |
| tokenization_method: A function that takes a string and returns a list of |
| TokenContexts. |
| |
| Returns: |
| A tuple (data, blame). |
| data: File contents. |
| blame: A list of TokenContexts. |
| """ |
  DIFF_CONTEXT = 3
  cmd_git_log = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z',
      '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  git_log = subprocess.Popen(
      cmd_git_log, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
  data, blame = uberblame_aux(file_name, git_log.stdout, data,
                              tokenization_method)

  stderr = git_log.communicate()[1].decode()
  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr)
  return data, blame
| |
| |
| def generate_pastel_color(): |
| """Generates a random color from a nice looking pastel palette. |
| |
| Returns: |
| The color, formatted as hex string. For example, white is "#FFFFFF". |
| """ |
  (h, l, s) = (random.uniform(0, 1), random.uniform(0.8, 0.9),
               random.uniform(0.5, 1))
  (r, g, b) = colorsys.hls_to_rgb(h, l, s)
  return "#%0.2X%0.2X%0.2X" % (int(r * 255), int(g * 255), int(b * 255))
| |
| |
| def colorize_diff(diff): |
| """Colorizes a diff for use in an HTML page. |
| |
| Args: |
| diff: The diff, in unified diff format, as a list of line strings. |
| |
| Returns: |
| The HTML-formatted diff, as a string. The diff will already be escaped. |
| """ |
| |
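  # The attribute quotes below are written as \" because this markup is later
  # embedded inside a double-quoted JavaScript string literal by
  # create_visualization().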
  colorized = []
  for line in diff:
    escaped = html.escape(line.replace('\r', ''), quote=True)
    if line.startswith('+'):
      colorized.append('<span class=\\"addition\\">%s</span>' % escaped)
    elif line.startswith('-'):
      colorized.append('<span class=\\"deletion\\">%s</span>' % escaped)
    elif line.startswith('@@'):
      context_begin = escaped.find('@@', 2)
      assert context_begin != -1
      colorized.append(
          '<span class=\\"chunk_meta\\">%s</span>'
          '<span class=\\"chunk_context\\">%s</span>'
          % (escaped[0:context_begin + 2], escaped[context_begin + 2:]))
    elif line.startswith('diff') or line.startswith('index'):
      colorized.append('<span class=\\"file_header\\">%s</span>' % escaped)
    else:
      colorized.append('<span class=\\"context_line\\">%s</span>' % escaped)
  return '\n'.join(colorized)
| |
| |
| def create_visualization(data, blame): |
| """Creates a web page to visualize |blame|. |
| |
| Args: |
| data: The data file as returned by uberblame(). |
| blame: A list of TokenContexts as returned by uberblame(). |
| |
| Returns: |
| The HTML for the generated page, as a string. |
| """ |
| # Use the same seed for the color generator on each run so that |
| # loading the same blame of the same file twice will result in the |
| # same generated HTML page. |
| random.seed(0x52937865ec62d1ea) |
| page="""\ |
| <html> |
| <head> |
| <style> |
| body { |
| font-family: monospace; |
| } |
| pre { |
| display: inline; |
| } |
| .token { |
| outline: 1pt solid #00000030; |
| outline-offset: -1pt; |
| cursor: pointer; |
| } |
| .addition { |
| color: #080; |
| } |
| .deletion { |
| color: #c00; |
| } |
| .chunk_meta { |
| color: #099; |
| } |
| .context_line .chunk_context { |
| // Just normal text. |
| } |
| .file_header { |
| font-weight: bold; |
| } |
| #linenums { |
| text-align: right; |
| } |
| #file_display { |
| position: absolute; |
| left: 0; |
| top: 0; |
| width: 50%%; |
| height: 100%%; |
| overflow: scroll; |
| } |
| #commit_display_container { |
| position: absolute; |
| left: 50%%; |
| top: 0; |
| width: 50%%; |
| height: 100%%; |
| overflow: scroll; |
| } |
| </style> |
| <script> |
| commit_data = %s; |
| function display_commit(hash) { |
| var e = document.getElementById("commit_display"); |
| e.innerHTML = commit_data[hash] |
| } |
| </script> |
| </head> |
| <body> |
| <div id="file_display"> |
| <table> |
| <tbody> |
| <tr> |
| <td valign="top" id="linenums"> |
| <pre>%s</pre> |
| </td> |
| <td valign="top"> |
| <pre>%s</pre> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <div id="commit_display_container" valign="top"> |
| <pre id="commit_display" /> |
| </div> |
| </body> |
| </html> |
| """ |
  page = textwrap.dedent(page)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
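  # Walk the file character by character, opening a colored, clickable <span>
  # at the first token of each run of consecutive tokens from the same commit
  # and closing it after the run's last token.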
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            hash = token_context.commit.hash
            commits[hash] = token_context.commit
            if hash not in commit_colors:
              commit_colors[hash] = generate_pastel_color()
            color = commit_colors[hash]
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit(&quot;%s&quot;)">') %
                         (color, hash))
      lines.append(html.escape(c))
      column += 1
    row += 1
  commit_data = ['{\n']
  commit_display_format = """\
| commit: {hash} |
| Author: {author_name} <{author_email}> |
| Date: {author_date} |
| |
| {message} |
| |
| """ |
  commit_display_format = textwrap.dedent(commit_display_format)
  links = re.compile(r'(https?:\/\/\S+)')
  for hash in commits:
    commit = commits[hash]
    commit_display = commit_display_format.format(
        hash=hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    commit_display = html.escape(commit_display, quote=True)
    commit_display += colorize_diff(commit.diff)
    commit_display = re.sub(links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",\n' % (hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
| |
| |
| def show_visualization(page): |
| """Display |html| in a web browser. |
| |
| Args: |
| html: The contents of the file to display, as a string. |
| """ |
| # Keep the temporary file around so the browser has time to open it. |
| # TODO(thomasanderson): spin up a temporary web server to serve this |
| # file so we don't have to leak it. |
  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
  html_file.write(page.encode())
  html_file.flush()
| if sys.platform.startswith('linux'): |
| # Don't show any messages when starting the browser. |
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    os.close(1)
    os.close(2)
    os.open(os.devnull, os.O_RDWR)
    os.open(os.devnull, os.O_RDWR)
  webbrowser.open('file://' + html_file.name)
  if sys.platform.startswith('linux'):
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)
| |
| |
| def main(argv): |
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file.')
| parser.add_argument( |
| 'revision', |
| default='HEAD', |
| nargs='?', |
| help='show only commits starting from a revision') |
| parser.add_argument('file', help='the file to uberblame') |
| parser.add_argument( |
| '--skip-visualization', |
| action='store_true', |
| help='do not display the blame visualization in a web browser') |
| parser.add_argument( |
| '--tokenize-by-char', |
| action='store_true', |
| help='treat individual characters as tokens') |
| parser.add_argument( |
| '--tokenize-whitespace', |
| action='store_true', |
| help='also blame non-newline whitespace characters') |
  args = parser.parse_args(argv)
| |
| def tokenization_method(data): |
| return tokenize_data(data, args.tokenize_by_char, args.tokenize_whitespace) |
| |
  data, blame = uberblame(args.file, args.revision, tokenization_method)
  page = create_visualization(data, blame)
  if not args.skip_visualization:
    show_visualization(page)
  return 0
| |
| |
if __name__ == '__main__':
| sys.exit(main(sys.argv[1:])) |