| #!/usr/bin/env python3 |
| # Copyright 2017 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import argparse |
| import colorsys |
| import difflib |
| import html |
import os
import random
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import textwrap |
| import webbrowser |
| |
| |
class TokenContext(object):
| """Metadata about a token. |
| |
| Attributes: |
| row: Row index of the token in the data file. |
| column: Column index of the token in the data file. |
| token: The token string. |
| commit: A Commit object that corresponds to the commit that added |
| this token. |
| """ |
| |
| def __init__(self, row, column, token, commit=None): |
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit
| |
| |
class Commit(object):
| """Commit data. |
| |
| Attributes: |
| hash: The commit hash. |
| author_name: The author's name. |
    author_email: The author's email.
| author_date: The date and time the author created this commit. |
| message: The commit message. |
| diff: The commit diff. |
| """ |
| |
| def __init__(self, hash, author_name, author_email, author_date, message, |
| diff): |
    self.hash = hash
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date
    self.message = message
    self.diff = diff
| |
| |
| def tokenize_data(data, tokenize_by_char, tokenize_whitespace): |
| """Tokenizes |data|. |
| |
| Args: |
| data: String to tokenize. |
| tokenize_by_char: If true, individual characters are treated as tokens. |
| Otherwise, tokens are either symbols or strings of both alphanumeric |
| characters and underscores. |
| tokenize_whitespace: Treat non-newline whitespace characters as tokens. |
| |
| Returns: |
| A list of lists of TokenContexts. Each list represents a line. |
| """ |
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
      in_identifier = False
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  contexts.append(line_contexts)
  return contexts
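# Illustrative sketch (comment only, not executed): for a hypothetical call
# like tokenize_data('int x;\n', False, False), the expected result is two
# line lists roughly equivalent to
#   [[TokenContext(0, 0, 'int'), TokenContext(0, 4, 'x'),
#     TokenContext(0, 5, ';')], []]
# i.e. the identifier is grouped, whitespace is skipped, and the trailing
# newline yields a final empty line list.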
| |
| |
| def compute_unified_diff(old_tokens, new_tokens): |
| """Computes the diff between |old_tokens| and |new_tokens|. |
| |
| Args: |
| old_tokens: Token strings corresponding to the old data. |
| new_tokens: Token strings corresponding to the new data. |
| |
| Returns: |
| The diff, in unified diff format. |
| """ |
| return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='') |
| |
| |
| def parse_chunk_header_file_range(file_range): |
| """Parses a chunk header file range. |
| |
| Diff chunk headers have the form: |
| @@ -<file-range> +<file-range> @@ |
| File ranges have the form: |
| <start line number>,<number of lines changed> |
| |
| Args: |
| file_range: A chunk header file range. |
| |
| Returns: |
| A tuple (range_start, range_end). The endpoints are adjusted such that |
| iterating over [range_start, range_end) will give the changed indices. |
| """ |
  if ',' in file_range:
    file_range_parts = file_range.split(',')
    start = int(file_range_parts[0])
    amount = int(file_range_parts[1])
    if amount == 0:
      return (start, start)
    return (start - 1, start + amount - 1)
  else:
    return (int(file_range) - 1, int(file_range))
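# Illustrative sketch (comment only, not executed): for hypothetical ranges,
#   parse_chunk_header_file_range('12,3')  ->  (11, 14)   # lines 12-14
#   parse_chunk_header_file_range('7')     ->  (6, 7)     # line 7 only
#   parse_chunk_header_file_range('5,0')   ->  (5, 5)     # empty range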
| |
| |
| def compute_changed_token_indices(previous_tokens, current_tokens): |
| """Computes changed and added tokens. |
| |
| Args: |
| previous_tokens: Tokens corresponding to the old file. |
| current_tokens: Tokens corresponding to the new file. |
| |
| Returns: |
| A tuple (added_tokens, changed_tokens). |
| added_tokens: A list of indices into |current_tokens|. |
| changed_tokens: A map of indices into |current_tokens| to |
| indices into |previous_tokens|. |
| """ |
  prev_file_chunk_end = 0
  prev_patched_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for line in compute_unified_diff(previous_tokens, current_tokens):
    if line.startswith('@@'):
      parts = line.split(' ')
      removed = parts[1].lstrip('-')
      removed_start, removed_end = parse_chunk_header_file_range(removed)
      added = parts[2].lstrip('+')
      added_start, added_end = parse_chunk_header_file_range(added)
      for i in range(added_start, added_end):
        added_tokens.append(i)
      for i in range(0, removed_start - prev_patched_chunk_end):
        changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
      prev_patched_chunk_end = removed_end
      prev_file_chunk_end = added_end
  for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
    changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
| return added_tokens, changed_tokens |
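# Illustrative sketch (comment only, not executed): for hypothetical token
# lists previous = ['a', 'b', 'c'] and current = ['a', 'x', 'c'], the expected
# result is added_tokens == [1] (the new 'x') and changed_tokens ==
# {0: 0, 2: 2}, mapping the surviving tokens back to their old indices.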
| |
| |
| def flatten_nested_list(l): |
| """Flattens a list and provides a mapping from elements in the list back |
| into the nested list. |
| |
| Args: |
| l: A list of lists. |
| |
| Returns: |
| A tuple (flattened, index_to_position): |
| flattened: The flattened list. |
      index_to_position: A dict mapping each index i into |flattened| to the
        pair (r, c) such that flattened[i] == l[r][c].
| """ |
  flattened = []
  index_to_position = {}
  r = 0
  c = 0
  for nested_list in l:
    for element in nested_list:
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
      c += 1
    r += 1
    c = 0
  return (flattened, index_to_position)
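# Illustrative sketch (comment only, not executed): for the hypothetical input
# [['a', 'b'], ['c']], the expected result is
#   flattened         == ['a', 'b', 'c']
#   index_to_position == {0: (0, 0), 1: (0, 1), 2: (1, 0)}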
| |
| |
| def compute_changed_token_positions(previous_tokens, current_tokens): |
| """Computes changed and added token positions. |
| |
| Args: |
| previous_tokens: A list of lists of token strings. Lines in the file |
| correspond to the nested lists. |
| current_tokens: A list of lists of token strings. Lines in the file |
| correspond to the nested lists. |
| |
| Returns: |
| A tuple (added_token_positions, changed_token_positions): |
| added_token_positions: A list of pairs that index into |current_tokens|. |
| changed_token_positions: A map from pairs that index into |
| |current_tokens| to pairs that index into |previous_tokens|. |
| """ |
  flat_previous_tokens, previous_index_to_position = flatten_nested_list(
      previous_tokens)
  flat_current_tokens, current_index_to_position = flatten_nested_list(
      current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous_tokens, flat_current_tokens)
  added_token_positions = [current_index_to_position[i] for i in added_indices]
  changed_token_positions = {
      current_index_to_position[current_i]:
      previous_index_to_position[changed_indices[current_i]]
      for current_i in changed_indices
  }
  return (added_token_positions, changed_token_positions)
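# Illustrative sketch (comment only, not executed): for hypothetical inputs
# previous_tokens = [['a', 'b'], ['c']] and current_tokens = [['a', 'x'], ['c']],
# the expected result is added_token_positions == [(0, 1)] and
# changed_token_positions == {(0, 0): (0, 0), (1, 0): (1, 0)}.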
| |
| |
| def parse_chunks_from_diff(diff): |
| """Returns a generator of chunk data from a diff. |
| |
| Args: |
| diff: A list of strings, with each string being a line from a diff |
| in unified diff format. |
| |
| Returns: |
| A generator of tuples (added_lines_start, added_lines_end, removed_lines) |
| """ |
  it = iter(diff)
  for line in it:
    while not line.startswith('@@'):
      try:
        line = next(it)
      except StopIteration:
        return
    parts = line.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    while previous_start < previous_end or current_start < current_end:
      line = next(it)
      firstchar = line[0]
      line = line[1:]
      if not in_delta and (firstchar == '-' or firstchar == '+'):
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
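# Illustrative sketch (comment only, not executed): for a hypothetical diff
#   ['@@ -1,2 +1,2 @@', ' keep', '-old', '+new']
# the generator is expected to yield a single chunk (1, 2, ['old']), i.e. the
# half-open 0-based line range [1, 2) in the new file plus the removed line
# 'old'.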
| |
| |
| def should_skip_commit(commit): |
| """Decides if |commit| should be skipped when computing the blame. |
| |
| Commit 5d4451e deleted all files in the repo except for DEPS. The |
| next commit, 1e7896, brought them back. This is a hack to skip |
| those commits (except for the files they modified). If we did not |
| do this, changes would be incorrectly attributed to 1e7896. |
| |
| Args: |
| commit: A Commit object. |
| |
| Returns: |
| A boolean indicating if this commit should be skipped. |
| """ |
  banned_commits = [
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  ]
  if commit.hash not in banned_commits:
    return False
  banned_commits_file_exceptions = [
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  ]
  for line in commit.diff:
    if line.startswith('---') or line.startswith('+++'):
      if line.split(' ')[1] in banned_commits_file_exceptions:
        return False
    elif line.startswith('@@'):
      return True
  assert False
| |
| |
| def generate_substrings(file): |
| """Generates substrings from a file stream, where substrings are |
| separated by '\0'. |
| |
| For example, the input: |
| 'a\0bc\0\0\0d\0' |
| would produce the output: |
| ['a', 'bc', 'd'] |
| |
| Args: |
| file: A readable file. |
| """ |
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  data = []
  while True:
    buf = file.read(BUF_SIZE)
    parts = buf.split(b'\0')
    data.append(parts[0])
    if len(parts) > 1:
      joined = b''.join(data)
      if joined != b'':
        yield joined.decode()
      for i in range(1, len(parts) - 1):
        if parts[i] != b'':
          yield parts[i].decode()
      data = [parts[-1]]
    if len(buf) < BUF_SIZE:
      joined = b''.join(data)
      if joined != b'':
        yield joined.decode()
      return
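# Illustrative sketch (comment only, not executed): with a hypothetical
# in-memory stream such as io.BytesIO(b'a\x00bc\x00\x00\x00d\x00') (io would
# need to be imported for a real test), the generator is expected to yield
# 'a', 'bc', 'd', matching the docstring above.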
| |
| |
| def generate_commits(git_log_stdout): |
| """Parses git log output into a stream of Commit objects. |
| """ |
  substring_generator = generate_substrings(git_log_stdout)
  try:
    while True:
      hash = next(substring_generator)
      author_name = next(substring_generator)
      author_email = next(substring_generator)
      author_date = next(substring_generator)
      message = next(substring_generator).rstrip('\n')
      diff = next(substring_generator).split('\n')[1:-1]
      yield Commit(hash, author_name, author_email, author_date, message, diff)
  except StopIteration:
| pass |
| |
| |
| def uberblame_aux(file_name, git_log_stdout, data, tokenization_method): |
| """Computes the uberblame of file |file_name|. |
| |
| Args: |
| file_name: File to uberblame. |
| git_log_stdout: A file object that represents the git log output. |
| data: A string containing the data of file |file_name|. |
| tokenization_method: A function that takes a string and returns a list of |
| TokenContexts. |
| |
| Returns: |
| A tuple (data, blame). |
| data: File contents. |
| blame: A list of TokenContexts. |
| """ |
  blame = tokenization_method(data)

  blamed_tokens = 0
  uber_blame = (data, blame[:])
| |
  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
        blamed_tokens += 1
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  assert blame == [] or blame == [[]]
| return uber_blame |
| |
| |
| def uberblame(file_name, revision, tokenization_method): |
| """Computes the uberblame of file |file_name|. |
| |
| Args: |
| file_name: File to uberblame. |
| revision: The revision to start the uberblame at. |
| tokenization_method: A function that takes a string and returns a list of |
| TokenContexts. |
| |
| Returns: |
| A tuple (data, blame). |
| data: File contents. |
| blame: A list of TokenContexts. |
| """ |
  DIFF_CONTEXT = 3
  cmd_git_log = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z',
      '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  git_log = subprocess.Popen(
      cmd_git_log, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
  data, blame = uberblame_aux(file_name, git_log.stdout, data,
                              tokenization_method)

  stderr = git_log.communicate()[1].decode()
  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr)
  return data, blame
| |
| |
| def generate_pastel_color(): |
| """Generates a random color from a nice looking pastel palette. |
| |
| Returns: |
| The color, formatted as hex string. For example, white is "#FFFFFF". |
| """ |
  (h, l, s) = (random.uniform(0, 1), random.uniform(0.8, 0.9),
               random.uniform(0.5, 1))
  (r, g, b) = colorsys.hls_to_rgb(h, l, s)
  return "#%0.2X%0.2X%0.2X" % (int(r * 255), int(g * 255), int(b * 255))
| |
| |
| def colorize_diff(diff): |
| """Colorizes a diff for use in an HTML page. |
| |
| Args: |
| diff: The diff, in unified diff format, as a list of line strings. |
| |
| Returns: |
| The HTML-formatted diff, as a string. The diff will already be escaped. |
| """ |
| |
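  # The attribute quotes below are written as \" because this markup is later
  # embedded inside a double-quoted JavaScript string literal by
  # create_visualization().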
  colorized = []
  for line in diff:
    escaped = html.escape(line.replace('\r', ''), quote=True)
    if line.startswith('+'):
      colorized.append('<span class=\\"addition\\">%s</span>' % escaped)
    elif line.startswith('-'):
      colorized.append('<span class=\\"deletion\\">%s</span>' % escaped)
    elif line.startswith('@@'):
      context_begin = escaped.find('@@', 2)
      assert context_begin != -1
      colorized.append(
          '<span class=\\"chunk_meta\\">%s</span>'
          '<span class=\\"chunk_context\\">%s</span>'
          % (escaped[0:context_begin + 2], escaped[context_begin + 2:]))
    elif line.startswith('diff') or line.startswith('index'):
      colorized.append('<span class=\\"file_header\\">%s</span>' % escaped)
    else:
      colorized.append('<span class=\\"context_line\\">%s</span>' % escaped)
  return '\n'.join(colorized)
| |
| |
| def create_visualization(data, blame): |
| """Creates a web page to visualize |blame|. |
| |
| Args: |
| data: The data file as returned by uberblame(). |
| blame: A list of TokenContexts as returned by uberblame(). |
| |
| Returns: |
| The HTML for the generated page, as a string. |
| """ |
| # Use the same seed for the color generator on each run so that |
| # loading the same blame of the same file twice will result in the |
| # same generated HTML page. |
| random.seed(0x52937865ec62d1ea) |
| page="""\ |
| <html> |
| <head> |
| <style> |
| body { |
| font-family: monospace; |
| } |
| pre { |
| display: inline; |
| } |
| .token { |
| outline: 1pt solid #00000030; |
| outline-offset: -1pt; |
| cursor: pointer; |
| } |
| .addition { |
| color: #080; |
| } |
| .deletion { |
| color: #c00; |
| } |
| .chunk_meta { |
| color: #099; |
| } |
| .context_line .chunk_context { |
| // Just normal text. |
| } |
| .file_header { |
| font-weight: bold; |
| } |
| #linenums { |
| text-align: right; |
| } |
| #file_display { |
| position: absolute; |
| left: 0; |
| top: 0; |
| width: 50%%; |
| height: 100%%; |
| overflow: scroll; |
| } |
| #commit_display_container { |
| position: absolute; |
| left: 50%%; |
| top: 0; |
| width: 50%%; |
| height: 100%%; |
| overflow: scroll; |
| } |
| </style> |
| <script> |
| commit_data = %s; |
| function display_commit(hash) { |
| var e = document.getElementById("commit_display"); |
| e.innerHTML = commit_data[hash] |
| } |
| </script> |
| </head> |
| <body> |
| <div id="file_display"> |
| <table> |
| <tbody> |
| <tr> |
| <td valign="top" id="linenums"> |
| <pre>%s</pre> |
| </td> |
| <td valign="top"> |
| <pre>%s</pre> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <div id="commit_display_container" valign="top"> |
| <pre id="commit_display" /> |
| </div> |
| </body> |
| </html> |
| """ |
  page = textwrap.dedent(page)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
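  # Walk the file character by character, opening a colored, clickable <span>
  # at the first token of each run of consecutive tokens from the same commit
  # and closing it after the run's last token.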
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            hash = token_context.commit.hash
            commits[hash] = token_context.commit
            if hash not in commit_colors:
              commit_colors[hash] = generate_pastel_color()
            color = commit_colors[hash]
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit(&quot;%s&quot;)">') %
                         (color, hash))
      lines.append(html.escape(c))
      column += 1
    row += 1
  commit_data = ['{\n']
  commit_display_format = """\
| commit: {hash} |
| Author: {author_name} <{author_email}> |
| Date: {author_date} |
| |
| {message} |
| |
| """ |
  commit_display_format = textwrap.dedent(commit_display_format)
  links = re.compile(r'(https?:\/\/\S+)')
  for hash in commits:
    commit = commits[hash]
    commit_display = commit_display_format.format(
        hash=hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    commit_display = html.escape(commit_display, quote=True)
    commit_display += colorize_diff(commit.diff)
    commit_display = re.sub(links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",\n' % (hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
| |
| |
| def show_visualization(page): |
| """Display |html| in a web browser. |
| |
| Args: |
| html: The contents of the file to display, as a string. |
| """ |
| # Keep the temporary file around so the browser has time to open it. |
| # TODO(thomasanderson): spin up a temporary web server to serve this |
| # file so we don't have to leak it. |
  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
  html_file.write(page.encode())
  html_file.flush()
| if sys.platform.startswith('linux'): |
| # Don't show any messages when starting the browser. |
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    os.close(1)
    os.close(2)
    os.open(os.devnull, os.O_RDWR)
    os.open(os.devnull, os.O_RDWR)
  webbrowser.open('file://' + html_file.name)
  if sys.platform.startswith('linux'):
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)
| |
| |
| def main(argv): |
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file.')
| parser.add_argument( |
| 'revision', |
| default='HEAD', |
| nargs='?', |
| help='show only commits starting from a revision') |
| parser.add_argument('file', help='the file to uberblame') |
| parser.add_argument( |
| '--skip-visualization', |
| action='store_true', |
| help='do not display the blame visualization in a web browser') |
| parser.add_argument( |
| '--tokenize-by-char', |
| action='store_true', |
| help='treat individual characters as tokens') |
| parser.add_argument( |
| '--tokenize-whitespace', |
| action='store_true', |
| help='also blame non-newline whitespace characters') |
  args = parser.parse_args(argv)
| |
| def tokenization_method(data): |
| return tokenize_data(data, args.tokenize_by_char, args.tokenize_whitespace) |
| |
  data, blame = uberblame(args.file, args.revision, tokenization_method)
  page = create_visualization(data, blame)
  if not args.skip_visualization:
    show_visualization(page)
  return 0
| |
| |
if __name__ == '__main__':
| sys.exit(main(sys.argv[1:])) |