From 01d2cde990f22d409e74e239de7e4d347102d6f6 Mon Sep 17 00:00:00 2001
From: "mgiuca@chromium.org"
Date: Fri, 5 Feb 2016 03:25:41 +0000
Subject: [PATCH] git hyper-blame: Added approx. line number translation.
Previously, when a commit was skipped, it would be blamed on the line
number the line had *after* the skipped commit. This could mean a
totally unrelated commit gets blamed. Now, a heuristic analyses the diff
of the skipped commit to discover approximately what line number the
line had *before* the skipped commit, so it can hopefully be blamed on
the right commit.
BUG=574290
Review URL: https://codereview.chromium.org/1629253002
git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@298609 0039d316-1c4b-4281-b951-d872f2087c98
---
git_common.py | 4 ++
git_hyper_blame.py | 124 ++++++++++++++++++++++++++++++----
man/html/git-hyper-blame.html | 23 ++++---
man/man1/git-hyper-blame.1 | 22 +++---
man/src/git-hyper-blame.txt | 17 +++--
tests/git_common_test.py | 17 +++++
tests/git_hyper_blame_test.py | 99 ++++++++++++++++++++++-----
7 files changed, 250 insertions(+), 56 deletions(-)
diff --git a/git_common.py b/git_common.py
index d571f820b..949ba4695 100644
--- a/git_common.py
+++ b/git_common.py
@@ -374,6 +374,10 @@ def del_config(option, scope='local'):
pass
+def diff(oldrev, newrev, *args):
+ return run('diff', oldrev, newrev, *args)
+
+
def freeze():
took_action = False
diff --git a/git_hyper_blame.py b/git_hyper_blame.py
index 17424511a..5a7daa0cf 100755
--- a/git_hyper_blame.py
+++ b/git_hyper_blame.py
@@ -149,6 +149,110 @@ def get_parsed_blame(filename, revision='HEAD'):
return list(parse_blame(blame))
+# Map from (oldrev, newrev) to hunk list (caching the results of git diff, but
+# only the hunk line numbers, not the actual diff contents).
+# hunk list contains (old, new) pairs, where old and new are (start, length)
+# pairs. A hunk list can also be None (if the diff failed).
+diff_hunks_cache = {}
+
+
+def cache_diff_hunks(oldrev, newrev):
+ def parse_start_length(s):
+ # Chop the '-' or '+'.
+ s = s[1:]
+ # Length is optional (defaults to 1).
+ try:
+ start, length = s.split(',')
+ except ValueError:
+ start = s
+ length = 1
+ return int(start), int(length)
+
+ try:
+ return diff_hunks_cache[(oldrev, newrev)]
+ except KeyError:
+ pass
+
+ # Use -U0 to get the smallest possible hunks.
+ diff = git_common.diff(oldrev, newrev, '-U0')
+
+ # Get all the hunks.
+ hunks = []
+ for line in diff.split('\n'):
+ if not line.startswith('@@'):
+ continue
+ ranges = line.split(' ', 3)[1:3]
+ ranges = tuple(parse_start_length(r) for r in ranges)
+ hunks.append(ranges)
+
+ diff_hunks_cache[(oldrev, newrev)] = hunks
+ return hunks
+
+
+def approx_lineno_across_revs(filename, newfilename, revision, newrevision,
+ lineno):
+ """Computes the approximate movement of a line number between two revisions.
+
+ Consider line |lineno| in |filename| at |revision|. This function computes the
+ line number of that line in |newfilename| at |newrevision|. This is
+ necessarily approximate.
+
+ Args:
+ filename: The file (within the repo) at |revision|.
+ newfilename: The name of the same file at |newrevision|.
+ revision: A git revision.
+ newrevision: Another git revision. Note: Can be ahead or behind |revision|.
+ lineno: Line number within |filename| at |revision|.
+
+ Returns:
+ Line number within |newfilename| at |newrevision|.
+ """
+ # This doesn't work that well if there are a lot of line changes within the
+ # hunk (demonstrated by GitHyperBlameLineMotionTest.testIntraHunkLineMotion).
+ # A fuzzy heuristic that takes the text of the new line and tries to find a
+ # deleted line within the hunk that mostly matches the new line could help.
+
+ # Use the : syntax to diff between two blobs. This is the
+ # only way to diff a file that has been renamed.
+ old = '%s:%s' % (revision, filename)
+ new = '%s:%s' % (newrevision, newfilename)
+ hunks = cache_diff_hunks(old, new)
+
+ cumulative_offset = 0
+
+ # Find the hunk containing lineno (if any).
+ for (oldstart, oldlength), (newstart, newlength) in hunks:
+ cumulative_offset += newlength - oldlength
+
+ if lineno >= oldstart + oldlength:
+ # Not there yet.
+ continue
+
+ if lineno < oldstart:
+ # Gone too far.
+ break
+
+ # lineno is in [oldstart, oldlength] at revision; [newstart, newlength] at
+ # newrevision.
+
+ # If newlength == 0, newstart will be the line before the deleted hunk.
+ # Since the line must have been deleted, just return that as the nearest
+ # line in the new file. Caution: newstart can be 0 in this case.
+ if newlength == 0:
+ return max(1, newstart)
+
+ newend = newstart + newlength - 1
+
+ # Move lineno based on the amount the entire hunk shifted.
+ lineno = lineno + newstart - oldstart
+ # Constrain the output within the range [newstart, newend].
+ return min(newend, max(newstart, lineno))
+
+ # Wasn't in a hunk. Figure out the line motion based on the difference in
+ # length between the hunks seen so far.
+ return lineno + cumulative_offset
+
+
def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
err=sys.stderr):
# Map from commit to parsed blame from that commit.
@@ -189,23 +293,19 @@ def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
# ignore this commit.
break
- # line.lineno_then is the line number in question at line.commit.
- # TODO(mgiuca): This will be incorrect if line.commit added or removed
- # lines. Translate that line number so that it refers to the position of
- # the same line on previouscommit.
- lineno_previous = line.lineno_then
+ # line.lineno_then is the line number in question at line.commit. We need
+ # to translate that line number so that it refers to the position of the
+ # same line on previouscommit.
+ lineno_previous = approx_lineno_across_revs(
+ line.commit.filename, previousfilename, line.commit.commithash,
+ previouscommit, line.lineno_then)
logging.debug('ignore commit %s on line p%d/t%d/n%d',
line.commit.commithash, lineno_previous, line.lineno_then,
line.lineno_now)
# Get the line at lineno_previous in the parent commit.
- assert lineno_previous > 0
- try:
- newline = parent_blame[lineno_previous - 1]
- except IndexError:
- # lineno_previous is a guess, so it may be past the end of the file.
- # Just grab the last line in the file.
- newline = parent_blame[-1]
+ assert 1 <= lineno_previous <= len(parent_blame)
+ newline = parent_blame[lineno_previous - 1]
# Replace the commit and lineno_then, but not the lineno_now or context.
logging.debug(' replacing with %r', newline)
diff --git a/man/html/git-hyper-blame.html b/man/html/git-hyper-blame.html
index a08de12f5..62f7cc82a 100644
--- a/man/html/git-hyper-blame.html
+++ b/man/html/git-hyper-blame.html
@@ -820,19 +820,24 @@ commit) since the given person wrote it.
+
CAVEATS
+
+
When a line skips over an ignored commit, a guess is made as to which commit
+previously modified that line, but it is not always clear where the line came
+from. If the ignored commit makes lots of changes in close proximity, in
+particular adding/removing/reordering lines, then the wrong authors may be
+blamed for nearby edits.
+
For this reason, hyper-blame
works best when the ignored commits are be
+limited to minor changes such as formatting and renaming, not refactoring or
+other more invasive changes.
+
+
+
BUGS
-
-When a commit is ignored, hyper-blame currently just blames the same line in
- the previous version of the file. This can be wildly inaccurate if the ignored
- commit adds or removes lines, resulting in a completely wrong commit being
- blamed.
-
-
--
-
There is currently no way to pass the ignore list as a file.
@@ -864,7 +869,7 @@ from