From 01d2cde990f22d409e74e239de7e4d347102d6f6 Mon Sep 17 00:00:00 2001 From: "mgiuca@chromium.org" Date: Fri, 5 Feb 2016 03:25:41 +0000 Subject: [PATCH] git hyper-blame: Added approx. line number translation. Previously, when a commit was skipped, it would be blamed on the line number the line had *after* the skipped commit. This could mean a totally unrelated commit gets blamed. Now, a heuristic analyses the diff of the skipped commit to discover approximately what line number the line had *before* the skipped commit, so it can hopefully be blamed on the right commit. BUG=574290 Review URL: https://codereview.chromium.org/1629253002 git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@298609 0039d316-1c4b-4281-b951-d872f2087c98 --- git_common.py | 4 ++ git_hyper_blame.py | 124 ++++++++++++++++++++++++++++++---- man/html/git-hyper-blame.html | 23 ++++--- man/man1/git-hyper-blame.1 | 22 +++--- man/src/git-hyper-blame.txt | 17 +++-- tests/git_common_test.py | 17 +++++ tests/git_hyper_blame_test.py | 99 ++++++++++++++++++++++----- 7 files changed, 250 insertions(+), 56 deletions(-) diff --git a/git_common.py b/git_common.py index d571f820b..949ba4695 100644 --- a/git_common.py +++ b/git_common.py @@ -374,6 +374,10 @@ def del_config(option, scope='local'): pass +def diff(oldrev, newrev, *args): + return run('diff', oldrev, newrev, *args) + + def freeze(): took_action = False diff --git a/git_hyper_blame.py b/git_hyper_blame.py index 17424511a..5a7daa0cf 100755 --- a/git_hyper_blame.py +++ b/git_hyper_blame.py @@ -149,6 +149,110 @@ def get_parsed_blame(filename, revision='HEAD'): return list(parse_blame(blame)) +# Map from (oldrev, newrev) to hunk list (caching the results of git diff, but +# only the hunk line numbers, not the actual diff contents). +# hunk list contains (old, new) pairs, where old and new are (start, length) +# pairs. A hunk list can also be None (if the diff failed). +diff_hunks_cache = {} + + +def cache_diff_hunks(oldrev, newrev): + def parse_start_length(s): + # Chop the '-' or '+'. + s = s[1:] + # Length is optional (defaults to 1). + try: + start, length = s.split(',') + except ValueError: + start = s + length = 1 + return int(start), int(length) + + try: + return diff_hunks_cache[(oldrev, newrev)] + except KeyError: + pass + + # Use -U0 to get the smallest possible hunks. + diff = git_common.diff(oldrev, newrev, '-U0') + + # Get all the hunks. + hunks = [] + for line in diff.split('\n'): + if not line.startswith('@@'): + continue + ranges = line.split(' ', 3)[1:3] + ranges = tuple(parse_start_length(r) for r in ranges) + hunks.append(ranges) + + diff_hunks_cache[(oldrev, newrev)] = hunks + return hunks + + +def approx_lineno_across_revs(filename, newfilename, revision, newrevision, + lineno): + """Computes the approximate movement of a line number between two revisions. + + Consider line |lineno| in |filename| at |revision|. This function computes the + line number of that line in |newfilename| at |newrevision|. This is + necessarily approximate. + + Args: + filename: The file (within the repo) at |revision|. + newfilename: The name of the same file at |newrevision|. + revision: A git revision. + newrevision: Another git revision. Note: Can be ahead or behind |revision|. + lineno: Line number within |filename| at |revision|. + + Returns: + Line number within |newfilename| at |newrevision|. + """ + # This doesn't work that well if there are a lot of line changes within the + # hunk (demonstrated by GitHyperBlameLineMotionTest.testIntraHunkLineMotion). + # A fuzzy heuristic that takes the text of the new line and tries to find a + # deleted line within the hunk that mostly matches the new line could help. + + # Use the : syntax to diff between two blobs. This is the + # only way to diff a file that has been renamed. + old = '%s:%s' % (revision, filename) + new = '%s:%s' % (newrevision, newfilename) + hunks = cache_diff_hunks(old, new) + + cumulative_offset = 0 + + # Find the hunk containing lineno (if any). + for (oldstart, oldlength), (newstart, newlength) in hunks: + cumulative_offset += newlength - oldlength + + if lineno >= oldstart + oldlength: + # Not there yet. + continue + + if lineno < oldstart: + # Gone too far. + break + + # lineno is in [oldstart, oldlength] at revision; [newstart, newlength] at + # newrevision. + + # If newlength == 0, newstart will be the line before the deleted hunk. + # Since the line must have been deleted, just return that as the nearest + # line in the new file. Caution: newstart can be 0 in this case. + if newlength == 0: + return max(1, newstart) + + newend = newstart + newlength - 1 + + # Move lineno based on the amount the entire hunk shifted. + lineno = lineno + newstart - oldstart + # Constrain the output within the range [newstart, newend]. + return min(newend, max(newstart, lineno)) + + # Wasn't in a hunk. Figure out the line motion based on the difference in + # length between the hunks seen so far. + return lineno + cumulative_offset + + def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout, err=sys.stderr): # Map from commit to parsed blame from that commit. @@ -189,23 +293,19 @@ def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout, # ignore this commit. break - # line.lineno_then is the line number in question at line.commit. - # TODO(mgiuca): This will be incorrect if line.commit added or removed - # lines. Translate that line number so that it refers to the position of - # the same line on previouscommit. - lineno_previous = line.lineno_then + # line.lineno_then is the line number in question at line.commit. We need + # to translate that line number so that it refers to the position of the + # same line on previouscommit. + lineno_previous = approx_lineno_across_revs( + line.commit.filename, previousfilename, line.commit.commithash, + previouscommit, line.lineno_then) logging.debug('ignore commit %s on line p%d/t%d/n%d', line.commit.commithash, lineno_previous, line.lineno_then, line.lineno_now) # Get the line at lineno_previous in the parent commit. - assert lineno_previous > 0 - try: - newline = parent_blame[lineno_previous - 1] - except IndexError: - # lineno_previous is a guess, so it may be past the end of the file. - # Just grab the last line in the file. - newline = parent_blame[-1] + assert 1 <= lineno_previous <= len(parent_blame) + newline = parent_blame[lineno_previous - 1] # Replace the commit and lineno_then, but not the lineno_now or context. logging.debug(' replacing with %r', newline) diff --git a/man/html/git-hyper-blame.html b/man/html/git-hyper-blame.html index a08de12f5..62f7cc82a 100644 --- a/man/html/git-hyper-blame.html +++ b/man/html/git-hyper-blame.html @@ -820,19 +820,24 @@ commit) since the given person wrote it.

+

CAVEATS

+
+

When a line skips over an ignored commit, a guess is made as to which commit +previously modified that line, but it is not always clear where the line came +from. If the ignored commit makes lots of changes in close proximity, in +particular adding/removing/reordering lines, then the wrong authors may be +blamed for nearby edits.

+

For this reason, hyper-blame works best when the ignored commits are be +limited to minor changes such as formatting and renaming, not refactoring or +other more invasive changes.

+
+
+

BUGS