[git cl split] Add trie structure for clustering

This CL defines a trie-based data structure for representing files based
on their path. It directly mirrors the structure of a file system,
keeping track of directories and the files inside them. It also stores
some information about OWNERS files, for use during clustering (we
won't cluster files together if there's a "break" in ownership due to
`set noparent`). Optionally, the ownership information can be overridden;
this will be done via a command-line flag when the algorithm is fully
implemented.

Bug: 335797528
Change-Id: I5dcdf36695a1da5714ec021e5e18b6c36855a4f1
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/6321290
Reviewed-by: Josip Sokcevic <sokcevic@chromium.org>
Commit-Queue: Devon Loehr <dloehr@google.com>
changes/90/6321290/4
Devon Loehr 3 months ago committed by LUCI CQ
parent c48f866fcf
commit 5825f91d8c

@ -564,6 +564,10 @@ def SelectReviewersForFiles(cl, author, files, max_depth):
return info_split_by_reviewers
################################################################################
# Code for saving, editing, and loading splittings.
################################################################################
def SaveSplittingToFile(cl_infos: List[CLInfo], filename: str, silent=False):
"""
Writes the listed CLs to the designated file, in a human-readable and
@ -773,3 +777,90 @@ def EditSplittingInteractively(
SaveSplittingToFile(cl_infos, tmp_file)
ValidateSplitting(cl_infos, "the provided splitting", files_on_disk)
return cl_infos, tmp_file
################################################################################
# Code for the clustering-based splitting algorithm.
################################################################################
### Trie Code
def FolderHasParent(path: str) -> bool:
    """
    Check if a folder inherits owners from a higher-level directory:
    i.e. it's not at top level, and doesn't have an OWNERS file that contains
    `set noparent`.

    Args:
        path: Path of the folder, written with the OS path separator. The
            empty string and any single-component path count as top level.

    Returns:
        False if the folder is at top level, or if `<path>/OWNERS` exists and
        contains a `set noparent` line (ignoring surrounding whitespace and
        trailing `#` comments). True otherwise.
    """
    # Treat each top-level directory as having no parent, as well as the root
    # directory.
    if len(path.split(os.path.sep)) <= 1:
        return False

    owners_file = os.path.join(path, 'OWNERS')
    if not os.path.isfile(owners_file):
        # No OWNERS file here, so nothing can break the inheritance chain.
        return True

    # Decode explicitly as UTF-8 rather than with the platform default, and
    # replace undecodable bytes instead of failing: we only look for an ASCII
    # directive, so stray bytes in a comment must not abort the check.
    with open(owners_file, encoding='utf-8', errors='replace') as f:
        for line in f:
            # Strip whitespace and comments
            if line.split('#')[0].strip() == 'set noparent':
                return False
    return True
class DirectoryTrie():
    """
    Trie structure: Nested dictionaries representing file paths.
    Each level represents one folder, and contains:
    - The path to that folder (its prefix)
    - A list of files that reside in that folder
    - A boolean for whether that folder inherits owners from a parent folder
    - One Trie representing each of that folder's subdirectories
    Files are stored with their entire path, so we don't need to reconstruct
    it every time we read them.
    """

    def __init__(self, expect_owners_override: bool, prefix: str = ""):
        """
        Create an empty DirectoryTrie with the specified prefix.

        Args:
            expect_owners_override: When true, the folder is recorded as
                inheriting owners without calling FolderHasParent. The value
                is also passed on to every subdirectory this node creates.
            prefix: Path of the folder this node represents ("" for the root).
        """
        # The override short-circuits the FolderHasParent lookup entirely.
        has_parent = expect_owners_override or FolderHasParent(prefix)

        self.subdirectories: Dict[str, DirectoryTrie] = {}
        self.files: List[str] = []
        self.prefix: str = prefix
        self.has_parent: bool = has_parent
        self.expect_owners_override: bool = expect_owners_override

    def AddFile(self, path: List[str]):
        """
        Add a file to the Trie, adding new subdirectories if necessary.
        The file should be represented as a list of directories, with the final
        entry being the filename.

        Raises:
            ValueError: If `path` is empty, since there is no filename to add.
        """
        if not path:
            raise ValueError("Cannot add a file with an empty path")

        *directories, filename = path

        # Walk down one directory at a time, creating missing nodes on the way.
        node = self
        for directory in directories:
            child = node.subdirectories.get(directory)
            if child is None:
                child = DirectoryTrie(node.expect_owners_override,
                                      os.path.join(node.prefix, directory))
                node.subdirectories[directory] = child
            node = child

        # Store the full path so readers never have to rebuild it.
        node.files.append(os.path.join(node.prefix, filename))

    def AddFiles(self, paths: List[List[str]]):
        """ Convenience function to add many files at once. """
        for path in paths:
            self.AddFile(path)

    def ToList(self) -> List[str]:
        """
        Return a list of all files in the trie.

        This folder's own files come first, followed by the files of each
        subdirectory in the order the subdirectories were created.
        """
        files = list(self.files)
        for subdir in self.subdirectories.values():
            files.extend(subdir.ToList())
        return files

@ -556,5 +556,40 @@ class SplitClTest(unittest.TestCase):
mock_file_write.reset_mock()
@mock.patch("os.path.isfile", return_value=False)
def testDirectoryTrie(self, _):
    """
    Build a DirectoryTrie from a small set of paths and verify the files,
    prefixes, and ownership flags recorded at each level.
    """
    # The trie code uses OS paths, so every expected path is built with
    # os.path.join as well.
    abc_path = os.path.join("a", "b", "c.cc")
    abd_path = os.path.join("a", "b", "d.h")
    aefgh_path = os.path.join("a", "e", "f", "g", "h.hpp")
    ijk_path = os.path.join("i", "j", "k.cc")
    al_path = os.path.join("a", "l.cpp")
    top_path = os.path.join("top.gn")
    all_paths = [
        abc_path, abd_path, aefgh_path, ijk_path, al_path, top_path
    ]

    trie = split_cl.DirectoryTrie(False)
    trie.AddFiles([entry.split(os.path.sep) for entry in all_paths])

    # Files land in the node for their containing folder.
    self.assertEqual(trie.files, [top_path])
    dir_a = trie.subdirectories["a"]
    self.assertEqual(dir_a.files, [al_path])
    dir_ab = dir_a.subdirectories["b"]
    self.assertEqual(dir_ab.files, [abc_path, abd_path])
    self.assertEqual(sorted(trie.ToList()), sorted(all_paths))

    # The root and top-level folders have no parent; deeper folders do,
    # since os.path.isfile is patched to report no OWNERS files.
    self.assertFalse(trie.has_parent)
    self.assertFalse(dir_a.has_parent)
    self.assertTrue(dir_ab.has_parent)

    # Each node's prefix is the path to the folder it represents.
    self.assertEqual(trie.prefix, "")
    self.assertEqual(dir_a.prefix, "a")
    self.assertEqual(dir_ab.prefix, os.path.join("a", "b"))
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()

Loading…
Cancel
Save