diff --git a/git_cl.py b/git_cl.py index 9d60c4fae..d13adcead 100755 --- a/git_cl.py +++ b/git_cl.py @@ -4433,10 +4433,8 @@ def CMDsplit(parser, args): Creates a branch and uploads a CL for each group of files modified in the current branch that share a common OWNERS file. In the CL description and - comment, '$directory' is replaced with the directory containing the changes - in this CL, '$cl_index' is replaced with the index of the CL we're currently - sending out, and '$num_cls' is replaced with the total number of CLs that - we're sending out in this split. + comment, the string '$directory', is replaced with the directory containing + the shared OWNERS file. """ parser.add_option('-d', '--description', dest='description_file', help='A text file containing a CL description in which ' diff --git a/presubmit_support.py b/presubmit_support.py index cb02fb6f9..9f02e85ad 100755 --- a/presubmit_support.py +++ b/presubmit_support.py @@ -854,22 +854,6 @@ class _GitDiffCache(_DiffCache): return scm.GIT.GetOldContents(local_root, path, branch=self._upstream) -def _ParseDiffHeader(line): - """Searches |line| for diff headers and returns a tuple - (header, old_line, old_size, new_line, new_size), or None if line doesn't - contain a diff header. - - This relies on the scm diff output describing each changed code section - with a line of the form - - ^@@ , , @@$ - """ - m = re.match(r'^@@ \-([0-9]+)\,([0-9]+) \+([0-9]+)\,([0-9]+) @@', line) - if m: - return (m.group(0), int(m.group(1)), int(m.group(2)), int(m.group(3)), - int(m.group(4))) - - class AffectedFile(object): """Representation of a file in a change.""" @@ -883,7 +867,6 @@ class AffectedFile(object): self._local_root = repository_root self._is_directory = None self._cached_changed_contents = None - self._cached_change_size_in_bytes = None self._cached_new_contents = None self._diff_cache = diff_cache logging.debug('%s(%s)', self.__class__.__name__, self._path) @@ -960,9 +943,9 @@ class AffectedFile(object): line_num = 0 for line in self.GenerateScmDiff().splitlines(): - h = _ParseDiffHeader(line) - if h: - line_num = h[3] + m = re.match(r'^@@ [0-9\,\+\-]+ \+([0-9]+)\,[0-9]+ @@', line) + if m: + line_num = int(m.groups(1)[0]) continue if line.startswith('+') and not line.startswith('++'): self._cached_changed_contents.append((line_num, line[1:])) @@ -970,25 +953,6 @@ class AffectedFile(object): line_num += 1 return self._cached_changed_contents[:] - def ChangeSizeInBytes(self): - """Returns a list of tuples (deleted bytes, added bytes) of all changes - in this file. - - This relies on the scm diff output describing each changed code section - with a line of the form - - ^@@ , , @@$ - """ - if self._cached_change_size_in_bytes is not None: - return self._cached_change_size_in_bytes[:] - self._cached_change_size_in_bytes = [] - - for line in self.GenerateScmDiff().splitlines(): - h = _ParseDiffHeader(line) - if h: - self._cached_change_size_in_bytes.append((h[2], h[4])) - return self._cached_change_size_in_bytes[:] - def __str__(self): return self.LocalPath() diff --git a/split_cl.py b/split_cl.py index 9d16b1c6e..c08bf4da4 100644 --- a/split_cl.py +++ b/split_cl.py @@ -9,7 +9,6 @@ from __future__ import print_function import collections import os -import random import re import subprocess2 import sys @@ -22,8 +21,6 @@ import owners_finder import git_common as git -import third_party.pygtrie as trie - # If a call to `git cl split` will generate more than this number of CLs, the # command will prompt the user to make sure they know what they're doing. Large @@ -44,25 +41,23 @@ def EnsureInGitRepository(): git.run('rev-parse') -def CreateBranchForDirectory(prefix, cl_index, directory, upstream): - """Creates a branch named |prefix| + "_" + |cl_index| + "_" + |directory|. +def CreateBranchForDirectory(prefix, directory, upstream): + """Creates a branch named |prefix| + "_" + |directory| + "_split". Return false if the branch already exists. |upstream| is used as upstream for the created branch. """ existing_branches = set(git.branches(use_limit = False)) - branch_name = '_'.join([prefix, str(cl_index), directory]) + branch_name = prefix + '_' + directory + '_split' if branch_name in existing_branches: return False git.run('checkout', '-t', upstream, '-b', branch_name) return True -def FormatDescriptionOrComment(txt, directory, cl_index, num_cls): - """Replaces $directory with |directory|, $cl_index with |cl_index|, and - $num_cls with |num_cls| in |txt|.""" - return txt.replace('$directory', '/' + directory).replace( - '$cl_index', str(cl_index)).replace('$num_cls', str(num_cls)) +def FormatDescriptionOrComment(txt, directory): + """Replaces $directory with |directory| in |txt|.""" + return txt.replace('$directory', '/' + directory) def AddUploadedByGitClSplitToDescription(description): @@ -81,14 +76,12 @@ def AddUploadedByGitClSplitToDescription(description): return '\n'.join(lines) -def UploadCl(cl_index, num_cls, refactor_branch, refactor_branch_upstream, - directory, files, description, comment, reviewer, changelist, - cmd_upload, cq_dry_run, enable_auto_submit): +def UploadCl(refactor_branch, refactor_branch_upstream, directory, files, + description, comment, reviewers, changelist, cmd_upload, + cq_dry_run, enable_auto_submit): """Uploads a CL with all changes to |files| in |refactor_branch|. Args: - cl_index: The index of this CL in the list of CLs to upload. - num_cls: The total number of CLs that will be uploaded. refactor_branch: Name of the branch that contains the changes to upload. refactor_branch_upstream: Name of the upstream of |refactor_branch|. directory: Path to the directory that contains the OWNERS file for which @@ -96,17 +89,16 @@ def UploadCl(cl_index, num_cls, refactor_branch, refactor_branch_upstream, files: List of AffectedFile instances to include in the uploaded CL. description: Description of the uploaded CL. comment: Comment to post on the uploaded CL. - reviewer: The reviewer for the CL. + reviewers: A set of reviewers for the CL. changelist: The Changelist class. cmd_upload: The function associated with the git cl upload command. cq_dry_run: If CL uploads should also do a cq dry run. enable_auto_submit: If CL uploads should also enable auto submit. """ # Create a branch. - if not CreateBranchForDirectory(refactor_branch, cl_index, directory, - refactor_branch_upstream): - print('Skipping CL ' + cl_index + ' for directory "' + directory + - '" for which a branch already exists.') + if not CreateBranchForDirectory( + refactor_branch, directory, refactor_branch_upstream): + print('Skipping ' + directory + ' for which a branch already exists.') return # Checkout all changes to files in |files|. @@ -122,12 +114,11 @@ def UploadCl(cl_index, num_cls, refactor_branch, refactor_branch_upstream, # when it is closed. with gclient_utils.temporary_file() as tmp_file: gclient_utils.FileWrite( - tmp_file, - FormatDescriptionOrComment(description, directory, cl_index, num_cls)) + tmp_file, FormatDescriptionOrComment(description, directory)) git.run('commit', '-F', tmp_file) # Upload a CL. - upload_args = ['-f', '-r', reviewer] + upload_args = ['-f', '-r', ','.join(reviewers)] if cq_dry_run: upload_args.append('--cq-dry-run') if not comment: @@ -137,140 +128,26 @@ def UploadCl(cl_index, num_cls, refactor_branch, refactor_branch_upstream, print('Uploading CL for ' + directory + '.') cmd_upload(upload_args) if comment: - changelist().AddComment( - FormatDescriptionOrComment(comment, directory, cl_index, num_cls), - publish=True) - - -class ChangeList(object): - """Representation of a CL and the files affected by it.""" - - def __init__(self, path, owners_db, author, files): - self._path = path - self._files = files - self._owners_db = owners_db - self._author = author - self._owners = None - - def _EnsureOwners(self): - if not self._owners: - self._owners = set() - files = [f.LocalPath() for f in self.GetFiles()] - if not files: - files = [self.GetPath()] - possible_owners = self._owners_db.all_possible_owners( - files, self._author).keys() - for owner in possible_owners: - if 0 == len(self._owners_db.files_not_covered_by(files, [owner])): - self._owners |= set([owner]) - assert len(self._owners) - - def Merge(self, other): - self._owners = self.GetCommonOwners(other) - self._files |= other.GetFiles() - - def GetPath(self): - return self._path - - def GetFiles(self): - return self._files - - def GetOwners(self): - self._EnsureOwners() - return self._owners - - def GetCommonOwners(self, other): - return self.GetOwners() & other.GetOwners() + changelist().AddComment(FormatDescriptionOrComment(comment, directory), + publish=True) - def HaveCommonOwners(self, other): - return len(self.GetCommonOwners(other)) > 0 - def GetChangeSizeInBytes(self): - return sum( - [c[0] + c[1] for f in self._files for c in f.ChangeSizeInBytes()]) - - -def SplitCLs(owners_database, author, files): +def GetFilesSplitByOwners(owners_database, files): """Returns a map of files split by OWNERS file. Returns: A map where keys are paths to directories containing an OWNERS file and values are lists of files sharing an OWNERS file. """ - - # The target CL size in # of changed bytes. - # TODO(yannic): Use # of changed lines instead and make this configurable. - max_cl_size = 1000 - - candidates = trie.Trie() - # Enable sorting so dry-run will split the CL the same way the CL is uploaded. - candidates.enable_sorting() - - # 1. Create one CL candidate for every affected file. + files_split_by_owners = collections.defaultdict(list) for f in files: - path = f.LocalPath() - candidates[path] = ChangeList(path, owners_database, author, set([f])) - - change_lists = [] - - # 2. Try to merge CL in common directories up to a maximum size of - # |max_cl_size|. - # This is O( len(files) * max([len(f.path) for f in files]) ). - edited = True - while edited: - edited = False - - # 2.1. Iterate over all candidates and merge candidates into the candidate - # for their parent directory if the resulting CL doesn't exceed - # |max_cl_size|. - for item in candidates.items(): - path = ''.join(item[0]) - candidate = item[1] - - # The number of CL candidates in subdirectories is equivalent to the - # number of nodes with prefix |path| in the Trie. - # Only try to merge |candidate| with the candidate for the parent - # directory if there are no more CLs for subdirectories. - sub_cls = len([''.join(k) for k in candidates.keys(path)]) - 1 - if not sub_cls: - parent_path = os.path.dirname(path) - if len(parent_path) < 1: - # Don't create CLs for more than one top-level directory. - continue - - if parent_path not in candidates: - candidates[parent_path] = ChangeList(parent_path, owners_database, - author, set()) - parent_cl = candidates[parent_path] - - if not parent_cl.HaveCommonOwners(candidate): - # Don't merge if the resulting CL would need more than one reviewer. - continue - - # Merge |candidate| into the CL for it's parent directory and remove - # candidate. - edited = True - del candidates[path] - parent_cl.Merge(candidate) - - # Add |parent_cl| to list of CLs to submit if the CL is larger than - # |max_cl_size|. - # TODO(yannic): Doing it this way, we might end up with CLs of size - # 2 * max_cl_size if we merged two candidates that just don't exceed - # the maximal size. - if parent_cl.GetChangeSizeInBytes() > max_cl_size: - change_lists.append(parent_cl) - del candidates[parent_path] - - # 3. Add all remaining candidates to the list of CLs. - for item in candidates.items(): - change_lists.append(item[1]) - - return change_lists + files_split_by_owners[owners_database.enclosing_dir_with_owners( + f.LocalPath())].append(f) + return files_split_by_owners def PrintClInfo(cl_index, num_cls, directory, file_paths, description, - reviewer): + reviewers): """Prints info about a CL. Args: @@ -280,42 +157,20 @@ def PrintClInfo(cl_index, num_cls, directory, file_paths, description, to upload a CL. file_paths: A list of files in this CL. description: The CL description. - reviewer: The reviewer for this CL. + reviewers: A set of reviewers for this CL. """ - description_lines = FormatDescriptionOrComment( - description, directory, cl_index, num_cls).splitlines() + description_lines = FormatDescriptionOrComment(description, + directory).splitlines() indented_description = '\n'.join([' ' + l for l in description_lines]) print('CL {}/{}'.format(cl_index, num_cls)) print('Path: {}'.format(directory)) - print('Reviewers: {}'.format(reviewer)) + print('Reviewers: {}'.format(', '.join(reviewers))) print('\n' + indented_description + '\n') print('\n'.join(file_paths)) print() -def _SelectReviewer(possible_owners, used_reviewers): - """Select a reviewer from |owners| and adds them to the set of used reviewers. - - Returns: - The reviewer. - """ - - # It's debatable whether we want to avoid reusing reviewers. It could be - # easier to ask the smallest possible amount of reviewers to become familiar - # with the change being split. However, doing so would mean we send all CLs to - # top-level owners, which might be too much to ask from them. - # We may revisit this decicion later. - unused_reviewers = possible_owners.difference(used_reviewers) - if len(unused_reviewers) < 1: - unused_reviewers = possible_owners - # Pick a random reviwer from the set of owners so we don't prefer owners - # with emails of low lexical order. - reviewer = random.choice(tuple(unused_reviewers)) - used_reviewers.add(reviewer) - return reviewer - - def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run, cq_dry_run, enable_auto_submit): """"Splits a branch into smaller branches and uploads CLs. @@ -356,9 +211,11 @@ def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run, owners_database = owners.Database(change.RepositoryRoot(), file, os.path) owners_database.load_data_needed_for([f.LocalPath() for f in files]) - change_lists = SplitCLs(owners_database, author, set(files)) + files_split_by_owners = GetFilesSplitByOwners(owners_database, files) - num_cls = len(change_lists) + num_cls = len(files_split_by_owners) + print('Will split current branch (' + refactor_branch + ') into ' + + str(num_cls) + ' CLs.\n') if cq_dry_run and num_cls > CL_SPLIT_FORCE_LIMIT: print( 'This will generate "%r" CLs. This many CLs can potentially generate' @@ -370,21 +227,21 @@ def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run, if answer.lower() != 'y': return 0 - reviewers = set() - for cl_index, cl in enumerate(change_lists, 1): + for cl_index, (directory, files) in \ + enumerate(files_split_by_owners.items(), 1): # Use '/' as a path separator in the branch name and the CL description # and comment. - directory = cl.GetPath().replace(os.path.sep, '/') - file_paths = [f.LocalPath() for f in cl.GetFiles()] - reviewer = _SelectReviewer(cl.GetOwners(), reviewers) + directory = directory.replace(os.path.sep, '/') + file_paths = [f.LocalPath() for f in files] + reviewers = owners_database.reviewers_for(file_paths, author) if dry_run: PrintClInfo(cl_index, num_cls, directory, file_paths, description, - reviewer) + reviewers) else: - UploadCl(cl_index, num_cls, refactor_branch, refactor_branch_upstream, - directory, files, description, comment, reviewer, changelist, - cmd_upload, cq_dry_run, enable_auto_submit) + UploadCl(refactor_branch, refactor_branch_upstream, directory, files, + description, comment, reviewers, changelist, cmd_upload, + cq_dry_run, enable_auto_submit) # Go back to the original branch. git.run('checkout', refactor_branch) diff --git a/third_party/pygtrie/LICENSE b/third_party/pygtrie/LICENSE deleted file mode 100644 index d64569567..000000000 --- a/third_party/pygtrie/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/third_party/pygtrie/README.chromium b/third_party/pygtrie/README.chromium deleted file mode 100644 index ce5b043eb..000000000 --- a/third_party/pygtrie/README.chromium +++ /dev/null @@ -1,10 +0,0 @@ -URL: https://github.com/google/pygtrie -Version: 64ee0836f41a59919ecf8a59b0c7e2f7f1b8c5ba -License: Apache 2.0 -License File: LICENSE - -Description: -This directory contains the Python pygtrie module. - -Local Modifications: -None diff --git a/third_party/pygtrie/__init__.py b/third_party/pygtrie/__init__.py deleted file mode 100644 index 0c41e055a..000000000 --- a/third_party/pygtrie/__init__.py +++ /dev/null @@ -1,1376 +0,0 @@ -# -*- coding: utf-8 -*- -"""Implementation of a trie data structure. - -`Trie data structure `_, also known as radix -or prefix tree, is a tree associating keys to values where all the descendants -of a node have a common prefix (associated with that node). - -The trie module contains :class:`pygtrie.Trie`, :class:`pygtrie.CharTrie` and -:class:`pygtrie.StringTrie` classes each implementing a mutable mapping -interface, i.e. :class:`dict` interface. As such, in most circumstances, -:class:`pygtrie.Trie` could be used as a drop-in replacement for -a :class:`dict`, but the prefix nature of the data structure is trie’s real -strength. - -The module also contains :class:`pygtrie.PrefixSet` class which uses a trie to -store a set of prefixes such that a key is contained in the set if it or its -prefix is stored in the set. - -Features --------- - -- A full mutable mapping implementation. - -- Supports iterating over as well as deleting a subtrie. - -- Supports prefix checking as well as shortest and longest prefix - look-up. - -- Extensible for any kind of user-defined keys. - -- A PrefixSet supports “all keys starting with given prefix” logic. - -- Can store any value including None. - -For some simple examples see ``example.py`` file. -""" - -__author__ = 'Michal Nazarewicz ' -__copyright__ = 'Copyright 2014 Google Inc.' - - -import collections as _collections - -# Python 2.x and 3.x compatibility stuff -if hasattr(dict, 'iteritems'): - # pylint: disable=invalid-name - _iteritems = lambda d: d.iteritems() - _iterkeys = lambda d: d.iterkeys() - def _sorted_iteritems(d): - """Returns d's items in sorted order.""" - items = d.items() - items.sort() - return iter(items) -else: - _sorted_iteritems = lambda d: sorted(d.items()) # pylint: disable=invalid-name - _iteritems = lambda d: iter(d.items()) # pylint: disable=invalid-name - _iterkeys = lambda d: iter(d.keys()) # pylint: disable=invalid-name - -try: - _basestring = basestring -except NameError: - _basestring = str - - -class ShortKeyError(KeyError): - """Raised when given key is a prefix of a longer key.""" - pass - - -_SENTINEL = object() - - -class _Node(object): - """A single node of a trie. - - Stores value associated with the node and dictionary of children. - """ - __slots__ = ('children', 'value') - - def __init__(self): - self.children = {} - self.value = _SENTINEL - - def iterate(self, path, shallow, iteritems): - """Yields all the nodes with values associated to them in the trie. - - Args: - path: Path leading to this node. Used to construct the key when - returning value of this node and as a prefix for children. - shallow: Perform a shallow traversal, i.e. do not yield nodes if - their prefix has been yielded. - iteritems: A function taking dictionary as argument and returning - iterator over its items. Something other than dict.iteritems - may be given to enable sorting. - - Yields: - ``(path, value)`` tuples. - """ - # Use iterative function with stack on the heap so we don't hit Python's - # recursion depth limits. - node = self - stack = [] - while True: - if node.value is not _SENTINEL: - yield path, node.value - - if (not shallow or node.value is _SENTINEL) and node.children: - stack.append(iter(iteritems(node.children))) - path.append(None) - - while True: - try: - step, node = next(stack[-1]) - path[-1] = step - break - except StopIteration: - stack.pop() - path.pop() - except IndexError: - return - - def traverse(self, node_factory, path_conv, path, iteritems): - """Traverses the node and returns another type of node from factory. - - Args: - node_factory: Callable function to construct new nodes. - path_conv: Callable function to convert node path to a key. - path: Current path for this node. - iteritems: A function taking dictionary as argument and returning - iterator over its items. Something other than dict.iteritems - may be given to enable sorting. - - Returns: - An object constructed by calling node_factory(path_conv, path, - children, value=...), where children are constructed by node_factory - from the children of this node. There doesn't need to be 1:1 - correspondence between original nodes in the trie and constructed - nodes (see make_test_node_and_compress in test.py). - """ - def children(): - """Recursively traverses all of node's children.""" - for step, node in iteritems(self.children): - yield node.traverse(node_factory, path_conv, path + [step], - iteritems) - - args = [path_conv, tuple(path), children()] - - if self.value is not _SENTINEL: - args.append(self.value) - - return node_factory(*args) - - def __eq__(self, other): - # Like iterate, we don't recurse so this works on deep tries. - a, b = self, other - stack = [] - while True: - if a.value != b.value or len(a.children) != len(b.children): - return False - if a.children: - stack.append((_iteritems(a.children), b.children)) - - while True: - try: - key, a = next(stack[-1][0]) - b = stack[-1][1].get(key) - if b is None: - return False - break - except StopIteration: - stack.pop() - except IndexError: - return True - - return self.value == other.value and self.children == other.children - - def __ne__(self, other): - return not self.__eq__(other) - - def __bool__(self): - return bool(self.value is not _SENTINEL or self.children) - - __nonzero__ = __bool__ - - __hash__ = None - - def __getstate__(self): - """Get state used for pickling. - - The state is encoded as a list of simple commands which consist of an - integer and some command-dependent number of arguments. The commands - modify what the current node is by navigating the trie up and down and - setting node values. Possible commands are: - - * [n, step0, step1, ..., stepn-1, value], for n >= 0, specifies step - needed to reach the next current node as well as its new value. There - is no way to create a child node without setting its (or its - descendant's) value. - - * [-n], for -n < 0, specifies to go up n steps in the trie. - - When encoded as a state, the commands are flattened into a single list. - - For example:: - - [ 0, 'Root', - 2, 'Foo', 'Bar', 'Root/Foo/Bar Node', - -1, - 1, 'Baz', 'Root/Foo/Baz Node', - -2, - 1, 'Qux', 'Root/Qux Node' ] - - Creates the following hierarchy:: - - -* value: Root - +-- Foo --* no value - | +-- Bar -- * value: Root/Foo/Bar Node - | +-- Baz -- * value: Root/Foo/Baz Node - +-- Qux -- * value: Root/Qux Node - - Returns: - A pickable state which can be passed to :func:`_Node.__setstate__` - to reconstruct the node and its full hierarchy. - """ - # Like iterate, we don't recurse so pickling works on deep tries. - state = [] if self.value is _SENTINEL else [0] - last_cmd = 0 - node = self - stack = [] - while True: - if node.value is not _SENTINEL: - last_cmd = 0 - state.append(node.value) - stack.append(_iteritems(node.children)) - - while True: - try: - step, node = next(stack[-1]) - except StopIteration: - if last_cmd < 0: - state[-1] -= 1 - else: - last_cmd = -1 - state.append(-1) - stack.pop() - continue - except IndexError: - if last_cmd < 0: - state.pop() - return state - - if last_cmd > 0: - last_cmd += 1 - state[-last_cmd] += 1 - else: - last_cmd = 1 - state.append(1) - state.append(step) - break - - def __setstate__(self, state): - """Unpickles node. See :func:`_Node.__getstate__`.""" - self.__init__() - state = iter(state) - stack = [self] - for cmd in state: - if cmd < 0: - del stack[cmd:] - else: - while cmd > 0: - stack.append(type(self)()) - stack[-2].children[next(state)] = stack[-1] - cmd -= 1 - stack[-1].value = next(state) - - -_NONE_PAIR = type('NonePair', (tuple,), { - '__nonzero__': lambda _: False, - '__bool__': lambda _: False, - '__slots__': (), -})((None, None)) - - -class Trie(_collections.MutableMapping): - """A trie implementation with dict interface plus some extensions. - - Keys used with the :class:`pygtrie.Trie` must be iterable, yielding hashable - objects. In other words, for a given key, ``dict.fromkeys(key)`` must be - valid. - - In particular, strings work fine as trie keys, however when getting keys - back from iterkeys() method for example, instead of strings, tuples of - characters are produced. For that reason, :class:`pygtrie.CharTrie` or - :class:`pygtrie.StringTrie` may be preferred when using - :class:`pygtrie.Trie` with string keys. - """ - - def __init__(self, *args, **kwargs): - """Initialises the trie. - - Arguments are interpreted the same way :func:`Trie.update` interprets - them. - """ - self._root = _Node() - self._sorted = False - self.update(*args, **kwargs) - - @property - def _iteritems(self): - """Returns function yielding over dict's items possibly in sorted order. - - Returns: - A function iterating over items of a dictionary given as an - argument. If child nodes sorting has been enabled (via - :func:`Trie.enable_sorting` method), returned function will go - through the items in sorted order.. - """ - return _sorted_iteritems if self._sorted else _iteritems - - def enable_sorting(self, enable=True): - """Enables sorting of child nodes when iterating and traversing. - - Normally, child nodes are not sorted when iterating or traversing over - the trie (just like dict elements are not sorted). This method allows - sorting to be enabled (which was the behaviour prior to pygtrie 2.0 - release). - - For Trie class, enabling sorting of children is identical to simply - sorting the list of items since Trie returns keys as tuples. However, - for other implementations such as StringTrie the two may behove subtly - different. For example, sorting items might produce:: - - root/foo-bar - root/foo/baz - - even though foo comes before foo-bar. - - Args: - enable: Whether to enable sorting of child nodes. - """ - self._sorted = enable - - def clear(self): - """Removes all the values from the trie.""" - self._root = _Node() - - def update(self, *args, **kwargs): - """Updates stored values. Works like :func:`dict.update`.""" - if len(args) > 1: - raise ValueError('update() takes at most one positional argument, ' - '%d given.' % len(args)) - # We have this here instead of just letting MutableMapping.update() - # handle things because it will iterate over keys and for each key - # retrieve the value. With Trie, this may be expensive since the path - # to the node would have to be walked twice. Instead, we have our own - # implementation where iteritems() is used avoiding the unnecessary - # value look-up. - if args and isinstance(args[0], Trie): - for key, value in _iteritems(args[0]): - self[key] = value - args = () - super(Trie, self).update(*args, **kwargs) - - def copy(self): - """Returns a shallow copy of the trie.""" - return self.__class__(self) - - @classmethod - def fromkeys(cls, keys, value=None): - """Creates a new trie with given keys set. - - This is roughly equivalent to calling the constructor with a ``(key, - value) for key in keys`` generator. - - Args: - keys: An iterable of keys that should be set in the new trie. - value: Value to associate with given keys. - - Returns: - A new trie where each key from ``keys`` has been set to the given - value. - """ - trie = cls() - for key in keys: - trie[key] = value - return trie - - def _get_node(self, key, create=False): - """Returns node for given key. Creates it if requested. - - Args: - key: A key to look for. - create: Whether to create the node if it does not exist. - - Returns: - ``(node, trace)`` tuple where ``node`` is the node for given key and - ``trace`` is a list specifying path to reach the node including all - the encountered nodes. Each element of trace is a ``(step, node)`` - tuple where ``step`` is a step from parent node to given node and - ``node`` is node on the path. The first element of the path is - always ``(None, self._root)``. - - Raises: - KeyError: If there is no node for the key and ``create`` is - ``False``. - """ - node = self._root - trace = [(None, node)] - for step in self.__path_from_key(key): - if create: - node = node.children.setdefault(step, _Node()) - else: - node = node.children.get(step) - if not node: - raise KeyError(key) - trace.append((step, node)) - return node, trace - - def __iter__(self): - return self.iterkeys() - - # pylint: disable=arguments-differ - - def iteritems(self, prefix=_SENTINEL, shallow=False): - """Yields all nodes with associated values with given prefix. - - Only nodes with values are output. For example:: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo'] = 'Foo' - >>> t['foo/bar/baz'] = 'Baz' - >>> t['qux'] = 'Qux' - >>> t.items() - [('qux', 'Qux'), ('foo', 'Foo'), ('foo/bar/baz', 'Baz')] - - Items are generated in topological order but the order of siblings is - unspecified by default. In other words, in the above example, the - ``('qux', 'Qux')`` pair might have been at the end of the list. At an - expense of efficiency, this can be changed via - :func:`Trie.enable_sorting`. - - With ``prefix`` argument, only items with specified prefix are generated - (i.e. only given subtrie is traversed) as demonstrated by:: - - >>> t.items(prefix='foo/bar') - [('foo/bar/baz', 'Baz')] - - With ``shallow`` argument, if a node has value associated with it, it's - children are not traversed even if they exist which can be seen in:: - - >>> t.items(shallow=True) - [('qux', 'Qux'), ('foo', 'Foo')] - - Args: - prefix: Prefix to limit iteration to. - shallow: Perform a shallow traversal, i.e. do not yield items if - their prefix has been yielded. - - Yields: - ``(key, value)`` tuples. - - Raises: - KeyError: If ``prefix`` does not match any node. - """ - node, _ = self._get_node(prefix) - for path, value in node.iterate(list(self.__path_from_key(prefix)), - shallow, self._iteritems): - yield (self._key_from_path(path), value) - - def iterkeys(self, prefix=_SENTINEL, shallow=False): - """Yields all keys having associated values with given prefix. - - This is equivalent to taking first element of tuples generated by - :func:`Trie.iteritems` which see for more detailed documentation. - - Args: - prefix: Prefix to limit iteration to. - shallow: Perform a shallow traversal, i.e. do not yield keys if - their prefix has been yielded. - - Yields: - All the keys (with given prefix) with associated values in the trie. - - Raises: - KeyError: If ``prefix`` does not match any node. - """ - for key, _ in self.iteritems(prefix=prefix, shallow=shallow): - yield key - - def itervalues(self, prefix=_SENTINEL, shallow=False): - """Yields all values associated with keys with given prefix. - - This is equivalent to taking second element of tuples generated by - :func:`Trie.iteritems` which see for more detailed documentation. - - Args: - prefix: Prefix to limit iteration to. - shallow: Perform a shallow traversal, i.e. do not yield values if - their prefix has been yielded. - - Yields: - All the values associated with keys (with given prefix) in the trie. - - Raises: - KeyError: If ``prefix`` does not match any node. - """ - node, _ = self._get_node(prefix) - for _, value in node.iterate(list(self.__path_from_key(prefix)), - shallow, self._iteritems): - yield value - - def items(self, prefix=_SENTINEL, shallow=False): - """Returns a list of ``(key, value)`` pairs in given subtrie. - - This is equivalent to constructing a list from generator returned by - :func:`Trie.iteritems` which see for more detailed documentation. - """ - return list(self.iteritems(prefix=prefix, shallow=shallow)) - - def keys(self, prefix=_SENTINEL, shallow=False): - """Returns a list of all the keys, with given prefix, in the trie. - - This is equivalent to constructing a list from generator returned by - :func:`Trie.iterkeys` which see for more detailed documentation. - """ - return list(self.iterkeys(prefix=prefix, shallow=shallow)) - - def values(self, prefix=_SENTINEL, shallow=False): - """Returns a list of values in given subtrie. - - This is equivalent to constructing a list from generator returned by - :func:`Trie.iterivalues` which see for more detailed documentation. - """ - return list(self.itervalues(prefix=prefix, shallow=shallow)) - - # pylint: enable=arguments-differ - - def __len__(self): - """Returns number of values in a trie. - - Note that this method is expensive as it iterates over the whole trie. - """ - return sum(1 for _ in self.itervalues()) - - def __nonzero__(self): - return bool(self._root) - - HAS_VALUE = 1 - HAS_SUBTRIE = 2 - - def has_node(self, key): - """Returns whether given node is in the trie. - - Return value is a bitwise or of ``HAS_VALUE`` and ``HAS_SUBTRIE`` - constants indicating node has a value associated with it and that it is - a prefix of another existing key respectively. Both of those are - independent of each other and all of the four combinations are possible. - For example:: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo/bar'] = 'Bar' - >>> t['foo/bar/baz'] = 'Baz' - >>> t.has_node('qux') == 0 - True - >>> t.has_node('foo/bar/baz') == pygtrie.Trie.HAS_VALUE - True - >>> t.has_node('foo') == pygtrie.Trie.HAS_SUBTRIE - True - >>> t.has_node('foo/bar') == (pygtrie.Trie.HAS_VALUE | - ... pygtrie.Trie.HAS_SUBTRIE) - True - - There are two higher level methods built on top of this one which give - easier interface for the information. :func:`Trie.has_key` and returns - whether node has a value associated with it and :func:`Trie.has_subtrie` - checks whether node is a prefix. Continuing previous example:: - - >>> t.has_key('qux'), t.has_subtrie('qux') - False, False - >>> t.has_key('foo/bar/baz'), t.has_subtrie('foo/bar/baz') - True, False - >>> t.has_key('foo'), t.has_subtrie('foo') - False, True - >>> t.has_key('foo/bar'), t.has_subtrie('foo/bar') - True, True - - Args: - key: A key to look for. - - Returns: - Non-zero if node exists and if it does a bit-field denoting whether - it has a value associated with it and whether it has a subtrie. - """ - try: - node, _ = self._get_node(key) - except KeyError: - return 0 - return ((self.HAS_VALUE * int(node.value is not _SENTINEL)) | - (self.HAS_SUBTRIE * int(bool(node.children)))) - - def has_key(self, key): - """Indicates whether given key has value associated with it. - - See :func:`Trie.has_node` for more detailed documentation. - """ - return bool(self.has_node(key) & self.HAS_VALUE) - - def has_subtrie(self, key): - """Returns whether given key is a prefix of another key in the trie. - - See :func:`Trie.has_node` for more detailed documentation. - """ - return bool(self.has_node(key) & self.HAS_SUBTRIE) - - @staticmethod - def _slice_maybe(key_or_slice): - """Checks whether argument is a slice or a plain key. - - Args: - key_or_slice: A key or a slice to test. - - Returns: - ``(key, is_slice)`` tuple. ``is_slice`` indicates whether - ``key_or_slice`` is a slice and ``key`` is either ``key_or_slice`` - itself (if it's not a slice) or slice's start position. - - Raises: - TypeError: If ``key_or_slice`` is a slice whose stop or step are not - ``None`` In other words, only ``[key:]`` slices are valid. - """ - if isinstance(key_or_slice, slice): - if key_or_slice.stop is not None or key_or_slice.step is not None: - raise TypeError(key_or_slice) - return key_or_slice.start, True - return key_or_slice, False - - def __getitem__(self, key_or_slice): - """Returns value associated with given key or raises KeyError. - - When argument is a single key, value for that key is returned (or - :class:`KeyError` exception is thrown if the node does not exist or has - no value associated with it). - - When argument is a slice, it must be one with only `start` set in which - case the access is identical to :func:`Trie.itervalues` invocation with - prefix argument. - - Example: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo/bar'] = 'Bar' - >>> t['foo/baz'] = 'Baz' - >>> t['qux'] = 'Qux' - >>> t['foo/bar'] - 'Bar' - >>> list(t['foo':]) - ['Baz', 'Bar'] - >>> t['foo'] - Traceback (most recent call last): - ... - pygtrie.ShortKeyError: 'foo' - - Args: - key_or_slice: A key or a slice to look for. - - Returns: - If a single key is passed, a value associated with given key. If - a slice is passed, a generator of values in specified subtrie. - - Raises: - ShortKeyError: If the key has no value associated with it but is - a prefix of some key with a value. Note that - :class:`ShortKeyError` is subclass of :class:`KeyError`. - KeyError: If key has no value associated with it nor is a prefix of - an existing key. - TypeError: If ``key_or_slice`` is a slice but it's stop or step are - not ``None``. - """ - if self._slice_maybe(key_or_slice)[1]: - return self.itervalues(key_or_slice.start) - node, _ = self._get_node(key_or_slice) - if node.value is _SENTINEL: - raise ShortKeyError(key_or_slice) - return node.value - - def _set(self, key, value, only_if_missing=False, clear_children=False): - """Sets value for a given key. - - Args: - key: Key to set value of. - value: Value to set to. - only_if_missing: If ``True``, value won't be changed if the key is - already associated with a value. - clear_children: If ``True``, all children of the node, if any, will - be removed. - - Returns: - Value of the node. - """ - node, _ = self._get_node(key, create=True) - if not only_if_missing or node.value is _SENTINEL: - node.value = value - if clear_children: - node.children.clear() - return node.value - - def __setitem__(self, key_or_slice, value): - """Sets value associated with given key. - - If `key_or_slice` is a key, simply associate it with given value. If it - is a slice (which must have `start` set only), it in addition clears any - subtrie that might have been attached to particular key. For example:: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo/bar'] = 'Bar' - >>> t['foo/baz'] = 'Baz' - >>> t.keys() - ['foo/baz', 'foo/bar'] - >>> t['foo':] = 'Foo' - >>> t.keys() - ['foo'] - - Args: - key_or_slice: A key to look for or a slice. If it is a slice, the - whole subtrie (if present) will be replaced by a single node - with given value set. - value: Value to set. - - Raises: - TypeError: If key is a slice whose stop or step are not None. - """ - key, is_slice = self._slice_maybe(key_or_slice) - self._set(key, value, clear_children=is_slice) - - def setdefault(self, key, value=None): - """Sets value of a given node if not set already. Also returns it. - - In contrast to :func:`Trie.__setitem__`, this method does not accept - slice as a key. - """ - return self._set(key, value, only_if_missing=True) - - @staticmethod - def _cleanup_trace(trace): - """Removes empty nodes present on specified trace. - - Args: - trace: Trace to the node to cleanup as returned by - :func:`Trie._get_node`. - """ - i = len(trace) - 1 # len(path) >= 1 since root is always there - step, node = trace[i] - while i and not node: - i -= 1 - parent_step, parent = trace[i] - del parent.children[step] - step, node = parent_step, parent - - def _pop_from_node(self, node, trace, default=_SENTINEL): - """Removes a value from given node. - - Args: - node: Node to get value of. - trace: Trace to that node as returned by :func:`Trie._get_node`. - default: A default value to return if node has no value set. - - Returns: - Value of the node or ``default``. - - Raises: - ShortKeyError: If the node has no value associated with it and - ``default`` has not been given. - """ - if node.value is not _SENTINEL: - value = node.value - node.value = _SENTINEL - self._cleanup_trace(trace) - return value - elif default is _SENTINEL: - raise ShortKeyError() - else: - return default - - def pop(self, key, default=_SENTINEL): - """Deletes value associated with given key and returns it. - - Args: - key: A key to look for. - default: If specified, value that will be returned if given key has - no value associated with it. If not specified, method will - throw KeyError in such cases. - - Returns: - Removed value, if key had value associated with it, or ``default`` - (if given). - - Raises: - ShortKeyError: If ``default`` has not been specified and the key has - no value associated with it but is a prefix of some key with - a value. Note that :class:`ShortKeyError` is subclass of - :class:`KeyError`. - KeyError: If default has not been specified and key has no value - associated with it nor is a prefix of an existing key. - """ - try: - return self._pop_from_node(*self._get_node(key)) - except KeyError: - if default is not _SENTINEL: - return default - raise - - def popitem(self): - """Deletes an arbitrary value from the trie and returns it. - - There is no guarantee as to which item is deleted and returned. Neither - in respect to its lexicographical nor topological order. - - Returns: - ``(key, value)`` tuple indicating deleted key. - - Raises: - KeyError: If the trie is empty. - """ - if not self: - raise KeyError() - node = self._root - trace = [(None, node)] - while node.value is _SENTINEL: - step = next(_iterkeys(node.children)) - node = node.children[step] - trace.append((step, node)) - return (self._key_from_path((step for step, _ in trace[1:])), - self._pop_from_node(node, trace)) - - def __delitem__(self, key_or_slice): - """Deletes value associated with given key or raises KeyError. - - If argument is a key, value associated with it is deleted. If the key - is also a prefix, its descendents are not affected. On the other hand, - if the argument is a slice (in which case it must have only start set), - the whole subtrie is removed. For example:: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo'] = 'Foo' - >>> t['foo/bar'] = 'Bar' - >>> t['foo/bar/baz'] = 'Baz' - >>> del t['foo/bar'] - >>> t.keys() - ['foo', 'foo/bar/baz'] - >>> del t['foo':] - >>> t.keys() - [] - - Args: - key_or_slice: A key to look for or a slice. If key is a slice, the - whole subtrie will be removed. - - Raises: - ShortKeyError: If the key has no value associated with it but is - a prefix of some key with a value. This is not thrown is - key_or_slice is a slice -- in such cases, the whole subtrie is - removed. Note that :class:`ShortKeyError` is subclass of - :class:`KeyError`. - KeyError: If key has no value associated with it nor is a prefix of - an existing key. - TypeError: If key is a slice whose stop or step are not ``None``. - """ - key, is_slice = self._slice_maybe(key_or_slice) - node, trace = self._get_node(key) - if is_slice: - node.children.clear() - elif node.value is _SENTINEL: - raise ShortKeyError(key) - node.value = _SENTINEL - self._cleanup_trace(trace) - - def prefixes(self, key): - """Walks towards the node specified by key and yields all found items. - - Example: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo'] = 'Foo' - >>> t['foo/bar/baz'] = 'Baz' - >>> list(t.prefixes('foo/bar/baz/qux')) - [('foo', 'Foo'), ('foo/bar/baz', 'Baz')] - >>> list(t.prefixes('does/not/exist')) - [] - - Args: - key: Key to look for. - - Yields: - ``(k, value)`` pairs denoting keys with associated values - encountered on the way towards the specified key. - """ - node = self._root - path = self.__path_from_key(key) - pos = 0 - while True: - if node.value is not _SENTINEL: - yield self._key_from_path(path[:pos]), node.value - if pos == len(path): - break - node = node.children.get(path[pos]) - if not node: - break - pos += 1 - - def shortest_prefix(self, key): - """Finds the shortest prefix of a key with a value. - - This is equivalent to taking the first object yielded by - :func:`Trie.prefixes` with a default of `(None, None)` if said method - yields no items. As an added bonus, the pair in that case will be - a falsy value (as opposed to regular two-element tuple of ``None`` - values). - - Example: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo'] = 'Foo' - >>> t['foo/bar/baz'] = 'Baz' - >>> t.shortest_prefix('foo/bar/baz/qux') - ('foo', 'Foo') - >>> t.shortest_prefix('does/not/exist') - (None, None) - >>> bool(t.shortest_prefix('does/not/exist')) - False - - Args: - key: Key to look for. - - Returns: - ``(k, value)`` where ``k`` is the shortest prefix of ``key`` (it may - equal ``key``) and ``value`` is a value associated with that key. - If no node is found, ``(None, None)`` is returned. - """ - return next(self.prefixes(key), _NONE_PAIR) - - def longest_prefix(self, key): - """Finds the longest prefix of a key with a value. - - This is equivalent to taking the last object yielded by - :func:`Trie.prefixes` with a default of `(None, None)` if said method - yields no items. As an added bonus, the pair in that case will be - a falsy value (as opposed to regular two-element tuple of ``None`` - values). - - Example: - - >>> import pygtrie - >>> t = pygtrie.StringTrie() - >>> t['foo'] = 'Foo' - >>> t['foo/bar/baz'] = 'Baz' - >>> t.longest_prefix('foo/bar/baz/qux') - ('foo/bar/baz', 'Baz') - >>> t.longest_prefix('does/not/exist') - (None, None) - >>> bool(t.longest_prefix('does/not/exist')) - False - - Args: - key: Key to look for. - - Returns: - ``(k, value)`` where ``k`` is the longest prefix of ``key`` (it may - equal ``key``) and ``value`` is a value associated with that key. - If no node is found, ``(None, None)`` is returned. - """ - ret = _NONE_PAIR - for ret in self.prefixes(key): - pass - return ret - - def __eq__(self, other): - return self._root == other._root # pylint: disable=protected-access - - def __ne__(self, other): - return self._root != other._root # pylint: disable=protected-access - - def __str__(self): - return 'Trie(%s)' % ( - ', '.join('%s: %s' % item for item in self.iteritems())) - - def __repr__(self): - if self: - return 'Trie((%s,))' % ( - ', '.join('(%r, %r)' % item for item in self.iteritems())) - else: - return 'Trie()' - - def __path_from_key(self, key): - """Converts a user visible key object to internal path representation. - - Args: - key: User supplied key or ``_SENTINEL``. - - Returns: - An empty tuple if ``key`` was ``_SENTINEL``, otherwise whatever - :func:`Trie._path_from_key` returns. - - Raises: - TypeError: If ``key`` is of invalid type. - """ - return () if key is _SENTINEL else self._path_from_key(key) - - def _path_from_key(self, key): # pylint: disable=no-self-use - """Converts a user visible key object to internal path representation. - - The default implementation simply returns key. - - Args: - key: User supplied key. - - Returns: - A path, which is an iterable of steps. Each step must be hashable. - - Raises: - TypeError: If key is of invalid type. - """ - return key - - def _key_from_path(self, path): # pylint: disable=no-self-use - """Converts an internal path into a user visible key object. - - The default implementation creates a tuple from the path. - - Args: - path: Internal path representation. - Returns: - A user visible key object. - """ - return tuple(path) - - def traverse(self, node_factory, prefix=_SENTINEL): - """Traverses the tree using node_factory object. - - node_factory is a callable function which accepts (path_conv, path, - children, value=...) arguments, where path_conv is a lambda converting - path representation to key, path is the path to this node, children is - an iterable of children nodes constructed by node_factory, optional - value is the value associated with the path. - - node_factory's children argument is a generator which has a few - consequences: - - * To traverse into node's children, the generator must be iterated over. - This can by accomplished by a simple "children = list(children)" - statement. - * Ignoring the argument allows node_factory to stop the traversal from - going into the children of the node. In other words, whole subtrie - can be removed from traversal if node_factory chooses so. - * If children is stored as is (i.e. as a generator) when it is iterated - over later on it will see state of the trie as it is during the - iteration and not when traverse method was called. - - :func:`Trie.traverse` has two advantages over :func:`Trie.iteritems` and - similar methods: - - 1. it allows subtries to be skipped completely when going through the - list of nodes based on the property of the parent node; and - - 2. it represents structure of the trie directly making it easy to - convert structure into a different representation. - - For example, the below snippet prints all files in current directory - counting how many HTML files were found but ignores hidden files and - directories (i.e. those whose names start with a dot):: - - import os - import pygtrie - - t = pygtrie.StringTrie(separator=os.sep) - - # Construct a trie with all files in current directory and all - # of its sub-directories. Files get set a True value. - # Directories are represented implicitly by being prefixes of - # files. - for root, _, files in os.walk('.'): - for name in files: t[os.path.join(root, name)] = True - - def traverse_callback(path_conv, path, children, is_file=False): - if path and path[-1] != '.' and path[-1][0] == '.': - # Ignore hidden directory (but accept root node and '.') - return 0 - elif is_file: - print path_conv(path) - return int(path[-1].endswith('.html')) - else: - # Otherwise, it's a directory. Traverse into children. - return sum(int(is_html) for is_html in children) - - print t.traverse(traverse_callback) - - As documented, ignoring the children argument causes subtrie to be - omitted and not walked into. - - In the next example, the trie is converted to a tree representation - where child nodes include a pointer to their parent. As before, hidden - files and directories are ignored:: - - import os - import pygtrie - - t = pygtrie.StringTrie(separator=os.sep) - for root, _, files in os.walk('.'): - for name in files: t[os.path.join(root, name)] = True - - class File(object): - def __init__(self, name): - self.name = name - self.parent = None - - class Directory(File): - def __init__(self, name, children): - super(Directory, self).__init__(name) - self._children = children - for child in children: - child.parent = self - - def traverse_callback(path_conv, path, children, is_file=False): - if not path or path[-1] == '.' or path[-1][0] != '.': - if is_file: - return File(path[-1]) - children = filter(None, children) - return Directory(path[-1] if path else '', children) - - root = t.traverse(traverse_callback) - - Note: Unlike iterators, traverse method uses stack recursion which means - that using it on deep tries may lead to a RuntimeError exception thrown - once Python's maximum recursion depth is reached. - - Args: - node_factory: Makes opaque objects from the keys and values of the - trie. - prefix: Prefix for node to start traversal, by default starts at - root. - - Returns: - Node object constructed by node_factory corresponding to the root - node. - - """ - node, _ = self._get_node(prefix) - return node.traverse(node_factory, self._key_from_path, - list(self.__path_from_key(prefix)), - self._iteritems) - -class CharTrie(Trie): - """A variant of a :class:`pygtrie.Trie` which accepts strings as keys. - - The only difference between :class:`pygtrie.CharTrie` and - :class:`pygtrie.Trie` is that when :class:`pygtrie.CharTrie` returns keys - back to the client (for instance in keys() method is called), those keys are - returned as strings. - - Canonical example where this class can be used is a dictionary of words in - a natural language. For example:: - - >>> import pygtrie - >>> t = pygtrie.CharTrie() - >>> t['wombat'] = True - >>> t['woman'] = True - >>> t['man'] = True - >>> t['manhole'] = True - >>> t.has_subtrie('wo') - True - >>> t.has_key('man') - True - >>> t.has_subtrie('man') - True - >>> t.has_subtrie('manhole') - False - """ - - def _key_from_path(self, path): - return ''.join(path) - - -class StringTrie(Trie): - """:class:`pygtrie.Trie` variant accepting strings with a separator as keys. - - The trie accepts strings as keys which are split into components using - a separator specified during initialisation ("/" by default). - - Canonical example where this class can be used is when keys are paths. For - example, it could map from a path to a request handler:: - - import pygtrie - - def handle_root(): pass - def handle_admin(): pass - def handle_admin_images(): pass - - handlers = pygtrie.StringTrie() - handlers[''] = handle_root - handlers['/admin'] = handle_admin - handlers['/admin/images'] = handle_admin_images - - request_path = '/admin/images/foo' - - handler = handlers.longest_prefix(request_path) - """ - - def __init__(self, *args, **kwargs): - """Initialises the trie. - - Except for a ``separator`` named argument, all other arguments are - interpreted the same way :func:`Trie.update` interprets them. - - Args: - *args: Passed to super class initialiser. - **kwargs: Passed to super class initialiser. - separator: A separator to use when splitting keys into paths used by - the trie. "/" is used if this argument is not specified. This - named argument is not specified on the function's prototype - because of Python's limitations. - """ - separator = kwargs.pop('separator', '/') - if not isinstance(separator, _basestring): - raise TypeError('separator must be a string') - if not separator: - raise ValueError('separator can not be empty') - self._separator = separator - super(StringTrie, self).__init__(*args, **kwargs) - - @classmethod - def fromkeys(cls, keys, value=None, separator='/'): # pylint: disable=arguments-differ - trie = cls(separator=separator) - for key in keys: - trie[key] = value - return trie - - def _path_from_key(self, key): - return key.split(self._separator) - - def _key_from_path(self, path): - return self._separator.join(path) - - -class PrefixSet(_collections.MutableSet): # pylint: disable=abstract-class-not-used - """A set of prefixes. - - :class:`pygtrie.PrefixSet` works similar to a normal set except it is said - to contain a key if the key or it's prefix is stored in the set. For - instance, if "foo" is added to the set, the set contains "foo" as well as - "foobar". - - The set supports addition of elements but does *not* support removal of - elements. This is because there's no obvious consistent and intuitive - behaviour for element deletion. - """ - - def __init__(self, iterable=None, factory=Trie, **kwargs): - """Initialises the prefix set. - - Args: - iterable: A sequence of keys to add to the set. - factory: A function used to create a trie used by the - :class:`pygtrie.PrefixSet`. - kwargs: Additional keyword arguments passed to the factory function. - """ - super(PrefixSet, self).__init__() - trie = factory(**kwargs) - if iterable: - trie.update((key, True) for key in iterable) - self._trie = trie - - def copy(self): - """Returns a copy of the prefix set.""" - return self.__class__(self._trie) - - def clear(self): - """Removes all keys from the set.""" - self._trie.clear() - - def __contains__(self, key): - """Checks whether set contains key or its prefix.""" - return bool(self._trie.shortest_prefix(key)[1]) - - def __iter__(self): - """Return iterator over all prefixes in the set. - - See :func:`PrefixSet.iter` method for more info. - """ - return self._trie.iterkeys() - - def iter(self, prefix=_SENTINEL): - """Iterates over all keys in the set optionally starting with a prefix. - - Since a key does not have to be explicitly added to the set to be an - element of the set, this method does not iterate over all possible keys - that the set contains, but only over the shortest set of prefixes of all - the keys the set contains. - - For example, if "foo" has been added to the set, the set contains also - "foobar", but this method will *not* iterate over "foobar". - - If ``prefix`` argument is given, method will iterate over keys with - given prefix only. The keys yielded from the function if prefix is - given does not have to be a subset (in mathematical sense) of the keys - yielded when there is not prefix. This happens, if the set contains - a prefix of the given prefix. - - For example, if only "foo" has been added to the set, iter method called - with no arguments will yield "foo" only. However, when called with - "foobar" argument, it will yield "foobar" only. - """ - if prefix is _SENTINEL: - return iter(self) - elif self._trie.has_node(prefix): - return self._trie.iterkeys(prefix=prefix) - elif prefix in self: - # Make sure the type of returned keys is consistent. - # pylint: disable=protected-access - return self._trie._key_from_path(self._trie._path_from_key(prefix)), - else: - return () - - def __len__(self): - """Returns number of keys stored in the set. - - Since a key does not have to be explicitly added to the set to be an - element of the set, this method does not count over all possible keys - that the set contains (since that would be infinity), but only over the - shortest set of prefixes of all the keys the set contains. - - For example, if "foo" has been added to the set, the set contains also - "foobar", but this method will *not* count "foobar". - - """ - return len(self._trie) - - def add(self, key): - """Adds given key to the set. - - If the set already contains prefix of the key being added, this - operation has no effect. If the key being added is a prefix of some - existing keys in the set, those keys are deleted and replaced by - a single entry for the key being added. - - For example, if the set contains key "foo" adding a key "foobar" does - not change anything. On the other hand, if the set contains keys - "foobar" and "foobaz", adding a key "foo" will replace those two keys - with a single key "foo". - - This makes a difference when iterating over the keys or counting number - of keys. Counter intuitively, adding of a key can *decrease* size of - the set. - - Args: - key: Key to add. - """ - if key not in self: - self._trie[key:] = True - - def discard(self, key): - raise NotImplementedError( - 'Removing keys from PrefixSet is not implemented.') - - def remove(self, key): - raise NotImplementedError( - 'Removing keys from PrefixSet is not implemented.') - - def pop(self): - raise NotImplementedError( - 'Removing keys from PrefixSet is not implemented.')