From 1f067b88df42154d71bcadeddec7e830fb7d2979 Mon Sep 17 00:00:00 2001 From: Michael Achenbach Date: Fri, 2 Feb 2018 11:59:16 +0100 Subject: [PATCH] Optimize check for existing files in download_from_google_storage The first call to gsutil causes a network call and makes the download script slow also in the most optimal cases. This CL refactors the download script and moves the first gsutil call after checking locally if sha1s match. 1) This turns the input acquisition into a generator and buffers the files and sha1s in a list before multithreading. 2) This sequentially checks the sha1s and files and bails out early if all match. In Chrome-land, we usually call this script with only one file. There are some cases with around 4. This could also be parallelized if the need arises. 3) The initial gsutil check, which ensures gsutil is updated, is moved right in front of the multithreaded downloads. The performance of one call to download_from_google_storage for an existing 500MB file is 2.3s before this CL and 1.2s after (most of the remaining time left is spent for making sha1sum). 
Example for full gclient runhooks (when everything is up-to-date): Chromium: 32s before, 12s after V8: 12s before, 3s after Bug: 776311 Change-Id: Ia7715a6af84b1b336455ea88494d399bdb050317 Reviewed-on: https://chromium-review.googlesource.com/897562 Commit-Queue: Michael Achenbach Reviewed-by: Sergiy Byelozyorov Reviewed-by: Ryan Tseng --- download_from_google_storage.py | 82 ++++++++++++++----- .../download_from_google_storage_unittest.py | 16 ++-- 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/download_from_google_storage.py b/download_from_google_storage.py index c6ed7faa2..f484f4f51 100755 --- a/download_from_google_storage.py +++ b/download_from_google_storage.py @@ -148,9 +148,8 @@ def get_sha1(filename): # Download-specific code starts here -def enumerate_work_queue(input_filename, work_queue, directory, - recursive, ignore_errors, output, sha1_file, - auto_platform): +def enumerate_input(input_filename, directory, recursive, ignore_errors, output, + sha1_file, auto_platform): if sha1_file: if not os.path.exists(input_filename): if not ignore_errors: @@ -159,18 +158,17 @@ def enumerate_work_queue(input_filename, work_queue, directory, with open(input_filename, 'rb') as f: sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip()) if sha1_match: - work_queue.put((sha1_match.groups(1)[0], output)) - return 1 + yield (sha1_match.groups(1)[0], output) + return if not ignore_errors: raise InvalidFileError('No sha1 sum found in %s.' % input_filename) print >> sys.stderr, 'No sha1 sum found in %s.' 
% input_filename - return 0 + return if not directory: - work_queue.put((input_filename, output)) - return 1 + yield (input_filename, output) + return - work_queue_size = 0 for root, dirs, files in os.walk(input_filename): if not recursive: for item in dirs[:]: @@ -199,14 +197,11 @@ def enumerate_work_queue(input_filename, work_queue, directory, with open(full_path, 'rb') as f: sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip()) if sha1_match: - work_queue.put( - (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))) - work_queue_size += 1 + yield (sha1_match.groups(1)[0], full_path.replace('.sha1', '')) else: if not ignore_errors: raise InvalidFileError('No sha1 sum found in %s.' % filename) print >> sys.stderr, 'No sha1 sum found in %s.' % filename - return work_queue_size def _validate_tar_file(tar, prefix): @@ -233,7 +228,7 @@ def _downloader_worker_thread(thread_num, q, force, base_url, thread_num, output_filename)) ret_codes.put((1, '%s is not a tar.gz archive.' % (output_filename))) continue - extract_dir = output_filename[0:len(output_filename)-7] + extract_dir = output_filename[:-len('.tar.gz')] if os.path.exists(output_filename) and not force: if not extract or os.path.exists(extract_dir): if get_sha1(output_filename) == input_sha1_sum: @@ -344,9 +339,57 @@ class PrinterThread(threading.Thread): print line +def _data_exists(input_sha1_sum, output_filename, extract): + """Returns True if the data exists locally and matches the sha1. + + This conservatively returns False for error cases. + + Args: + input_sha1_sum: Expected sha1 stored on disk. + output_filename: The file to potentially download later. Its sha1 will be + compared to input_sha1_sum. + extract: Whether or not a downloaded file should be extracted. If the file + is not extracted, this just compares the sha1 of the file. If the file + is to be extracted, this only compares the sha1 of the target archive if + the target directory already exists. 
The content of the target directory + is not checked. + """ + extract_dir = None + if extract: + if not output_filename.endswith('.tar.gz'): + # This will cause an error later. Conservatively return False to not bail + out too early. + return False + extract_dir = output_filename[:-len('.tar.gz')] + if os.path.exists(output_filename): + if not extract or os.path.exists(extract_dir): + if get_sha1(output_filename) == input_sha1_sum: + return True + return False + + def download_from_google_storage( input_filename, base_url, gsutil, num_threads, directory, recursive, force, output, ignore_errors, sha1_file, verbose, auto_platform, extract): + + # Tuples of sha1s and paths. + input_data = list(enumerate_input( + input_filename, directory, recursive, ignore_errors, output, sha1_file, + auto_platform)) + + # Sequentially check for the most common case and see if we can bail out + # early before making any slow calls to gsutil. + if not force and all( + _data_exists(sha1, path, extract) for sha1, path in input_data): + return 0 + + # Call this once to ensure gsutil's update routine is called only once. Only + # needs to be done if we'll process input data in parallel, which can lead to + # a race in gsutil's self-update on the first call. Note, this causes a + # network call, therefore any fast bailout should be done before this point. + if len(input_data) > 1: + gsutil.check_call('version') + # Start up all the worker threads. all_threads = [] download_start = time.time() @@ -366,10 +409,9 @@ def download_from_google_storage( printer_thread.daemon = True printer_thread.start() - # Enumerate our work queue. - work_queue_size = enumerate_work_queue( - input_filename, work_queue, directory, recursive, - ignore_errors, output, sha1_file, auto_platform) + # Populate our work queue. + for sha1, path in input_data: + work_queue.put((sha1, path)) for _ in all_threads: work_queue.put((None, None)) # Used to tell worker threads to stop. 
@@ -389,7 +431,7 @@ def download_from_google_storage( # Only print summary if any work was done. if printer_thread.did_print_anything: print 'Downloading %d files took %1f second(s)' % ( - work_queue_size, time.time() - download_start) + len(input_data), time.time() - download_start) return max_ret_code @@ -493,7 +535,6 @@ def main(args): else: parser.error('gsutil not found in %s, bad depot_tools checkout?' % GSUTIL_DEFAULT_PATH) - gsutil.check_call('version') # Call this once to ensure it exists. # Passing in -g/--config will run our copy of GSUtil, then quit. if options.config: @@ -501,6 +542,7 @@ def main(args): print 'If you do not have a project ID, enter "0" when asked for one.' print '===End note from depot_tools===' print + gsutil.check_call('version') return gsutil.call('config') if not args: diff --git a/tests/download_from_google_storage_unittest.py b/tests/download_from_google_storage_unittest.py index 908ebbcf8..c03c911d3 100755 --- a/tests/download_from_google_storage_unittest.py +++ b/tests/download_from_google_storage_unittest.py @@ -202,19 +202,20 @@ class DownloadTests(unittest.TestCase): shutil.rmtree(self.temp_dir) def test_enumerate_files_non_recursive(self): - queue_size = download_from_google_storage.enumerate_work_queue( - self.base_path, self.queue, True, False, False, None, False, False) + for item in download_from_google_storage.enumerate_input( + self.base_path, True, False, False, None, False, False): + self.queue.put(item) expected_queue = [ ('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe', os.path.join(self.base_path, 'rootfolder_text.txt')), ('7871c8e24da15bad8b0be2c36edc9dc77e37727f', os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt'))] self.assertEqual(sorted(expected_queue), sorted(self.queue.queue)) - self.assertEqual(queue_size, 2) def test_enumerate_files_recursive(self): - queue_size = download_from_google_storage.enumerate_work_queue( - self.base_path, self.queue, True, True, False, None, False, False) + for item in 
download_from_google_storage.enumerate_input( + self.base_path, True, True, False, None, False, False): + self.queue.put(item) expected_queue = [ ('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe', os.path.join(self.base_path, 'rootfolder_text.txt')), @@ -223,7 +224,6 @@ class DownloadTests(unittest.TestCase): ('b5415aa0b64006a95c0c409182e628881d6d6463', os.path.join(self.base_path, 'subfolder', 'subfolder_text.txt'))] self.assertEqual(sorted(expected_queue), sorted(self.queue.queue)) - self.assertEqual(queue_size, 3) def test_download_worker_single_file(self): sha1_hash = self.lorem_ipsum_sha1 @@ -337,7 +337,7 @@ class DownloadTests(unittest.TestCase): sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f' input_filename = '%s/%s' % (self.base_url, sha1_hash) output_filename = os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt') - self.gsutil.add_expected(0, '', '') + self.gsutil.add_expected(0, '', '') # ls self.gsutil.add_expected(101, '', 'Test error message.') code = download_from_google_storage.download_from_google_storage( input_filename=sha1_hash, @@ -392,6 +392,7 @@ class DownloadTests(unittest.TestCase): sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f' input_filename = '%s/%s' % (self.base_url, sha1_hash) output_filename = os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt') + self.gsutil.add_expected(0, '', '') # version self.gsutil.add_expected(0, '', '') # ls self.gsutil.add_expected(0, '', '', lambda: shutil.copyfile( self.lorem_ipsum, output_filename)) # cp @@ -410,6 +411,7 @@ class DownloadTests(unittest.TestCase): auto_platform=False, extract=False) expected_calls = [ + ('check_call', ('version',)), ('check_call', ('ls', input_filename)), ('check_call',