Optimize check for existing files in download_from_google_storage

The first gsutil call triggers a network request, which makes the
download script slow even in the best case where everything is already
up to date. This CL refactors the download script and moves the first
gsutil call to after the local check whether the sha1s match.

1) This turns the input acquisition into a generator and buffers the
sha1/file pairs in a list before multithreading starts.

2) This checks the sha1s and files sequentially and bails out early if
all of them match. In Chrome-land, we usually call this script with
only one file; a few call sites pass around 4. The check could also be
parallelized if the need arises.

3) The initial gsutil call, which ensures gsutil is up to date, is
moved to right before the multithreaded downloads (see the sketch
after this list).
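
The reordered flow, as a minimal self-contained sketch. The names
sketch_download, _local_sha1 and the download callable are hypothetical
stand-ins; the real logic lives in download_from_google_storage() in
the diff below:

  import hashlib
  import os

  def _local_sha1(path):
    # Hash the whole file in one go; good enough for a sketch. The
    # real get_sha1() is equivalent in effect.
    with open(path, 'rb') as f:
      return hashlib.sha1(f.read()).hexdigest()

  def sketch_download(input_data, gsutil, download, force=False):
    # input_data: list of (sha1, path) tuples, buffered from a
    # generator (step 1).
    # 2) Sequential check: bail out before any network traffic if all
    #    local files already match their pinned sha1s.
    if not force and all(
        os.path.exists(path) and _local_sha1(path) == sha1
        for sha1, path in input_data):
      return 0
    # 3) Trigger gsutil's self-update once, up front, so the parallel
    #    workers don't race on it. This is the first network call.
    if len(input_data) > 1:
      gsutil.check_call('version')
    for sha1, path in input_data:
      download(sha1, path)
    return 0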

One call to download_from_google_storage for an existing 500MB file
takes 2.3s before this CL and 1.2s after (most of the remaining time
is spent computing the file's sha1 sum).
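
For reference, the sha1 computation that dominates the remaining time
looks roughly like this (a sketch; the actual get_sha1() in
download_from_google_storage.py may differ in details such as chunk
size):

  import hashlib

  def sha1_of_file(path, chunk_size=1024 * 1024):
    # Stream the file in 1MB chunks so a 500MB file never has to fit
    # in memory at once; the hashing itself is what costs the ~1s.
    digest = hashlib.sha1()
    with open(path, 'rb') as f:
      for chunk in iter(lambda: f.read(chunk_size), b''):
        digest.update(chunk)
    return digest.hexdigest()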

Example timings for a full gclient runhooks (when everything is up to date):

Chromium: 32s before, 12s after
V8: 12s before, 3s after

Bug: 776311
Change-Id: Ia7715a6af84b1b336455ea88494d399bdb050317
Reviewed-on: https://chromium-review.googlesource.com/897562
Commit-Queue: Michael Achenbach <machenbach@chromium.org>
Reviewed-by: Sergiy Byelozyorov <sergiyb@chromium.org>
Reviewed-by: Ryan Tseng <hinoka@chromium.org>

@@ -148,9 +148,8 @@ def get_sha1(filename):
 # Download-specific code starts here


-def enumerate_work_queue(input_filename, work_queue, directory,
-                         recursive, ignore_errors, output, sha1_file,
-                         auto_platform):
+def enumerate_input(input_filename, directory, recursive, ignore_errors, output,
+                    sha1_file, auto_platform):
   if sha1_file:
     if not os.path.exists(input_filename):
       if not ignore_errors:
@@ -159,18 +158,17 @@ def enumerate_work_queue(input_filename, work_queue, directory,
     with open(input_filename, 'rb') as f:
       sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
       if sha1_match:
-        work_queue.put((sha1_match.groups(1)[0], output))
-        return 1
+        yield (sha1_match.groups(1)[0], output)
+        return
     if not ignore_errors:
       raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
     print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
-    return 0
+    return

   if not directory:
-    work_queue.put((input_filename, output))
-    return 1
+    yield (input_filename, output)
+    return

-  work_queue_size = 0
   for root, dirs, files in os.walk(input_filename):
     if not recursive:
       for item in dirs[:]:
@@ -199,14 +197,11 @@ def enumerate_work_queue(input_filename, work_queue, directory,
         with open(full_path, 'rb') as f:
           sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
         if sha1_match:
-          work_queue.put(
-              (sha1_match.groups(1)[0], full_path.replace('.sha1', '')))
-          work_queue_size += 1
+          yield (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))
         else:
           if not ignore_errors:
             raise InvalidFileError('No sha1 sum found in %s.' % filename)
           print >> sys.stderr, 'No sha1 sum found in %s.' % filename
-  return work_queue_size


 def _validate_tar_file(tar, prefix):
def _validate_tar_file(tar, prefix):
@@ -233,7 +228,7 @@ def _downloader_worker_thread(thread_num, q, force, base_url,
                   thread_num, output_filename))
         ret_codes.put((1, '%s is not a tar.gz archive.' % (output_filename)))
         continue
-      extract_dir = output_filename[0:len(output_filename)-7]
+      extract_dir = output_filename[:-len('.tar.gz')]
     if os.path.exists(output_filename) and not force:
       if not extract or os.path.exists(extract_dir):
         if get_sha1(output_filename) == input_sha1_sum:
@@ -344,9 +339,57 @@ class PrinterThread(threading.Thread):
       print line


+def _data_exists(input_sha1_sum, output_filename, extract):
+  """Returns True if the data exists locally and matches the sha1.
+
+  This conservatively returns False for error cases.
+
+  Args:
+    input_sha1_sum: Expected sha1 stored on disk.
+    output_filename: The file to potentially download later. Its sha1 will be
+        compared to input_sha1_sum.
+    extract: Whether or not a downloaded file should be extracted. If the file
+        is not extracted, this just compares the sha1 of the file. If the file
+        is to be extracted, this only compares the sha1 of the target archive if
+        the target directory already exists. The content of the target directory
+        is not checked.
+  """
+  extract_dir = None
+  if extract:
+    if not output_filename.endswith('.tar.gz'):
+      # This will cause an error later. Conservatively return False to not
+      # bail out too early.
+      return False
+    extract_dir = output_filename[:-len('.tar.gz')]
+  if os.path.exists(output_filename):
+    if not extract or os.path.exists(extract_dir):
+      if get_sha1(output_filename) == input_sha1_sum:
+        return True
+  return False
+
+
 def download_from_google_storage(
     input_filename, base_url, gsutil, num_threads, directory, recursive,
     force, output, ignore_errors, sha1_file, verbose, auto_platform, extract):
+  # Tuples of sha1s and paths.
+  input_data = list(enumerate_input(
+      input_filename, directory, recursive, ignore_errors, output, sha1_file,
+      auto_platform))
+
+  # Sequentially check for the most common case and see if we can bail out
+  # early before making any slow calls to gsutil.
+  if not force and all(
+      _data_exists(sha1, path, extract) for sha1, path in input_data):
+    return 0
+
+  # Call this once to ensure gsutil's update routine is called only once. Only
+  # needs to be done if we'll process input data in parallel, which can lead
+  # to a race in gsutil's self-update on the first call. Note, this causes a
+  # network call, therefore any fast bailout should be done before this point.
+  if len(input_data) > 1:
+    gsutil.check_call('version')
+
   # Start up all the worker threads.
   all_threads = []
   download_start = time.time()
@@ -366,10 +409,9 @@ def download_from_google_storage(
   printer_thread.daemon = True
   printer_thread.start()

-  # Enumerate our work queue.
-  work_queue_size = enumerate_work_queue(
-      input_filename, work_queue, directory, recursive,
-      ignore_errors, output, sha1_file, auto_platform)
+  # Populate our work queue.
+  for sha1, path in input_data:
+    work_queue.put((sha1, path))

   for _ in all_threads:
     work_queue.put((None, None))  # Used to tell worker threads to stop.
@@ -389,7 +431,7 @@ def download_from_google_storage(

   # Only print summary if any work was done.
   if printer_thread.did_print_anything:
     print 'Downloading %d files took %1f second(s)' % (
-        work_queue_size, time.time() - download_start)
+        len(input_data), time.time() - download_start)

   return max_ret_code
@@ -493,7 +535,6 @@ def main(args):
   else:
     parser.error('gsutil not found in %s, bad depot_tools checkout?' %
                  GSUTIL_DEFAULT_PATH)
-  gsutil.check_call('version')  # Call this once to ensure it exists.

   # Passing in -g/--config will run our copy of GSUtil, then quit.
   if options.config:
@@ -501,6 +542,7 @@ def main(args):
     print 'If you do not have a project ID, enter "0" when asked for one.'
     print '===End note from depot_tools==='
     print
+    gsutil.check_call('version')
     return gsutil.call('config')

   if not args:

@@ -202,19 +202,20 @@ class DownloadTests(unittest.TestCase):
     shutil.rmtree(self.temp_dir)

   def test_enumerate_files_non_recursive(self):
-    queue_size = download_from_google_storage.enumerate_work_queue(
-        self.base_path, self.queue, True, False, False, None, False, False)
+    for item in download_from_google_storage.enumerate_input(
+        self.base_path, True, False, False, None, False, False):
+      self.queue.put(item)
     expected_queue = [
         ('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe',
          os.path.join(self.base_path, 'rootfolder_text.txt')),
         ('7871c8e24da15bad8b0be2c36edc9dc77e37727f',
          os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt'))]
     self.assertEqual(sorted(expected_queue), sorted(self.queue.queue))
-    self.assertEqual(queue_size, 2)

   def test_enumerate_files_recursive(self):
-    queue_size = download_from_google_storage.enumerate_work_queue(
-        self.base_path, self.queue, True, True, False, None, False, False)
+    for item in download_from_google_storage.enumerate_input(
+        self.base_path, True, True, False, None, False, False):
+      self.queue.put(item)
     expected_queue = [
         ('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe',
          os.path.join(self.base_path, 'rootfolder_text.txt')),
@@ -223,7 +224,6 @@ class DownloadTests(unittest.TestCase):
         ('b5415aa0b64006a95c0c409182e628881d6d6463',
          os.path.join(self.base_path, 'subfolder', 'subfolder_text.txt'))]
     self.assertEqual(sorted(expected_queue), sorted(self.queue.queue))
-    self.assertEqual(queue_size, 3)

   def test_download_worker_single_file(self):
     sha1_hash = self.lorem_ipsum_sha1
@@ -337,7 +337,7 @@ class DownloadTests(unittest.TestCase):
     sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
     input_filename = '%s/%s' % (self.base_url, sha1_hash)
     output_filename = os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt')
-    self.gsutil.add_expected(0, '', '')
+    self.gsutil.add_expected(0, '', '')  # ls
     self.gsutil.add_expected(101, '', 'Test error message.')
     code = download_from_google_storage.download_from_google_storage(
         input_filename=sha1_hash,
@@ -392,6 +392,7 @@ class DownloadTests(unittest.TestCase):
     sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
     input_filename = '%s/%s' % (self.base_url, sha1_hash)
     output_filename = os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt')
+    self.gsutil.add_expected(0, '', '')  # version
     self.gsutil.add_expected(0, '', '')  # ls
     self.gsutil.add_expected(0, '', '', lambda: shutil.copyfile(
         self.lorem_ipsum, output_filename))  # cp
@@ -410,6 +411,7 @@ class DownloadTests(unittest.TestCase):
         auto_platform=False,
         extract=False)
     expected_calls = [
+        ('check_call', ('version',)),
         ('check_call',
          ('ls', input_filename)),
         ('check_call',
