diff --git a/metadata/README.md b/metadata/README.md index 94accc471..06772b85a 100644 --- a/metadata/README.md +++ b/metadata/README.md @@ -1,4 +1,18 @@ # Validation for Chromium's Third Party Metadata Files -This directory contains the code to validate Chromium's third party metadata +This directory contains the code to validate Chromium third party metadata files, i.e. `README.chromium` files. + +## Prerequisites +1. Have the Chromium source code +[checked out](https://chromium.googlesource.com/chromium/src/+/main/docs/#checking-out-and-building) on disk +1. Ensure you've run `gclient runhooks` on your source checkout + +## Run +`metadata/scan.py` can be used to search for and validate all Chromium third +party metadata files within a repository. For example, if your `chromium/src` +checkout is at `~/my/path/to/chromium/src`, run the following command from the +root directory of `depot_tools`: +``` +vpython3 --vpython-spec=.vpython3 metadata/scan.py ~/my/path/to/chromium/src +``` diff --git a/metadata/dependency_metadata.py b/metadata/dependency_metadata.py index 48e358d70..0f103cfc5 100644 --- a/metadata/dependency_metadata.py +++ b/metadata/dependency_metadata.py @@ -118,9 +118,10 @@ class DependencyMetadata: ] if repeated_field_info: repeated = ", ".join(repeated_field_info) - error = vr.ValidationError( - f"Multiple entries for the same field: {repeated}.") - error.set_tag(tag="reason", value="repeated field") + error = vr.ValidationError(reason="There is a repeated field.", + additional=[ + f"Repeated fields: {repeated}", + ]) results.append(error) # Check required fields are present. @@ -128,8 +129,8 @@ class DependencyMetadata: for field in required_fields: if field not in self._metadata: field_name = field.get_name() - error = vr.ValidationError(f"Required field '{field_name}' is missing.") - error.set_tag(tag="reason", value="missing required field") + error = vr.ValidationError( + reason=f"Required field '{field_name}' is missing.") results.append(error) # Validate values for all present fields. diff --git a/metadata/discover.py b/metadata/discover.py index f9915da86..10427b57f 100644 --- a/metadata/discover.py +++ b/metadata/discover.py @@ -3,6 +3,7 @@ # found in the LICENSE file. import os +from typing import List # The base names that are known to be Chromium metadata files. _METADATA_FILES = { @@ -13,3 +14,25 @@ _METADATA_FILES = { def is_metadata_file(path: str) -> bool: """Filter for metadata files.""" return os.path.basename(path) in _METADATA_FILES + + +def find_metadata_files(root: str) -> List[str]: + """Finds all metadata files within the given root directory, including + subdirectories. + + Args: + root: the absolute path to the root directory within which to search. + + Returns: + the absolute full paths for all the metadata files within the root + directory. + """ + metadata_files = [] + for item in os.listdir(root): + full_path = os.path.join(root, item) + if is_metadata_file(item): + metadata_files.append(full_path) + elif os.path.isdir(full_path): + metadata_files.extend(find_metadata_files(full_path)) + + return metadata_files diff --git a/metadata/fields/custom/cpe_prefix.py b/metadata/fields/custom/cpe_prefix.py index 26c045d5d..35d8c6669 100644 --- a/metadata/fields/custom/cpe_prefix.py +++ b/metadata/fields/custom/cpe_prefix.py @@ -19,7 +19,7 @@ import metadata.fields.field_types as field_types import metadata.fields.util as util import metadata.validation_result as vr -_PATTERN_CPE_PREFIX = re.compile(r"^cpe:/.+:.+:.+(:.+)*$") +_PATTERN_CPE_PREFIX = re.compile(r"^cpe:(2.3:|/).+:.+:.+(:.+)*$", re.IGNORECASE) class CPEPrefixField(field_types.MetadataField): @@ -28,12 +28,16 @@ class CPEPrefixField(field_types.MetadataField): super().__init__(name="CPEPrefix", one_liner=True) def validate(self, value: str) -> Union[vr.ValidationResult, None]: - """Checks the given value is either 'unknown', or a valid - CPE in the URI form `cpe:/::[:]`. + """Checks the given value is either 'unknown', or conforms to either the + CPE 2.3 or 2.2 format. """ if util.is_unknown(value) or util.matches(_PATTERN_CPE_PREFIX, value): return None return vr.ValidationError( - f"{self._name} is '{value}' - must be either 'unknown', or in the form " - "'cpe:/::[:]'.") + reason=f"{self._name} is invalid.", + additional=[ + "This field should be a CPE (version 2.3 or 2.2), or 'unknown'.", + "Search for a CPE tag for the package at " + "https://nvd.nist.gov/products/cpe/search.", + ]) diff --git a/metadata/fields/custom/date.py b/metadata/fields/custom/date.py index 9509b9728..e50eded68 100644 --- a/metadata/fields/custom/date.py +++ b/metadata/fields/custom/date.py @@ -32,5 +32,8 @@ class DateField(field_types.MetadataField): if util.matches(_PATTERN_DATE, value): return None - return vr.ValidationError( - f"{self._name} is '{value}' - must use format YYYY-MM-DD.") + return vr.ValidationError(reason=f"{self._name} is invalid.", + additional=[ + "The correct format is YYYY-MM-DD.", + f"Current value is '{value}'.", + ]) diff --git a/metadata/fields/custom/license.py b/metadata/fields/custom/license.py index dc1171cf7..5d0afef0a 100644 --- a/metadata/fields/custom/license.py +++ b/metadata/fields/custom/license.py @@ -108,24 +108,22 @@ class LicenseField(field_types.MetadataField): atomic_delimiter=self.VALUE_DELIMITER) for license, allowed in licenses: if util.is_empty(license): - return vr.ValidationError(f"{self._name} has an empty value.") + return vr.ValidationError(reason=f"{self._name} has an empty value.") if not allowed: not_allowlisted.append(license) if not_allowlisted: - template = ("{field_name} has licenses not in the allowlist. If " - "there are multiple license types, separate them with a " - "'{delim}'. Invalid values: {values}.") - message = template.format(field_name=self._name, - delim=self.VALUE_DELIMITER, - values=util.quoted(not_allowlisted)) - return vr.ValidationWarning(message) + return vr.ValidationWarning( + reason=f"{self._name} has a license not in the allowlist.", + additional=[ + f"Separate licenses using a '{self.VALUE_DELIMITER}'.", + f"Licenses not allowlisted: {util.quoted(not_allowlisted)}.", + ]) # Suggest using the standard value delimiter when possible. if (re.search(_PATTERN_VERBOSE_DELIMITER, value) and self.VALUE_DELIMITER not in value): return vr.ValidationWarning( - f"{self._name} should use '{self.VALUE_DELIMITER}' to delimit " - "values.") + reason=f"Separate licenses using a '{self.VALUE_DELIMITER}'.") return None diff --git a/metadata/fields/custom/license_file.py b/metadata/fields/custom/license_file.py index 3a13514b1..dd15c76a5 100644 --- a/metadata/fields/custom/license_file.py +++ b/metadata/fields/custom/license_file.py @@ -42,7 +42,10 @@ class LicenseFileField(field_types.MetadataField): """ if value == _NOT_SHIPPED: return vr.ValidationWarning( - f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.") + reason=f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.", + additional=[ + f"Remove this field and use 'Shipped: {util.NO}' instead.", + ]) invalid_values = [] for path in value.split(self.VALUE_DELIMITER): @@ -51,13 +54,13 @@ class LicenseFileField(field_types.MetadataField): invalid_values.append(path) if invalid_values: - template = ("{field_name} has invalid values. Paths cannot be empty, " - "or include '../'. If there are multiple license files, " - "separate them with a '{delim}'. Invalid values: {values}.") - message = template.format(field_name=self._name, - delim=self.VALUE_DELIMITER, - values=util.quoted(invalid_values)) - return vr.ValidationError(message) + return vr.ValidationError( + reason=f"{self._name} is invalid.", + additional=[ + "File paths cannot be empty, or include '../'.", + f"Separate license files using a '{self.VALUE_DELIMITER}'.", + f"Invalid values: {util.quoted(invalid_values)}.", + ]) return None @@ -84,7 +87,10 @@ class LicenseFileField(field_types.MetadataField): """ if value == _NOT_SHIPPED: return vr.ValidationWarning( - f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.") + reason=f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.", + additional=[ + f"Remove this field and use 'Shipped: {util.NO}' instead.", + ]) invalid_values = [] for license_filename in value.split(self.VALUE_DELIMITER): @@ -100,10 +106,12 @@ class LicenseFileField(field_types.MetadataField): invalid_values.append(license_filepath) if invalid_values: - template = ("{field_name} has invalid values. Failed to find file(s) on" - "local disk. Invalid values: {values}.") - message = template.format(field_name=self._name, - values=util.quoted(invalid_values)) - return vr.ValidationError(message) + missing = ", ".join(invalid_values) + return vr.ValidationError( + reason=f"{self._name} is invalid.", + additional=[ + "Failed to find all license files on local disk.", + f"Missing files:{missing}.", + ]) return None diff --git a/metadata/fields/custom/url.py b/metadata/fields/custom/url.py index 35f47a08d..1cdfaa638 100644 --- a/metadata/fields/custom/url.py +++ b/metadata/fields/custom/url.py @@ -44,13 +44,12 @@ class URLField(field_types.MetadataField): invalid_values.append(url) if invalid_values: - template = ("{field_name} has invalid values. URLs must use a protocol " - "scheme in [http, https, ftp, git]. If there are multiple " - "URLs, separate them with a '{delim}'. Invalid values: " - "{values}.") - message = template.format(field_name=self._name, - delim=self.VALUE_DELIMITER, - values=util.quoted(invalid_values)) - return vr.ValidationError(message) + return vr.ValidationError( + reason=f"{self._name} is invalid.", + additional=[ + "URLs must use a protocol scheme in [http, https, ftp, git].", + f"Separate URLs using a '{self.VALUE_DELIMITER}'.", + f"Invalid values: {util.quoted(invalid_values)}.", + ]) return None diff --git a/metadata/fields/custom/version.py b/metadata/fields/custom/version.py index 47f09a76e..02c995b93 100644 --- a/metadata/fields/custom/version.py +++ b/metadata/fields/custom/version.py @@ -40,12 +40,18 @@ class VersionField(field_types.MetadataField): """ if value == "0" or util.is_unknown(value): return vr.ValidationWarning( - f"{self._name} is '{value}' - use 'N/A' if this package does not " - "version or is versioned by date or revision.") + reason=f"{self._name} is '{value}'.", + additional=[ + "Set this field to 'N/A' if this package does not version or is " + "versioned by date or revision.", + ]) if util.is_empty(value): return vr.ValidationError( - f"{self._name} is empty - use 'N/A' if this package is versioned by " - "date or revision.") + reason=f"{self._name} is empty.", + additional=[ + "Set this field to 'N/A' if this package does not version or is " + "versioned by date or revision.", + ]) return None diff --git a/metadata/fields/field_types.py b/metadata/fields/field_types.py index 21f8ebe1e..48f26bc64 100644 --- a/metadata/fields/field_types.py +++ b/metadata/fields/field_types.py @@ -66,7 +66,7 @@ class FreeformTextField(MetadataField): def validate(self, value: str) -> Union[vr.ValidationResult, None]: """Checks the given value has at least one non-whitespace character.""" if util.is_empty(value): - return vr.ValidationError(f"{self._name} is empty.") + return vr.ValidationError(reason=f"{self._name} is empty.") return None @@ -83,8 +83,15 @@ class YesNoField(MetadataField): if util.matches(_PATTERN_STARTS_WITH_YES_OR_NO, value): return vr.ValidationWarning( - f"{self._name} is '{value}' - should be only {util.YES} or {util.NO}." - ) + reason=f"{self._name} is invalid.", + additional=[ + f"This field should be only {util.YES} or {util.NO}.", + f"Current value is '{value}'.", + ]) return vr.ValidationError( - f"{self._name} is '{value}' - must be {util.YES} or {util.NO}.") + reason=f"{self._name} is invalid.", + additional=[ + f"This field must be {util.YES} or {util.NO}.", + f"Current value is '{value}'.", + ]) \ No newline at end of file diff --git a/metadata/scan.py b/metadata/scan.py new file mode 100644 index 000000000..b4e6b2bdd --- /dev/null +++ b/metadata/scan.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# Copyright 2023 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +from __future__ import print_function + +import argparse +from collections import defaultdict +import os +import sys + +_THIS_DIR = os.path.abspath(os.path.dirname(__file__)) +# The repo's root directory. +_ROOT_DIR = os.path.abspath(os.path.join(_THIS_DIR, "..")) + +# Add the repo's root directory for clearer imports. +sys.path.insert(0, _ROOT_DIR) + +import metadata.discover +import metadata.validate + + +def parse_args() -> argparse.Namespace: + """Helper to parse args to this script.""" + parser = argparse.ArgumentParser() + repo_root_dir = parser.add_argument( + "repo_root_dir", + help=("The path to the repository's root directory, which will be " + "scanned for Chromium metadata files, e.g. '~/chromium/src'."), + ) + + args = parser.parse_args() + + # Check the repo root directory exists. + src_dir = os.path.abspath(args.repo_root_dir) + if not os.path.exists(src_dir) or not os.path.isdir(src_dir): + raise argparse.ArgumentError( + repo_root_dir, + f"Invalid repository root directory '{src_dir}' - not found", + ) + + return args + + +def main() -> None: + """Runs validation on all metadata files within the directory specified by the + repo_root_dir arg. + """ + config = parse_args() + src_dir = os.path.abspath(config.repo_root_dir) + + metadata_files = metadata.discover.find_metadata_files(src_dir) + file_count = len(metadata_files) + print(f"Found {file_count} metadata files.") + + invalid_file_count = 0 + + # Key is constructed from the result severity and reason; + # Value is a list of files affected by that reason at that severity. + all_reasons = defaultdict(list) + for filepath in metadata_files: + file_results = metadata.validate.validate_file(filepath, + repo_root_dir=src_dir) + invalid = False + if file_results: + relpath = os.path.relpath(filepath, start=src_dir) + print(f"\n{len(file_results)} problem(s) in {relpath}:") + for result in file_results: + print(f" {result}") + summary_key = "{severity} - {reason}".format( + severity=result.get_severity_prefix(), reason=result.get_reason()) + all_reasons[summary_key].append(relpath) + if result.is_fatal(): + invalid = True + + if invalid: + invalid_file_count += 1 + + print("\n\nDone.\nSummary:") + for summary_key, affected_files in all_reasons.items(): + count = len(affected_files) + plural = "s" if count > 1 else "" + print(f"\n {count} file{plural}: {summary_key}") + for affected_file in affected_files: + print(f" {affected_file}") + + print(f"\n\n{invalid_file_count} / {file_count} metadata files are invalid, " + "i.e. the file has at least one fatal validation issue.") + + +if __name__ == "__main__": + main() diff --git a/metadata/tests/dependency_metadata_test.py b/metadata/tests/dependency_metadata_test.py index 178497193..2a0dc91f7 100644 --- a/metadata/tests/dependency_metadata_test.py +++ b/metadata/tests/dependency_metadata_test.py @@ -38,8 +38,7 @@ class DependencyValidationTest(unittest.TestCase): ) self.assertEqual(len(results), 1) self.assertTrue(isinstance(results[0], vr.ValidationError)) - self.assertDictEqual(results[0].get_all_tags(), - {"reason": "repeated field"}) + self.assertEqual(results[0].get_reason(), "There is a repeated field.") def test_required_field(self): """Check that a validation error is returned for a missing field.""" @@ -58,8 +57,8 @@ class DependencyValidationTest(unittest.TestCase): ) self.assertEqual(len(results), 1) self.assertTrue(isinstance(results[0], vr.ValidationError)) - self.assertDictEqual(results[0].get_all_tags(), - {"reason": "missing required field"}) + self.assertEqual(results[0].get_reason(), + "Required field 'URL' is missing.") def test_invalid_field(self): """Check field validation issues are returned.""" @@ -78,8 +77,7 @@ class DependencyValidationTest(unittest.TestCase): ) self.assertEqual(len(results), 1) self.assertTrue(isinstance(results[0], vr.ValidationError)) - self.assertDictEqual(results[0].get_all_tags(), - {"field": known_fields.SECURITY_CRITICAL.get_name()}) + self.assertEqual(results[0].get_reason(), "Security Critical is invalid.") def test_invalid_license_file_path(self): """Check license file path validation issues are returned.""" @@ -99,8 +97,7 @@ class DependencyValidationTest(unittest.TestCase): ) self.assertEqual(len(results), 1) self.assertTrue(isinstance(results[0], vr.ValidationError)) - self.assertDictEqual(results[0].get_all_tags(), - {"field": known_fields.LICENSE_FILE.get_name()}) + self.assertEqual(results[0].get_reason(), "License File is invalid.") def test_multiple_validation_issues(self): """Check all validation issues are returned.""" diff --git a/metadata/tests/fields_test.py b/metadata/tests/fields_test.py index 8ce04efed..a1756990e 100644 --- a/metadata/tests/fields_test.py +++ b/metadata/tests/fields_test.py @@ -70,7 +70,11 @@ class FieldValidationTest(unittest.TestCase): self._run_field_validation( field=known_fields.CPE_PREFIX, valid_values=[ - "unknown", "cpe:/a:sqlite:sqlite:3.0.0", "cpe:/a:sqlite:sqlite" + "unknown", + "Cpe:2.3:a:sqlite:sqlite:3.0.0", + "cpe:2.3:a:sqlite:sqlite", + "CPE:/a:sqlite:sqlite:3.0.0", + "cpe:/a:sqlite:sqlite", ], error_values=["", "\n"], ) diff --git a/metadata/validate.py b/metadata/validate.py index 960c5e489..4f25b6a57 100644 --- a/metadata/validate.py +++ b/metadata/validate.py @@ -36,8 +36,7 @@ def validate_content(content: str, source_file_dir: str, results = [] dependencies = metadata.parse.parse_content(content) if not dependencies: - result = vr.ValidationError("No dependency metadata found.") - result.set_tag("reason", "no metadata") + result = vr.ValidationError(reason="No dependency metadata found.") return [result] for dependency in dependencies: @@ -49,8 +48,9 @@ def validate_content(content: str, source_file_dir: str, def _construct_file_read_error(filepath: str, cause: str) -> vr.ValidationError: """Helper function to create a validation error for a file reading issue.""" - result = vr.ValidationError(f"Cannot read '{filepath}' - {cause}.") - result.set_tag(tag="reason", value="read error") + result = vr.ValidationError( + reason="Cannot read metadata file.", + additional=[f"Attempted to read '{filepath}' but {cause}."]) return result diff --git a/metadata/validation_result.py b/metadata/validation_result.py index 45a1d1dca..23da20382 100644 --- a/metadata/validation_result.py +++ b/metadata/validation_result.py @@ -4,7 +4,7 @@ # found in the LICENSE file. import textwrap -from typing import Dict, Union +from typing import Dict, List, Union _CHROMIUM_METADATA_PRESCRIPT = "Third party metadata issue:" @@ -14,13 +14,23 @@ _CHROMIUM_METADATA_POSTSCRIPT = ("Check //third_party/README.chromium.template " class ValidationResult: """Base class for validation issues.""" - def __init__(self, message: str, fatal: bool): + def __init__(self, reason: str, fatal: bool, additional: List[str] = []): + """Constructor for a validation issue. + + Args: + reason: the root cause of the issue. + fatal: whether the issue is fatal. + additional: details that should be included in the validation message, + e.g. advice on how to address the issue, or specific + problematic values. + """ + self._reason = reason self._fatal = fatal - self._message = message + self._message = " ".join([reason] + additional) self._tags = {} def __str__(self) -> str: - prefix = "ERROR" if self._fatal else "[non-fatal]" + prefix = self.get_severity_prefix() return f"{prefix} - {self._message}" def __repr__(self) -> str: @@ -29,6 +39,14 @@ class ValidationResult: def is_fatal(self) -> bool: return self._fatal + def get_severity_prefix(self): + if self._fatal: + return "ERROR" + return "[non-fatal]" + + def get_reason(self) -> str: + return self._reason + def set_tag(self, tag: str, value: str) -> bool: self._tags[tag] = value @@ -54,11 +72,11 @@ class ValidationResult: class ValidationError(ValidationResult): """Fatal validation issue. Presubmit should fail.""" - def __init__(self, message: str): - super().__init__(message=message, fatal=True) + def __init__(self, reason: str, additional: List[str] = []): + super().__init__(reason=reason, fatal=True, additional=additional) class ValidationWarning(ValidationResult): """Non-fatal validation issue. Presubmit should pass.""" - def __init__(self, message: str): - super().__init__(message=message, fatal=False) + def __init__(self, reason: str, additional: List[str] = []): + super().__init__(reason=reason, fatal=False, additional=additional)