[ssci] Script to run validation on all metadata files

Adds script metadata/scan.py which can be used to search for and
validate Chromium dependency metadata files, given a repository
root directory.

Bug: b:277147404
Change-Id: Ibde0eeb7babe0b1e3f9c7f887bece629d390974a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/4823596
Commit-Queue: Anne Redulla <aredulla@google.com>
Reviewed-by: Rachael Newitt <renewitt@google.com>
parent e36c6bba14
commit 427f0f43ad

@ -1,4 +1,18 @@
# Validation for Chromium's Third Party Metadata Files
This directory contains the code to validate Chromium's third party metadata
This directory contains the code to validate Chromium third party metadata
files, i.e. `README.chromium` files.
## Prerequisites
1. Have the Chromium source code
[checked out](https://chromium.googlesource.com/chromium/src/+/main/docs/#checking-out-and-building) on disk
1. Ensure you've run `gclient runhooks` on your source checkout
## Run
`metadata/scan.py` can be used to search for and validate all Chromium third
party metadata files within a repository. For example, if your `chromium/src`
checkout is at `~/my/path/to/chromium/src`, run the following command from the
root directory of `depot_tools`:
```
vpython3 --vpython-spec=.vpython3 metadata/scan.py ~/my/path/to/chromium/src
```

@ -118,9 +118,10 @@ class DependencyMetadata:
]
if repeated_field_info:
repeated = ", ".join(repeated_field_info)
error = vr.ValidationError(
f"Multiple entries for the same field: {repeated}.")
error.set_tag(tag="reason", value="repeated field")
error = vr.ValidationError(reason="There is a repeated field.",
additional=[
f"Repeated fields: {repeated}",
])
results.append(error)
# Check required fields are present.
@ -128,8 +129,8 @@ class DependencyMetadata:
for field in required_fields:
if field not in self._metadata:
field_name = field.get_name()
error = vr.ValidationError(f"Required field '{field_name}' is missing.")
error.set_tag(tag="reason", value="missing required field")
error = vr.ValidationError(
reason=f"Required field '{field_name}' is missing.")
results.append(error)
# Validate values for all present fields.

@ -3,6 +3,7 @@
# found in the LICENSE file.
import os
from typing import List
# The base names that are known to be Chromium metadata files.
_METADATA_FILES = {
@ -13,3 +14,25 @@ _METADATA_FILES = {
def is_metadata_file(path: str) -> bool:
"""Filter for metadata files."""
return os.path.basename(path) in _METADATA_FILES
def find_metadata_files(root: str) -> List[str]:
"""Finds all metadata files within the given root directory, including
subdirectories.
Args:
root: the absolute path to the root directory within which to search.
Returns:
the absolute full paths for all the metadata files within the root
directory.
"""
metadata_files = []
for item in os.listdir(root):
full_path = os.path.join(root, item)
if is_metadata_file(item):
metadata_files.append(full_path)
elif os.path.isdir(full_path):
metadata_files.extend(find_metadata_files(full_path))
return metadata_files
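For context, here is a minimal usage sketch (not part of the change) of how this helper might be called. The checkout path is hypothetical, and the `metadata.discover` import assumes the same repo-root-on-`sys.path` layout that `scan.py` uses below.

```
import os

import metadata.discover

# Hypothetical checkout location, for illustration only.
src_dir = os.path.expanduser("~/chromium/src")

# find_metadata_files() returns absolute paths to every README.chromium-style
# file under the given root, so the results can be fed straight to validation.
for path in metadata.discover.find_metadata_files(src_dir):
    print(os.path.relpath(path, start=src_dir))
```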

@ -19,7 +19,7 @@ import metadata.fields.field_types as field_types
import metadata.fields.util as util
import metadata.validation_result as vr
_PATTERN_CPE_PREFIX = re.compile(r"^cpe:/.+:.+:.+(:.+)*$")
_PATTERN_CPE_PREFIX = re.compile(r"^cpe:(2.3:|/).+:.+:.+(:.+)*$", re.IGNORECASE)
class CPEPrefixField(field_types.MetadataField):
@ -28,12 +28,16 @@ class CPEPrefixField(field_types.MetadataField):
super().__init__(name="CPEPrefix", one_liner=True)
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
"""Checks the given value is either 'unknown', or a valid
CPE in the URI form `cpe:/<part>:<vendor>:<product>[:<optional fields>]`.
"""Checks the given value is either 'unknown', or conforms to either the
CPE 2.3 or 2.2 format.
"""
if util.is_unknown(value) or util.matches(_PATTERN_CPE_PREFIX, value):
return None
return vr.ValidationError(
f"{self._name} is '{value}' - must be either 'unknown', or in the form "
"'cpe:/<part>:<vendor>:<product>[:<optional fields>]'.")
reason=f"{self._name} is invalid.",
additional=[
"This field should be a CPE (version 2.3 or 2.2), or 'unknown'.",
"Search for a CPE tag for the package at "
"https://nvd.nist.gov/products/cpe/search.",
])
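As a quick sanity check (not part of the change), the widened pattern accepts both the CPE 2.3 formatted-string prefix and the 2.2 URI form, case-insensitively; the sample values below mirror the test cases added further down.

```
import re

_PATTERN_CPE_PREFIX = re.compile(r"^cpe:(2.3:|/).+:.+:.+(:.+)*$", re.IGNORECASE)

# CPE 2.3 formatted string and CPE 2.2 URI form both match.
assert _PATTERN_CPE_PREFIX.match("cpe:2.3:a:sqlite:sqlite:3.0.0")
assert _PATTERN_CPE_PREFIX.match("CPE:/a:sqlite:sqlite")
# A bare package name does not.
assert not _PATTERN_CPE_PREFIX.match("sqlite")
```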

@ -32,5 +32,8 @@ class DateField(field_types.MetadataField):
if util.matches(_PATTERN_DATE, value):
return None
return vr.ValidationError(
f"{self._name} is '{value}' - must use format YYYY-MM-DD.")
return vr.ValidationError(reason=f"{self._name} is invalid.",
additional=[
"The correct format is YYYY-MM-DD.",
f"Current value is '{value}'.",
])

@ -108,24 +108,22 @@ class LicenseField(field_types.MetadataField):
atomic_delimiter=self.VALUE_DELIMITER)
for license, allowed in licenses:
if util.is_empty(license):
return vr.ValidationError(f"{self._name} has an empty value.")
return vr.ValidationError(reason=f"{self._name} has an empty value.")
if not allowed:
not_allowlisted.append(license)
if not_allowlisted:
template = ("{field_name} has licenses not in the allowlist. If "
"there are multiple license types, separate them with a "
"'{delim}'. Invalid values: {values}.")
message = template.format(field_name=self._name,
delim=self.VALUE_DELIMITER,
values=util.quoted(not_allowlisted))
return vr.ValidationWarning(message)
return vr.ValidationWarning(
reason=f"{self._name} has a license not in the allowlist.",
additional=[
f"Separate licenses using a '{self.VALUE_DELIMITER}'.",
f"Licenses not allowlisted: {util.quoted(not_allowlisted)}.",
])
# Suggest using the standard value delimiter when possible.
if (re.search(_PATTERN_VERBOSE_DELIMITER, value)
and self.VALUE_DELIMITER not in value):
return vr.ValidationWarning(
f"{self._name} should use '{self.VALUE_DELIMITER}' to delimit "
"values.")
reason=f"Separate licenses using a '{self.VALUE_DELIMITER}'.")
return None

@ -42,7 +42,10 @@ class LicenseFileField(field_types.MetadataField):
"""
if value == _NOT_SHIPPED:
return vr.ValidationWarning(
f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.")
reason=f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.",
additional=[
f"Remove this field and use 'Shipped: {util.NO}' instead.",
])
invalid_values = []
for path in value.split(self.VALUE_DELIMITER):
@ -51,13 +54,13 @@ class LicenseFileField(field_types.MetadataField):
invalid_values.append(path)
if invalid_values:
template = ("{field_name} has invalid values. Paths cannot be empty, "
"or include '../'. If there are multiple license files, "
"separate them with a '{delim}'. Invalid values: {values}.")
message = template.format(field_name=self._name,
delim=self.VALUE_DELIMITER,
values=util.quoted(invalid_values))
return vr.ValidationError(message)
return vr.ValidationError(
reason=f"{self._name} is invalid.",
additional=[
"File paths cannot be empty, or include '../'.",
f"Separate license files using a '{self.VALUE_DELIMITER}'.",
f"Invalid values: {util.quoted(invalid_values)}.",
])
return None
@ -84,7 +87,10 @@ class LicenseFileField(field_types.MetadataField):
"""
if value == _NOT_SHIPPED:
return vr.ValidationWarning(
f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.")
reason=f"{self._name} uses deprecated value '{_NOT_SHIPPED}'.",
additional=[
f"Remove this field and use 'Shipped: {util.NO}' instead.",
])
invalid_values = []
for license_filename in value.split(self.VALUE_DELIMITER):
@ -100,10 +106,12 @@ class LicenseFileField(field_types.MetadataField):
invalid_values.append(license_filepath)
if invalid_values:
template = ("{field_name} has invalid values. Failed to find file(s) on"
"local disk. Invalid values: {values}.")
message = template.format(field_name=self._name,
values=util.quoted(invalid_values))
return vr.ValidationError(message)
missing = ", ".join(invalid_values)
return vr.ValidationError(
reason=f"{self._name} is invalid.",
additional=[
"Failed to find all license files on local disk.",
f"Missing files:{missing}.",
])
return None

@ -44,13 +44,12 @@ class URLField(field_types.MetadataField):
invalid_values.append(url)
if invalid_values:
template = ("{field_name} has invalid values. URLs must use a protocol "
"scheme in [http, https, ftp, git]. If there are multiple "
"URLs, separate them with a '{delim}'. Invalid values: "
"{values}.")
message = template.format(field_name=self._name,
delim=self.VALUE_DELIMITER,
values=util.quoted(invalid_values))
return vr.ValidationError(message)
return vr.ValidationError(
reason=f"{self._name} is invalid.",
additional=[
"URLs must use a protocol scheme in [http, https, ftp, git].",
f"Separate URLs using a '{self.VALUE_DELIMITER}'.",
f"Invalid values: {util.quoted(invalid_values)}.",
])
return None

@ -40,12 +40,18 @@ class VersionField(field_types.MetadataField):
"""
if value == "0" or util.is_unknown(value):
return vr.ValidationWarning(
f"{self._name} is '{value}' - use 'N/A' if this package does not "
"version or is versioned by date or revision.")
reason=f"{self._name} is '{value}'.",
additional=[
"Set this field to 'N/A' if this package does not version or is "
"versioned by date or revision.",
])
if util.is_empty(value):
return vr.ValidationError(
f"{self._name} is empty - use 'N/A' if this package is versioned by "
"date or revision.")
reason=f"{self._name} is empty.",
additional=[
"Set this field to 'N/A' if this package does not version or is "
"versioned by date or revision.",
])
return None

@ -66,7 +66,7 @@ class FreeformTextField(MetadataField):
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
"""Checks the given value has at least one non-whitespace character."""
if util.is_empty(value):
return vr.ValidationError(f"{self._name} is empty.")
return vr.ValidationError(reason=f"{self._name} is empty.")
return None
@ -83,8 +83,15 @@ class YesNoField(MetadataField):
if util.matches(_PATTERN_STARTS_WITH_YES_OR_NO, value):
return vr.ValidationWarning(
f"{self._name} is '{value}' - should be only {util.YES} or {util.NO}."
)
reason=f"{self._name} is invalid.",
additional=[
f"This field should be only {util.YES} or {util.NO}.",
f"Current value is '{value}'.",
])
return vr.ValidationError(
f"{self._name} is '{value}' - must be {util.YES} or {util.NO}.")
reason=f"{self._name} is invalid.",
additional=[
f"This field must be {util.YES} or {util.NO}.",
f"Current value is '{value}'.",
])

@ -0,0 +1,93 @@
#!/usr/bin/env python3
# Copyright 2023 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import print_function
import argparse
from collections import defaultdict
import os
import sys
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
_ROOT_DIR = os.path.abspath(os.path.join(_THIS_DIR, ".."))
# Add the repo's root directory for clearer imports.
sys.path.insert(0, _ROOT_DIR)
import metadata.discover
import metadata.validate
def parse_args() -> argparse.Namespace:
"""Helper to parse args to this script."""
parser = argparse.ArgumentParser()
repo_root_dir = parser.add_argument(
"repo_root_dir",
help=("The path to the repository's root directory, which will be "
"scanned for Chromium metadata files, e.g. '~/chromium/src'."),
)
args = parser.parse_args()
# Check the repo root directory exists.
src_dir = os.path.abspath(args.repo_root_dir)
if not os.path.exists(src_dir) or not os.path.isdir(src_dir):
raise argparse.ArgumentError(
repo_root_dir,
f"Invalid repository root directory '{src_dir}' - not found",
)
return args
def main() -> None:
"""Runs validation on all metadata files within the directory specified by the
repo_root_dir arg.
"""
config = parse_args()
src_dir = os.path.abspath(config.repo_root_dir)
metadata_files = metadata.discover.find_metadata_files(src_dir)
file_count = len(metadata_files)
print(f"Found {file_count} metadata files.")
invalid_file_count = 0
# Key is constructed from the result severity and reason;
# Value is a list of files affected by that reason at that severity.
all_reasons = defaultdict(list)
for filepath in metadata_files:
file_results = metadata.validate.validate_file(filepath,
repo_root_dir=src_dir)
invalid = False
if file_results:
relpath = os.path.relpath(filepath, start=src_dir)
print(f"\n{len(file_results)} problem(s) in {relpath}:")
for result in file_results:
print(f" {result}")
summary_key = "{severity} - {reason}".format(
severity=result.get_severity_prefix(), reason=result.get_reason())
all_reasons[summary_key].append(relpath)
if result.is_fatal():
invalid = True
if invalid:
invalid_file_count += 1
print("\n\nDone.\nSummary:")
for summary_key, affected_files in all_reasons.items():
count = len(affected_files)
plural = "s" if count > 1 else ""
print(f"\n {count} file{plural}: {summary_key}")
for affected_file in affected_files:
print(f" {affected_file}")
print(f"\n\n{invalid_file_count} / {file_count} metadata files are invalid, "
"i.e. the file has at least one fatal validation issue.")
if __name__ == "__main__":
main()

@ -38,8 +38,7 @@ class DependencyValidationTest(unittest.TestCase):
)
self.assertEqual(len(results), 1)
self.assertTrue(isinstance(results[0], vr.ValidationError))
self.assertDictEqual(results[0].get_all_tags(),
{"reason": "repeated field"})
self.assertEqual(results[0].get_reason(), "There is a repeated field.")
def test_required_field(self):
"""Check that a validation error is returned for a missing field."""
@ -58,8 +57,8 @@ class DependencyValidationTest(unittest.TestCase):
)
self.assertEqual(len(results), 1)
self.assertTrue(isinstance(results[0], vr.ValidationError))
self.assertDictEqual(results[0].get_all_tags(),
{"reason": "missing required field"})
self.assertEqual(results[0].get_reason(),
"Required field 'URL' is missing.")
def test_invalid_field(self):
"""Check field validation issues are returned."""
@ -78,8 +77,7 @@ class DependencyValidationTest(unittest.TestCase):
)
self.assertEqual(len(results), 1)
self.assertTrue(isinstance(results[0], vr.ValidationError))
self.assertDictEqual(results[0].get_all_tags(),
{"field": known_fields.SECURITY_CRITICAL.get_name()})
self.assertEqual(results[0].get_reason(), "Security Critical is invalid.")
def test_invalid_license_file_path(self):
"""Check license file path validation issues are returned."""
@ -99,8 +97,7 @@ class DependencyValidationTest(unittest.TestCase):
)
self.assertEqual(len(results), 1)
self.assertTrue(isinstance(results[0], vr.ValidationError))
self.assertDictEqual(results[0].get_all_tags(),
{"field": known_fields.LICENSE_FILE.get_name()})
self.assertEqual(results[0].get_reason(), "License File is invalid.")
def test_multiple_validation_issues(self):
"""Check all validation issues are returned."""

@ -70,7 +70,11 @@ class FieldValidationTest(unittest.TestCase):
self._run_field_validation(
field=known_fields.CPE_PREFIX,
valid_values=[
"unknown", "cpe:/a:sqlite:sqlite:3.0.0", "cpe:/a:sqlite:sqlite"
"unknown",
"Cpe:2.3:a:sqlite:sqlite:3.0.0",
"cpe:2.3:a:sqlite:sqlite",
"CPE:/a:sqlite:sqlite:3.0.0",
"cpe:/a:sqlite:sqlite",
],
error_values=["", "\n"],
)

@ -36,8 +36,7 @@ def validate_content(content: str, source_file_dir: str,
results = []
dependencies = metadata.parse.parse_content(content)
if not dependencies:
result = vr.ValidationError("No dependency metadata found.")
result.set_tag("reason", "no metadata")
result = vr.ValidationError(reason="No dependency metadata found.")
return [result]
for dependency in dependencies:
@ -49,8 +48,9 @@ def validate_content(content: str, source_file_dir: str,
def _construct_file_read_error(filepath: str, cause: str) -> vr.ValidationError:
"""Helper function to create a validation error for a file reading issue."""
result = vr.ValidationError(f"Cannot read '{filepath}' - {cause}.")
result.set_tag(tag="reason", value="read error")
result = vr.ValidationError(
reason="Cannot read metadata file.",
additional=[f"Attempted to read '{filepath}' but {cause}."])
return result

@ -4,7 +4,7 @@
# found in the LICENSE file.
import textwrap
from typing import Dict, Union
from typing import Dict, List, Union
_CHROMIUM_METADATA_PRESCRIPT = "Third party metadata issue:"
@ -14,13 +14,23 @@ _CHROMIUM_METADATA_POSTSCRIPT = ("Check //third_party/README.chromium.template "
class ValidationResult:
"""Base class for validation issues."""
def __init__(self, message: str, fatal: bool):
def __init__(self, reason: str, fatal: bool, additional: List[str] = []):
"""Constructor for a validation issue.
Args:
reason: the root cause of the issue.
fatal: whether the issue is fatal.
additional: details that should be included in the validation message,
e.g. advice on how to address the issue, or specific
problematic values.
"""
self._reason = reason
self._fatal = fatal
self._message = message
self._message = " ".join([reason] + additional)
self._tags = {}
def __str__(self) -> str:
prefix = "ERROR" if self._fatal else "[non-fatal]"
prefix = self.get_severity_prefix()
return f"{prefix} - {self._message}"
def __repr__(self) -> str:
@ -29,6 +39,14 @@ class ValidationResult:
def is_fatal(self) -> bool:
return self._fatal
def get_severity_prefix(self):
if self._fatal:
return "ERROR"
return "[non-fatal]"
def get_reason(self) -> str:
return self._reason
def set_tag(self, tag: str, value: str) -> bool:
self._tags[tag] = value
@ -54,11 +72,11 @@ class ValidationResult:
class ValidationError(ValidationResult):
"""Fatal validation issue. Presubmit should fail."""
def __init__(self, message: str):
super().__init__(message=message, fatal=True)
def __init__(self, reason: str, additional: List[str] = []):
super().__init__(reason=reason, fatal=True, additional=additional)
class ValidationWarning(ValidationResult):
"""Non-fatal validation issue. Presubmit should pass."""
def __init__(self, message: str):
super().__init__(message=message, fatal=False)
def __init__(self, reason: str, additional: List[str] = []):
super().__init__(reason=reason, fatal=False, additional=additional)
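To illustrate how the reworked constructor and accessors fit together, here is a hedged sketch (the reason and detail strings are made up) of building and printing a result:

```
import metadata.validation_result as vr

error = vr.ValidationError(
    reason="License File is invalid.",
    additional=["File paths cannot be empty, or include '../'."])

# The reason stays queryable on its own, while __str__ joins the reason and
# the additional details behind the severity prefix.
assert error.is_fatal()
assert error.get_reason() == "License File is invalid."
print(error)
# ERROR - License File is invalid. File paths cannot be empty, or include '../'.
```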
