diff --git a/metadata/fields/custom/date.py b/metadata/fields/custom/date.py
index 213469d8e..07096d7d4 100644
--- a/metadata/fields/custom/date.py
+++ b/metadata/fields/custom/date.py
@@ -3,8 +3,8 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+import datetime
 import os
-import re
 import sys
 from typing import Union
 
@@ -16,10 +16,56 @@ _ROOT_DIR = os.path.abspath(os.path.join(_THIS_DIR, "..", "..", ".."))
 sys.path.insert(0, _ROOT_DIR)
 
 import metadata.fields.field_types as field_types
-import metadata.fields.util as util
 import metadata.validation_result as vr
 
-_PATTERN_DATE = re.compile(r"^\d{4}-(0|1)\d-[0-3]\d$")
+# The preferred date format for the start of date values.
+_PREFERRED_PREFIX_FORMAT = "%Y-%m-%d"
+
+# Formats for the start of date values that are recognized as
+# alternative date formats.
+_RECOGNIZED_PREFIX_FORMATS = (
+    "%d-%m-%Y",
+    "%m-%d-%Y",
+    "%d-%m-%y",
+    "%m-%d-%y",
+    "%d/%m/%Y",
+    "%m/%d/%Y",
+    "%d/%m/%y",
+    "%m/%d/%y",
+    "%d.%m.%Y",
+    "%m.%d.%Y",
+    "%d.%m.%y",
+    "%m.%d.%y",
+    "%Y/%m/%d",
+    "%Y.%m.%d",
+    "%Y%m%d",
+)
+
+# Formats recognized as alternative date formats (entire value must
+# match).
+_RECOGNIZED_DATE_FORMATS = (
+    "%d %b %Y",
+    "%d %b, %Y",
+    "%b %d %Y",
+    "%b %d, %Y",
+    "%Y %b %d",
+    "%d %B %Y",
+    "%d %B, %Y",
+    "%B %d %Y",
+    "%B %d, %Y",
+    "%Y %B %d",
+    "%a %b %d %H:%M:%S %Y",
+    "%a %b %d %H:%M:%S %Y %z",
+)
+
+
+def format_matches(value: str, date_format: str) -> bool:
+    """Returns whether the given value matches the date format."""
+    try:
+        datetime.datetime.strptime(value, date_format)
+    except ValueError:
+        return False
+    return True
 
 
 class DateField(field_types.MetadataField):
@@ -29,11 +75,38 @@ class DateField(field_types.MetadataField):
 
     def validate(self, value: str) -> Union[vr.ValidationResult, None]:
         """Checks the given value is a YYYY-MM-DD date."""
-        if util.matches(_PATTERN_DATE, value):
+        value = value.strip()
+        if not value:
+            return vr.ValidationError(
+                reason=f"{self._name} is empty.",
+                additional=["Provide date in format YYYY-MM-DD."])
+
+        # Check if the first part (to ignore timezone info) uses the
+        # preferred format.
+        parts = value.split()
+        if format_matches(parts[0], _PREFERRED_PREFIX_FORMAT):
             return None
 
-        return vr.ValidationError(reason=f"{self._name} is invalid.",
-                                  additional=[
-                                      "The correct format is YYYY-MM-DD.",
-                                      f"Current value is '{value}'.",
-                                  ])
+        # Check if the first part (to ignore timezone info) uses a
+        # recognized format.
+        for prefix_format in _RECOGNIZED_PREFIX_FORMATS:
+            if format_matches(parts[0], prefix_format):
+                return vr.ValidationWarning(
+                    reason=f"{self._name} is not in the preferred format.",
+                    additional=[
+                        "Use YYYY-MM-DD.", f"Current value is '{value}'."
+                    ])
+
+        # Check the entire value for recognized date formats.
+        for date_format in _RECOGNIZED_DATE_FORMATS:
+            if format_matches(value, date_format):
+                return vr.ValidationWarning(
+                    reason=f"{self._name} is not in the preferred format.",
+                    additional=[
+                        "Use YYYY-MM-DD.", f"Current value is '{value}'."
+                    ])
+
+        # Return an error as the value's format was not recognized.
+        return vr.ValidationError(
+            reason=f"{self._name} is invalid.",
+            additional=["Use YYYY-MM-DD.", f"Current value is '{value}'."])
diff --git a/metadata/scan.py b/metadata/scan.py
index 9fac9e3fd..aba59f237 100644
--- a/metadata/scan.py
+++ b/metadata/scan.py
@@ -55,8 +55,10 @@ def main() -> None:
     invalid_file_count = 0
 
     # Key is constructed from the result severity and reason;
-    # Value is a list of files affected by that reason at that severity.
-    all_reasons = defaultdict(list)
+    # Value is a dict for:
+    # * list of files affected by that reason at that severity; and
+    # * set of unique validation result strings for that reason and severity.
+    all_reasons = defaultdict(lambda: {"files": [], "results": set()})
     for filepath in metadata_files:
         file_results = metadata.validate.validate_file(filepath,
                                                        repo_root_dir=src_dir)
@@ -69,21 +71,34 @@ def main() -> None:
             summary_key = "{severity} - {reason}".format(
                 severity=result.get_severity_prefix(),
                 reason=result.get_reason())
-            all_reasons[summary_key].append(relpath)
+            all_reasons[summary_key]["files"].append(relpath)
+            all_reasons[summary_key]["results"].add(str(result))
             if result.is_fatal():
                 invalid = True
 
         if invalid:
             invalid_file_count += 1
 
-    print("\n\nDone.\nSummary:")
-    for summary_key, affected_files in all_reasons.items():
+    print("\n\nDone.")
+
+    print("\nSummary of files:")
+    for summary_key, data in all_reasons.items():
+        affected_files = data["files"]
         count = len(affected_files)
         plural = "s" if count > 1 else ""
         print(f"\n {count} file{plural}: {summary_key}")
         for affected_file in affected_files:
             print(f" {affected_file}")
 
+    print("\nSummary of results:")
+    for summary_key, data in all_reasons.items():
+        results = data["results"]
+        count = len(results)
+        plural = "s" if count > 1 else ""
+        print(f"\n {count} issue{plural}: {summary_key}")
+        for result in sorted(results):
+            print(f" {result}")
+
     print(f"\n\n{invalid_file_count} / {file_count} metadata files are "
           "invalid, i.e. the file has at least one fatal validation issue.")
 
diff --git a/metadata/tests/fields_test.py b/metadata/tests/fields_test.py
index f4cec6807..b491b5615 100644
--- a/metadata/tests/fields_test.py
+++ b/metadata/tests/fields_test.py
@@ -97,8 +97,19 @@ class FieldValidationTest(unittest.TestCase):
     def test_date_validation(self):
         self._run_field_validation(
             field=known_fields.DATE,
-            valid_values=["2012-03-04"],
-            error_values=["", "\n", "April 3, 2012", "2012/03/04"],
+            valid_values=[
+                "2012-03-04", "2012-03-04 UTC", "2012-03-04 UTC+10:00"
+            ],
+            error_values=[
+                "",
+                "\n",
+                "N/A",
+            ],
+            warning_values=[
+                "2012/03/04 UTC+10:00", "20120304", "April 3, 2012",
+                "3 Apr 2012", "03-04-12", "04/03/2012",
+                "Tue Apr 3 05:06:07 2012 +0800"
+            ],
         )
 
     def test_license_validation(self):