diff --git a/metadata/fields/field_types.py b/metadata/fields/field_types.py
index 609bc0edc1..4ff7f3457d 100644
--- a/metadata/fields/field_types.py
+++ b/metadata/fields/field_types.py
@@ -33,9 +33,13 @@ class MetadataField:
     # The delimiter used to separate multiple values.
     VALUE_DELIMITER = ","
 
-    def __init__(self, name: str, one_liner: bool = True):
+    def __init__(self,
+                 name: str,
+                 one_liner: bool = True,
+                 structured: bool = True):
         self._name = name
         self._one_liner = one_liner
+        self._structured = structured
 
     def __eq__(self, other):
         if not isinstance(other, MetadataField):
@@ -52,6 +56,17 @@ class MetadataField:
     def is_one_liner(self):
         return self._one_liner
 
+    def is_structured(self):
+        """Returns whether the field holds structured data (e.g. a URL list).
+
+        If true, the parser also treats any subsequent line that looks like
+        `Field Name Like Pattern:` as the start of a new field, in addition
+        to lines starting with a known field name.
+        If false, the parser only recognizes known field names, and lines
+        with unknown field names are merged into this field's value.
+        """
+        return self._structured
+
     def validate(self, value: str) -> Union[vr.ValidationResult, None]:
         """Checks the given value is acceptable for the field.
 
diff --git a/metadata/fields/known.py b/metadata/fields/known.py
index aa065ff9ce..0731c303c3 100644
--- a/metadata/fields/known.py
+++ b/metadata/fields/known.py
@@ -26,9 +26,12 @@ import metadata.fields.field_types as field_types
 NAME = field_types.FreeformTextField("Name")
 SHORT_NAME = field_types.FreeformTextField("Short Name")
 REVISION = field_types.FreeformTextField("Revision")
-DESCRIPTION = field_types.FreeformTextField("Description", one_liner=False)
+DESCRIPTION = field_types.FreeformTextField("Description",
+                                            one_liner=False,
+                                            structured=False)
 LOCAL_MODIFICATIONS = field_types.FreeformTextField("Local Modifications",
-                                                    one_liner=False)
+                                                    one_liner=False,
+                                                    structured=False)
 
 # Yes/no fields.
 SECURITY_CRITICAL = field_types.YesNoField("Security Critical")
diff --git a/metadata/parse.py b/metadata/parse.py
index 008a31b909..1840f790a7 100644
--- a/metadata/parse.py
+++ b/metadata/parse.py
@@ -24,9 +24,20 @@ DEPENDENCY_DIVIDER = re.compile(r"^-{20} DEPENDENCY DIVIDER -{20}$")
 # Delimiter used to separate a field's name from its value.
 FIELD_DELIMITER = ":"
 
+# Heuristic for detecting unknown field names: one or more words (each an
+# uppercase letter followed by at least one word character) separated by
+# single spaces, then the field delimiter, then whitespace or end of line.
+_PATTERN_FIELD_NAME_WORD_HEURISTIC = r"[A-Z]\w+"
+_PATTERN_FIELD_NAME_HEURISTIC = re.compile(r"^({}(?: {})*){}(?:\s|$)".format(
+    _PATTERN_FIELD_NAME_WORD_HEURISTIC, _PATTERN_FIELD_NAME_WORD_HEURISTIC,
+    FIELD_DELIMITER))
+# Whether content is treated as structured when no field is active or when
+# the active field is not a known field.
+_DEFAULT_TO_STRUCTURED_TEXT = False
+
 # Pattern used to check if a line from a metadata file declares a new
 # field.
-_PATTERN_FIELD_DECLARATION = re.compile(
+_PATTERN_KNOWN_FIELD_DECLARATION = re.compile(
     "^({}){}".format("|".join(known_fields.ALL_FIELD_NAMES), FIELD_DELIMITER),
     re.IGNORECASE)
 
@@ -44,6 +55,7 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
     current_metadata = dm.DependencyMetadata()
     current_field_name = None
     current_field_value = ""
+    current_field_is_structured = _DEFAULT_TO_STRUCTURED_TEXT
     for line in content.splitlines(keepends=True):
         # Check if a new dependency is being described.
         if DEPENDENCY_DIVIDER.match(line):
@@ -59,8 +71,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
             current_metadata = dm.DependencyMetadata()
             current_field_name = None
             current_field_value = ""
+            current_field_is_structured = _DEFAULT_TO_STRUCTURED_TEXT
 
-        elif _PATTERN_FIELD_DECLARATION.match(line):
+        elif (_PATTERN_KNOWN_FIELD_DECLARATION.match(line)
+              or (current_field_is_structured
+                  and _PATTERN_FIELD_NAME_HEURISTIC.match(line))):
             # Save the field value to the current dependency's metadata.
             if current_field_name:
                 current_metadata.add_entry(current_field_name,
@@ -69,6 +84,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
             current_field_name, current_field_value = line.split(
                 FIELD_DELIMITER, 1)
             field = known_fields.get_field(current_field_name)
+
+            # Treats unknown fields as `_DEFAULT_TO_STRUCTURED_TEXT`.
+            current_field_is_structured = field.is_structured(
+            ) if field else _DEFAULT_TO_STRUCTURED_TEXT
+
             if field and field.is_one_liner():
                 # The field should be on one line, so add it now.
                 current_metadata.add_entry(current_field_name,
diff --git a/metadata/tests/data/README.chromium.test.single-valid b/metadata/tests/data/README.chromium.test.single-valid
index e53b57651d..119f9a0564 100644
--- a/metadata/tests/data/README.chromium.test.single-valid
+++ b/metadata/tests/data/README.chromium.test.single-valid
@@ -2,6 +2,9 @@ Name: Test-A README for Chromium metadata
 Short Name: metadata-test-valid
 URL: https://www.example.com/metadata,
  https://www.example.com/parser
+Unknown Field: Should be extracted into a field, because the preceding URL
+ field is structured, thus terminated by another field-like
+ line, even if the field name isn't well known to us.
 Version: 1.0.12
 Date: 2020-12-03
 License: Apache, 2.0 and MIT
diff --git a/metadata/tests/parse_test.py b/metadata/tests/parse_test.py
index eddb907de2..bfaf99c6ce 100644
--- a/metadata/tests/parse_test.py
+++ b/metadata/tests/parse_test.py
@@ -34,6 +34,11 @@ class ParseTest(unittest.TestCase):
                 ("Short Name", "metadata-test-valid"),
                 ("URL", "https://www.example.com/metadata,\n"
                  " https://www.example.com/parser"),
+                ("Unknown Field",
+                 "Should be extracted into a field, because the preceding URL\n"
+                 " field is structured, thus terminated by another field-like\n"
+                 " line, even if the field name isn't well known to us."
+                 ),
                 ("Version", "1.0.12"),
                 ("Date", "2020-12-03"),
                 ("License", "Apache, 2.0 and MIT"),