metadata: add "structured" field parsing

This CL adds a "structured" concept to the parser. In a structured
field, the parser will proactively look for field-like patterns to
start a new field (even if they aren't known fields).

This mitigates the issue where an unknown field immediately
follows a multi-line text field, such as:

URL: https://example.com
UnknownField: abc

Without this change, the URL field value is parsed as
"https://example.com<newline>UnknownField: abc".

Bug: b/324149233
Change-Id: I54807bd7b242fc14c679483453ade83f8fd20225
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/5379679
Reviewed-by: Anne Redulla <aredulla@google.com>
Commit-Queue: Jiewei Qian <qjw@chromium.org>
changes/79/5379679/3
Jiewei Qian 2 years ago committed by LUCI CQ
parent 2508c6f716
commit d76c4d6045

@ -33,9 +33,13 @@ class MetadataField:
# The delimiter used to separate multiple values. # The delimiter used to separate multiple values.
VALUE_DELIMITER = "," VALUE_DELIMITER = ","
def __init__(self, name: str, one_liner: bool = True): def __init__(self,
name: str,
one_liner: bool = True,
structured: bool = True):
self._name = name self._name = name
self._one_liner = one_liner self._one_liner = one_liner
self._structured = structured
def __eq__(self, other): def __eq__(self, other):
if not isinstance(other, MetadataField): if not isinstance(other, MetadataField):
@ -52,6 +56,17 @@ class MetadataField:
def is_one_liner(self): def is_one_liner(self):
return self._one_liner return self._one_liner
def is_structured(self) -> bool:
    """Whether the field represents structured data, such as a list of
    URLs.

    The parser consults this flag while reading the lines that follow
    this field's declaration:

    If true, parser will treat `Field Name Like Pattern:` in subsequent
    lines as a new field, in addition to all known field names.

    If false, parser will only recognize known field names, and unknown
    fields will be merged into the preceding field value.

    Returns:
        The `structured` flag this field was constructed with.
    """
    return self._structured
def validate(self, value: str) -> Union[vr.ValidationResult, None]: def validate(self, value: str) -> Union[vr.ValidationResult, None]:
"""Checks the given value is acceptable for the field. """Checks the given value is acceptable for the field.

@ -26,9 +26,12 @@ import metadata.fields.field_types as field_types
NAME = field_types.FreeformTextField("Name") NAME = field_types.FreeformTextField("Name")
SHORT_NAME = field_types.FreeformTextField("Short Name") SHORT_NAME = field_types.FreeformTextField("Short Name")
REVISION = field_types.FreeformTextField("Revision") REVISION = field_types.FreeformTextField("Revision")
DESCRIPTION = field_types.FreeformTextField("Description", one_liner=False) DESCRIPTION = field_types.FreeformTextField("Description",
one_liner=False,
structured=False)
LOCAL_MODIFICATIONS = field_types.FreeformTextField("Local Modifications", LOCAL_MODIFICATIONS = field_types.FreeformTextField("Local Modifications",
one_liner=False) one_liner=False,
structured=False)
# Yes/no fields. # Yes/no fields.
SECURITY_CRITICAL = field_types.YesNoField("Security Critical") SECURITY_CRITICAL = field_types.YesNoField("Security Critical")

@ -24,9 +24,16 @@ DEPENDENCY_DIVIDER = re.compile(r"^-{20} DEPENDENCY DIVIDER -{20}$")
# Delimiter used to separate a field's name from its value. # Delimiter used to separate a field's name from its value.
FIELD_DELIMITER = ":" FIELD_DELIMITER = ":"
# Heuristic for detecting unknown field names: one or more words separated
# by single spaces, each starting with an uppercase ASCII letter followed by
# at least one word character, immediately followed by the field delimiter.
_PATTERN_FIELD_NAME_WORD_HEURISTIC = r"[A-Z]\w+"
# The delimiter must be followed by whitespace or by the end of the line.
# NOTE: inside a character class, `\b` is the backspace character rather
# than a word boundary, so `[\b\s]` required a literal whitespace character
# and missed a field-like declaration ending the content with no trailing
# newline. `(?:\s|$)` expresses the intended "whitespace or end" check.
_PATTERN_FIELD_NAME_HEURISTIC = re.compile(r"^({}(?: {})*){}(?:\s|$)".format(
    _PATTERN_FIELD_NAME_WORD_HEURISTIC, _PATTERN_FIELD_NAME_WORD_HEURISTIC,
    FIELD_DELIMITER))
# Structured-ness assumed when no known field definition applies (before the
# first field is seen, and for fields with unknown names).
_DEFAULT_TO_STRUCTURED_TEXT = False
# Pattern used to check if a line from a metadata file declares a new # Pattern used to check if a line from a metadata file declares a new
# field. # field.
_PATTERN_FIELD_DECLARATION = re.compile( _PATTERN_KNOWN_FIELD_DECLARATION = re.compile(
"^({}){}".format("|".join(known_fields.ALL_FIELD_NAMES), FIELD_DELIMITER), "^({}){}".format("|".join(known_fields.ALL_FIELD_NAMES), FIELD_DELIMITER),
re.IGNORECASE) re.IGNORECASE)
@ -44,6 +51,7 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_metadata = dm.DependencyMetadata() current_metadata = dm.DependencyMetadata()
current_field_name = None current_field_name = None
current_field_value = "" current_field_value = ""
current_field_is_structured = _DEFAULT_TO_STRUCTURED_TEXT
for line in content.splitlines(keepends=True): for line in content.splitlines(keepends=True):
# Check if a new dependency is being described. # Check if a new dependency is being described.
if DEPENDENCY_DIVIDER.match(line): if DEPENDENCY_DIVIDER.match(line):
@ -59,8 +67,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_metadata = dm.DependencyMetadata() current_metadata = dm.DependencyMetadata()
current_field_name = None current_field_name = None
current_field_value = "" current_field_value = ""
current_field_is_structured = False
elif _PATTERN_FIELD_DECLARATION.match(line): elif (_PATTERN_KNOWN_FIELD_DECLARATION.match(line)
or (current_field_is_structured
and _PATTERN_FIELD_NAME_HEURISTIC.match(line))):
# Save the field value to the current dependency's metadata. # Save the field value to the current dependency's metadata.
if current_field_name: if current_field_name:
current_metadata.add_entry(current_field_name, current_metadata.add_entry(current_field_name,
@ -69,6 +80,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_field_name, current_field_value = line.split( current_field_name, current_field_value = line.split(
FIELD_DELIMITER, 1) FIELD_DELIMITER, 1)
field = known_fields.get_field(current_field_name) field = known_fields.get_field(current_field_name)
# Treats unknown fields as `_DEFAULT_TO_STRUCTURED_TEXT`.
current_field_is_structured = field.is_structured(
) if field else _DEFAULT_TO_STRUCTURED_TEXT
if field and field.is_one_liner(): if field and field.is_one_liner():
# The field should be on one line, so add it now. # The field should be on one line, so add it now.
current_metadata.add_entry(current_field_name, current_metadata.add_entry(current_field_name,

@ -2,6 +2,9 @@ Name: Test-A README for Chromium metadata
Short Name: metadata-test-valid Short Name: metadata-test-valid
URL: https://www.example.com/metadata, URL: https://www.example.com/metadata,
https://www.example.com/parser https://www.example.com/parser
Unknown Field: Should be extracted into a field, because the preceding URL
field is structured, thus terminated by another field-like
line, even if the field name isn't well known to us.
Version: 1.0.12 Version: 1.0.12
Date: 2020-12-03 Date: 2020-12-03
License: Apache, 2.0 and MIT License: Apache, 2.0 and MIT

@ -34,6 +34,11 @@ class ParseTest(unittest.TestCase):
("Short Name", "metadata-test-valid"), ("Short Name", "metadata-test-valid"),
("URL", "https://www.example.com/metadata,\n" ("URL", "https://www.example.com/metadata,\n"
" https://www.example.com/parser"), " https://www.example.com/parser"),
("Unknown Field",
"Should be extracted into a field, because the preceding URL\n"
" field is structured, thus terminated by another field-like\n"
" line, even if the field name isn't well known to us."
),
("Version", "1.0.12"), ("Version", "1.0.12"),
("Date", "2020-12-03"), ("Date", "2020-12-03"),
("License", "Apache, 2.0 and MIT"), ("License", "Apache, 2.0 and MIT"),

Loading…
Cancel
Save