metadata: add "structured" field parsing

This CL adds a "structured" concept to the parser. In a structured
field, the parser will proactively look for field-like patterns to
start a new field (even if they aren't known fields).

This mitigates the issue where an unknown field that immediately
follows a multi-line text field is absorbed into that field's value.
For example, given:

URL: https://example.com
UnknownField: abc

Previously, the URL field value was parsed as
"https://example.com<newline>UnknownField: abc"; with this change,
"UnknownField" starts a new field instead.

Bug: b/324149233
Change-Id: I54807bd7b242fc14c679483453ade83f8fd20225
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/5379679
Reviewed-by: Anne Redulla <aredulla@google.com>
Commit-Queue: Jiewei Qian <qjw@chromium.org>
changes/79/5379679/3
Jiewei Qian 2 years ago committed by LUCI CQ
parent 2508c6f716
commit d76c4d6045

@ -33,9 +33,13 @@ class MetadataField:
# The delimiter used to separate multiple values.
VALUE_DELIMITER = ","
def __init__(self, name: str, one_liner: bool = True):
def __init__(self,
name: str,
one_liner: bool = True,
structured: bool = True):
self._name = name
self._one_liner = one_liner
self._structured = structured
def __eq__(self, other):
if not isinstance(other, MetadataField):
@ -52,6 +56,17 @@ class MetadataField:
def is_one_liner(self):
return self._one_liner
def is_structured(self):
"""Whether the field represents structured data, such as a list of
URLs.
If true, parser will treat `Field Name Like Pattern:` in subsequent
lines as a new field, in addition to all known field names.
If false, parser will only recognize known field names, and unknown
fields will be merged into the preceding field value.
"""
return self._structured
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
"""Checks the given value is acceptable for the field.

@ -26,9 +26,12 @@ import metadata.fields.field_types as field_types
NAME = field_types.FreeformTextField("Name")
SHORT_NAME = field_types.FreeformTextField("Short Name")
REVISION = field_types.FreeformTextField("Revision")
DESCRIPTION = field_types.FreeformTextField("Description", one_liner=False)
DESCRIPTION = field_types.FreeformTextField("Description",
one_liner=False,
structured=False)
LOCAL_MODIFICATIONS = field_types.FreeformTextField("Local Modifications",
one_liner=False)
one_liner=False,
structured=False)
# Yes/no fields.
SECURITY_CRITICAL = field_types.YesNoField("Security Critical")

@ -24,9 +24,16 @@ DEPENDENCY_DIVIDER = re.compile(r"^-{20} DEPENDENCY DIVIDER -{20}$")
# Delimiter used to separate a field's name from its value.
FIELD_DELIMITER = ":"
# Heuristic for detecting unknown field names.
# A "word" is an uppercase ASCII letter followed by one or more word
# characters (so a word is at least two characters long).
_PATTERN_FIELD_NAME_WORD_HEURISTIC = r"[A-Z]\w+"
# A field-like line starts (at column 0) with one or more such words separated
# by single spaces, followed by the delimiter and then either whitespace or
# the end of the line. Group 1 captures the candidate field name.
#
# NOTE: the trailing part must not be written as `[\b\s]`: inside a character
# class `\b` means the backspace character (0x08), not a word boundary, which
# would accept "Field:\x08" and reject a bare "Field:" on a final line that
# has no trailing newline.
_PATTERN_FIELD_NAME_HEURISTIC = re.compile(
    r"^({word}(?: {word})*){delimiter}(?:\s|$)".format(
        word=_PATTERN_FIELD_NAME_WORD_HEURISTIC,
        delimiter=re.escape(FIELD_DELIMITER)))
# Whether the parser applies the field-name heuristic by default, i.e. before
# any field has been seen and while inside a field that is not a known field.
_DEFAULT_TO_STRUCTURED_TEXT = False
# Pattern used to check if a line from a metadata file declares a new
# field.
_PATTERN_FIELD_DECLARATION = re.compile(
_PATTERN_KNOWN_FIELD_DECLARATION = re.compile(
"^({}){}".format("|".join(known_fields.ALL_FIELD_NAMES), FIELD_DELIMITER),
re.IGNORECASE)
@ -44,6 +51,7 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_metadata = dm.DependencyMetadata()
current_field_name = None
current_field_value = ""
current_field_is_structured = _DEFAULT_TO_STRUCTURED_TEXT
for line in content.splitlines(keepends=True):
# Check if a new dependency is being described.
if DEPENDENCY_DIVIDER.match(line):
@ -59,8 +67,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_metadata = dm.DependencyMetadata()
current_field_name = None
current_field_value = ""
current_field_is_structured = False
elif _PATTERN_FIELD_DECLARATION.match(line):
elif (_PATTERN_KNOWN_FIELD_DECLARATION.match(line)
or (current_field_is_structured
and _PATTERN_FIELD_NAME_HEURISTIC.match(line))):
# Save the field value to the current dependency's metadata.
if current_field_name:
current_metadata.add_entry(current_field_name,
@ -69,6 +80,11 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_field_name, current_field_value = line.split(
FIELD_DELIMITER, 1)
field = known_fields.get_field(current_field_name)
# Treats unknown fields as `_DEFAULT_TO_STRUCTURED_TEXT`.
current_field_is_structured = field.is_structured(
) if field else _DEFAULT_TO_STRUCTURED_TEXT
if field and field.is_one_liner():
# The field should be on one line, so add it now.
current_metadata.add_entry(current_field_name,

@ -2,6 +2,9 @@ Name: Test-A README for Chromium metadata
Short Name: metadata-test-valid
URL: https://www.example.com/metadata,
https://www.example.com/parser
Unknown Field: Should be extracted into a field, because the preceding URL
field is structured, thus terminated by another field-like
line, even if the field name isn't well known to us.
Version: 1.0.12
Date: 2020-12-03
License: Apache, 2.0 and MIT

@ -34,6 +34,11 @@ class ParseTest(unittest.TestCase):
("Short Name", "metadata-test-valid"),
("URL", "https://www.example.com/metadata,\n"
" https://www.example.com/parser"),
("Unknown Field",
"Should be extracted into a field, because the preceding URL\n"
" field is structured, thus terminated by another field-like\n"
" line, even if the field name isn't well known to us."
),
("Version", "1.0.12"),
("Date", "2020-12-03"),
("License", "Apache, 2.0 and MIT"),

Loading…
Cancel
Save