From 55d065cc0c7b65c7de40d8c798b78ff693cc3651 Mon Sep 17 00:00:00 2001 From: Struan Shrimpton Date: Wed, 11 Sep 2024 18:18:26 +0000 Subject: [PATCH] Add anonymization and the protos to the telemetry lib Add the protos for the traces used to publish metrics to Clearcut and the anonymization code/test that ensure we don't collect de-anonymized paths. Bug: 326277821 Change-Id: Ifae4d51f59db2219995a0a8d21785729f5eeb137 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/5850298 Reviewed-by: Terrence Reilly Commit-Queue: Struan Shrimpton --- infra_lib/telemetry/anonymization.py | 55 +++++++++++++++++ infra_lib/telemetry/anonymization_unittest.py | 53 ++++++++++++++++ .../telemetry/proto/clientanalytics_pb2.py | 33 ++++++++++ infra_lib/telemetry/proto/trace_span_pb2.py | 60 +++++++++++++++++++ infra_lib/telemetry/proto/update.sh | 4 ++ 5 files changed, 205 insertions(+) create mode 100644 infra_lib/telemetry/anonymization.py create mode 100644 infra_lib/telemetry/anonymization_unittest.py create mode 100644 infra_lib/telemetry/proto/clientanalytics_pb2.py create mode 100644 infra_lib/telemetry/proto/trace_span_pb2.py create mode 100755 infra_lib/telemetry/proto/update.sh diff --git a/infra_lib/telemetry/anonymization.py b/infra_lib/telemetry/anonymization.py new file mode 100644 index 000000000..b6fb83f5d --- /dev/null +++ b/infra_lib/telemetry/anonymization.py @@ -0,0 +1,55 @@ +# Copyright 2024 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+"""Util for anonymizing telemetry spans."""
+
+import getpass
+import re
+
+from typing import Optional, Pattern, Sequence, Tuple
+from google.protobuf import json_format
+
+from .proto import trace_span_pb2
+
+
+class Anonymizer:
+    """Redact the personally identifiable information."""
+
+    def __init__(
+            self,
+            replacements: Optional[Sequence[Tuple[Pattern[str],
+                                                  str]]] = None) -> None:
+        """Stores the rules; adds one redacting the non-root login name."""
+        self._replacements = list(replacements or [])
+        try:
+            user = getpass.getuser()  # Looked up once, reused below.
+        except (ImportError, KeyError, OSError):
+            user = None  # No resolvable login name: nothing to redact.
+        if user is not None and user != "root":
+            # Substituting the root user doesn't actually anonymize anything.
+            self._replacements.append((re.compile(re.escape(user)), ""))
+
+    def __call__(self, *args, **kwargs):
+        """Forwards to apply() so an instance can be used as a function."""
+        return self.apply(*args, **kwargs)
+
+    def apply(self, data: str) -> str:
+        """Applies the replacement rules to data text, in list order."""
+        if not data:
+            return data  # Falsy input (None, "") passes through untouched.
+        for repl_from, repl_to in self._replacements:
+            data = re.sub(repl_from, repl_to, data)
+        return data
+
+
+class AnonymizingFilter:
+    """Applies the anonymizer to TraceSpan messages."""
+
+    def __init__(self, anonymizer: Anonymizer) -> None:
+        self._anonymizer = anonymizer
+
+    def __call__(self,
+                 msg: trace_span_pb2.TraceSpan) -> trace_span_pb2.TraceSpan:
+        """Returns a new TraceSpan rebuilt from msg's anonymized JSON form."""
+        json_msg = self._anonymizer.apply(json_format.MessageToJson(msg))
+        return json_format.Parse(json_msg, trace_span_pb2.TraceSpan())
diff --git a/infra_lib/telemetry/anonymization_unittest.py b/infra_lib/telemetry/anonymization_unittest.py
new file mode 100644
index 000000000..877290af6
--- /dev/null
+++ b/infra_lib/telemetry/anonymization_unittest.py
@@ -0,0 +1,53 @@
+# Copyright 2024 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""Test the config and anonymizer utils."""
+
+import getpass
+import re
+import pytest
+
+from .
import anonymization
+
+
+def test_default_anonymizer_to_remove_username_from_path(monkeypatch) -> None:
+    """Test that default Anonymizer redacts username."""
+    monkeypatch.setattr(getpass, "getuser", lambda: "user")
+
+    a = anonymization.Anonymizer()
+    output = a.apply("/home/user/docs")
+
+    assert output == "/home//docs"
+
+
+def test_anonymizer_to_apply_passed_replacements(monkeypatch) -> None:
+    """Test anonymizer to apply the requested replacements."""
+    # As root no default rule is added, so only the passed rule is exercised.
+    monkeypatch.setattr(getpass, "getuser", lambda: "root")
+
+    replacements = [(re.escape("user"), "")]
+    a = anonymization.Anonymizer(replacements=replacements)
+
+    assert a.apply("/home/user/docs") == "/home//docs"
+
+
+def test_anonymizer_to_apply_multiple_replacements(monkeypatch) -> None:
+    """Test anonymizer to apply the passed replacements in order."""
+    # Pin the user so the outcome doesn't depend on who runs the test.
+    monkeypatch.setattr(getpass, "getuser", lambda: "root")
+
+    replacements = [(re.escape("abc"), "x"), (re.escape("xyz"), "t")]
+    a = anonymization.Anonymizer(replacements=replacements)
+
+    assert a.apply("hello abcd. how is xyz. abcyz") == "hello xd. how is t. t"
+
+
+def test_default_anonymizer_skip_root(monkeypatch) -> None:
+    """Test the anonymizer skips the root user."""
+    monkeypatch.setattr(getpass, "getuser", lambda: "root")
+
+    text = "/root/home service.sysroot.SetupBoard"
+    a = anonymization.Anonymizer()
+    output = a.apply(text)
+
+    assert output == text
diff --git a/infra_lib/telemetry/proto/clientanalytics_pb2.py b/infra_lib/telemetry/proto/clientanalytics_pb2.py
new file mode 100644
index 000000000..e1be2a569
--- /dev/null
+++ b/infra_lib/telemetry/proto/clientanalytics_pb2.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: chromite/telemetry/clientanalytics.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n(chromite/telemetry/clientanalytics.proto\x12\x12\x63hromite.telemetry\";\n\x08LogEvent\x12\x15\n\revent_time_ms\x18\x01 \x01(\x03\x12\x18\n\x10source_extension\x18\x06 \x01(\x0c\"!\n\nClientInfo\x12\x13\n\x0b\x63lient_type\x18\x01 \x01(\x05\"\x9f\x01\n\nLogRequest\x12\x33\n\x0b\x63lient_info\x18\x01 \x01(\x0b\x32\x1e.chromite.telemetry.ClientInfo\x12\x12\n\nlog_source\x18\x02 \x01(\x05\x12\x17\n\x0frequest_time_ms\x18\x04 \x01(\x03\x12/\n\tlog_event\x18\x03 \x03(\x0b\x32\x1c.chromite.telemetry.LogEvent\"/\n\x0bLogResponse\x12 \n\x18next_request_wait_millis\x18\x01 \x01(\x03\x42>Z\n\x0cstack_frames\x18\x01 \x03(\x0b\x32(.chromite.telemetry.TraceSpan.StackFrame\x12\x1c\n\x14\x64ropped_frames_count\x18\x02 \x01(\x03\x12\x17\n\x0fstacktrace_hash\x18\x03 \x01(\t\x1a\xee\x01\n\x06Status\x12\x44\n\x0bstatus_code\x18\x01 \x01(\x0e\x32/.chromite.telemetry.TraceSpan.Status.StatusCode\x12\x0f\n\x07message\x18\x02 \x01(\t\x12=\n\x0bstack_trace\x18\x03 \x01(\x0b\x32(.chromite.telemetry.TraceSpan.StackTrace\"N\n\nStatusCode\x12\x15\n\x11STATUS_CODE_UNSET\x10\x00\x12\x12\n\x0eSTATUS_CODE_OK\x10\x01\x12\x15\n\x11STATUS_CODE_ERROR\x10\x02\x1a\x41\n\x07\x43ontext\x12\x10\n\x08trace_id\x18\x01 \x01(\t\x12\x0f\n\x07span_id\x18\x02 \x01(\t\x12\x13\n\x0btrace_state\x18\x03 \x01(\t\x1ak\n\x04Link\x12\x36\n\x07\x63ontext\x18\x01 \x01(\x0b\x32%.chromite.telemetry.TraceSpan.Context\x12+\n\nattributes\x18\x02 
\x01(\x0b\x32\x17.google.protobuf.Struct\"i\n\x08SpanKind\x12\x19\n\x15SPAN_KIND_UNSPECIFIED\x10\x00\x12\x16\n\x12SPAN_KIND_INTERNAL\x10\x01\x12\x14\n\x10SPAN_KIND_SERVER\x10\x02\x12\x14\n\x10SPAN_KIND_CLIENT\x10\x03\x42>Z clientanalytics_pb2.py +gob-curl https://chromium.googlesource.com/chromiumos/chromite/+/main/api/gen_sdk/chromite/telemetry/trace_span_pb2.py?format=TEXT | base64 --decode > trace_span_pb2.py \ No newline at end of file