From 915dd5bd0acdfd01f7f7be048834b957cf0bdf06 Mon Sep 17 00:00:00 2001 From: Aaron Gonzales Date: Mon, 8 Jun 2026 22:35:21 +0000 Subject: [PATCH] Add regex-backed detection strategies Signed-off-by: Aaron Gonzales --- .../engine/detection/detection_workflow.py | 75 +- src/anonymizer/engine/detection/rules.py | 274 +++ tests/engine/test_detection_rules.py | 318 +++ tests/tools/test_benchmark_output_analysis.py | 156 +- tests/tools/test_compare_strategy_pairs.py | 107 +- tests/tools/test_detection_strategies.py | 1210 +++++++++- tests/tools/test_extract_signature_deltas.py | 16 +- tests/tools/test_measurement_tools.py | 510 ++++- .../tools/test_screen_strategy_comparisons.py | 130 +- .../test_staged_detection_output_analysis.py | 108 +- tests/tools/test_staged_detection_probe.py | 233 ++ tools/measurement/README.md | 1946 +++++++++++++++-- tools/measurement/analyze_benchmark_output.py | 16 + .../analyze_staged_detection_output.py | 59 + tools/measurement/compare_strategy_pairs.py | 15 +- tools/measurement/detection_strategies.py | 908 +++++++- tools/measurement/extract_signature_deltas.py | 23 + tools/measurement/run_benchmarks.py | 139 +- .../screen_strategy_comparisons.py | 32 + tools/measurement/staged_detection_probe.py | 117 +- 20 files changed, 5877 insertions(+), 515 deletions(-) create mode 100644 src/anonymizer/engine/detection/rules.py create mode 100644 tests/engine/test_detection_rules.py diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index c0a34f83..153eb50d 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -49,7 +49,12 @@ parse_detected_entities, prepare_validation_inputs, ) -from anonymizer.engine.detection.postprocess import EntitySpan, group_entities_by_value +from anonymizer.engine.detection.postprocess import EntitySpan, build_tagged_text, group_entities_by_value +from 
anonymizer.engine.detection.rules import ( + STRUCTURED_RULE_FAST_LANE_LABELS, + SUPPORTED_RULE_LABELS, + detect_high_confidence_entities, +) from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter from anonymizer.engine.ndd.model_loader import resolve_model_alias, resolve_model_aliases from anonymizer.engine.prompt_utils import substitute_placeholders @@ -86,6 +91,33 @@ class EntityDetectionWorkflow: def __init__(self, adapter: NddAdapter) -> None: self._adapter = adapter + def detect_with_high_confidence_rules( + self, + dataframe: pd.DataFrame, + *, + entity_labels: list[str] | None = None, + ) -> EntityDetectionResult: + """Detect only deterministic high-confidence rule spans without DataDesigner. + + This is an internal fast-lane primitive for benchmark probes and + future routing work. It is intentionally limited to labels with narrow + deterministic coverage and does not attempt contextual PII detection. + """ + labels = _resolve_detection_labels(entity_labels) + _ensure_high_confidence_rule_labels(labels) + output = dataframe.copy() + output[COL_DETECTED_ENTITIES] = output[COL_TEXT].apply( + lambda text: _high_confidence_rule_payload(text, labels=labels) + ) + output[COL_TAGGED_TEXT] = output.apply( + lambda row: _tagged_text_from_entities( + text=row.get(COL_TEXT, ""), + raw_entities=row.get(COL_DETECTED_ENTITIES, {}), + ), + axis=1, + ) + return EntityDetectionResult(dataframe=output, failed_records=[]) + def detect_and_validate_entities( self, dataframe: pd.DataFrame, @@ -357,6 +389,47 @@ def _resolve_detection_labels(entity_labels: list[str] | None) -> list[str]: return list(entity_labels) +def labels_are_supported_by_high_confidence_rules(labels: list[str]) -> bool: + """Return True when every label can be handled by deterministic rules.""" + return set(labels).issubset(SUPPORTED_RULE_LABELS) + + +def labels_are_supported_by_structured_rule_fast_lane(labels: list[str]) -> bool: + """Return True when every label is safe for the structured 
no-model fast lane.""" + return set(labels).issubset(STRUCTURED_RULE_FAST_LANE_LABELS) + + +def _ensure_high_confidence_rule_labels(labels: list[str]) -> None: + unsupported = sorted(set(labels) - SUPPORTED_RULE_LABELS) + if unsupported: + supported = ", ".join(sorted(SUPPORTED_RULE_LABELS)) + raise ValueError( + f"unsupported high-confidence rule labels: {', '.join(unsupported)}; supported labels: {supported}" + ) + + +def _high_confidence_rule_payload(text: object, *, labels: list[str]) -> dict: + spans = detect_high_confidence_entities(str(text), labels=labels) + return EntitiesSchema(entities=[span.as_dict() for span in spans]).model_dump(mode="json") + + +def _tagged_text_from_entities(*, text: object, raw_entities: object) -> str: + parsed = EntitiesSchema.from_raw(raw_entities) + spans = [ + EntitySpan( + entity_id=e.id, + value=e.value, + label=e.label, + start_position=e.start_position, + end_position=e.end_position, + score=e.score, + source=e.source, + ) + for e in parsed.entities + ] + return build_tagged_text(text=str(text), entities=spans) + + def _materialize_final_entities(raw: object, *, allowed_labels: set[str] | None) -> dict: """Build COL_FINAL_ENTITIES, optionally filtering to *allowed_labels*.""" parsed = EntitiesSchema.from_raw(raw) diff --git a/src/anonymizer/engine/detection/rules.py b/src/anonymizer/engine/detection/rules.py new file mode 100644 index 00000000..1b96748b --- /dev/null +++ b/src/anonymizer/engine/detection/rules.py @@ -0,0 +1,274 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import re +from collections.abc import Iterable +from dataclasses import dataclass + +from anonymizer.engine.detection.postprocess import EntitySpan, resolve_overlaps + +_RULE_SCORE = 1.0 +_RULE_SOURCE = "rule" +_RELIGIOUS_BELIEF_TERMS = ( + "agnostic", + "atheist", + "baptist", + "buddhist", + "catholic", + "christian", + "hindu", + "jewish", + "mormon", + "muslim", + "protestant", + "secular", +) +_RELIGIOUS_BELIEF_RE = "|".join(re.escape(term) for term in _RELIGIOUS_BELIEF_TERMS) +_COOKIE_PAIR_RE = r"[A-Za-z][A-Za-z0-9_-]*=[^;'\s\"\r\n]+" +_COOKIE_VALUE_RE = rf"({_COOKIE_PAIR_RE}(?:;\s*{_COOKIE_PAIR_RE})*)" +_STRUCTURED_ID_VALUE_RE = ( + r"(?:[A-Za-z][A-Za-z0-9]{1,20}[-_][A-Za-z0-9][A-Za-z0-9_-]{5,}|" + r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})" +) + + +@dataclass(frozen=True) +class _RulePattern: + label: str + pattern: re.Pattern[str] + group: int = 0 + + +_RULES: tuple[_RulePattern, ...] 
= ( + _RulePattern( + label="api_key", + pattern=re.compile(r"sk-(?:test|ant-api03|proj|prod)-[A-Za-z0-9_-]{16,}"), + ), + _RulePattern(label="api_key", pattern=re.compile(r"ghp_[A-Za-z0-9_]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"hf_[A-Za-z0-9]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"pat-[A-Za-z0-9_-]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"xoxb-[A-Za-z0-9-]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"AIza[A-Za-z0-9_-]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"ya29\.[A-Za-z0-9_-]{20,}")), + _RulePattern(label="api_key", pattern=re.compile(r"AKIA[A-Z0-9]{16,}")), + _RulePattern( + label="api_key", + pattern=re.compile( + r"\b(?:api[_-]?key|token|auth[_-]?token|session[_-]?id|aws_access_key_id|access_key_id)=" + r"([^\s;'\"\\]{8,})", + flags=re.IGNORECASE, + ), + group=1, + ), + _RulePattern( + label="api_key", + pattern=re.compile(r"Authorization:\s*Bearer\s+([A-Za-z0-9._-]{16,})", flags=re.IGNORECASE), + group=1, + ), + _RulePattern( + label="http_cookie", + pattern=re.compile(rf"\bCookie:\s*{_COOKIE_VALUE_RE}", flags=re.IGNORECASE), + group=1, + ), + _RulePattern( + label="http_cookie", + pattern=re.compile(rf"\bcookie\s*=\s*{_COOKIE_VALUE_RE}", flags=re.IGNORECASE), + group=1, + ), + _RulePattern( + label="pin", + pattern=re.compile(r"(?]+", + flags=re.IGNORECASE, + ), + ), + _RulePattern( + label="email", + pattern=re.compile(r"(?]+")), + _RulePattern( + label="date_of_birth", + pattern=re.compile( + r"\b(?:born|date\s+of\s+birth|dob)\s*(?:[:=-]|\bin\b|\bon\b)?\s*" + r"(\d{4}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}-\d{2}-\d{2})\b", + flags=re.IGNORECASE, + ), + group=1, + ), + _RulePattern( + label="religious_belief", + pattern=re.compile( + rf"\b(?:describes?\s+(?:himself|herself|themself|themselves)\s+as|" + rf"identif(?:y|ies)\s+as|raised\s+in\s+the|practicing)\s+" + rf"(?:a|an|the)?\s*({_RELIGIOUS_BELIEF_RE})\b", + flags=re.IGNORECASE, + ), + group=1, + ), + 
_RulePattern( + label="street_address", + pattern=re.compile( + r"\b(?:lives?\s+at|living\s+at|house\s+on|home\s+on)\s+" + r"([A-Z0-9][A-Za-z0-9.\s-]{1,60}?\b" + r"(?:Street|St\.?|Avenue|Ave\.?|Road|Rd\.?|Drive|Dr\.?|Trail|Boulevard|Blvd\.?|Lane|Ln\.?|Court|Ct\.?))", + ), + group=1, + ), + _RulePattern( + label="organization_name", + pattern=re.compile( + r"\b(?:at|from|with|joining|joined)\s+" + r"([A-Z][A-Za-z0-9&.'\u2019 -]{2,90}?\b" + r"(?:Center|Hospital|Clinic|University|College|Institute|Bank|Builders|Construction|Woodworks|Health))" + r"\b", + ), + group=1, + ), +) + +SUPPORTED_RULE_LABELS = frozenset(rule.label for rule in _RULES) +STRUCTURED_RULE_FAST_LANE_LABELS = frozenset( + { + "api_key", + "email", + "http_cookie", + "password", + "pin", + "unique_id", + "url", + "user_name", + } +) + + +def detect_high_confidence_entities(text: str, labels: Iterable[str] | None = None) -> list[EntitySpan]: + """Detect deterministic high-confidence PII and secret spans in raw text. + + These rules intentionally cover narrow, high-signal command/log and prose + patterns. They are suitable as a local seed detector or benchmark probe, + not as a complete replacement for model-backed contextual detection. 
+ """ + allowed_labels = set(labels) if labels is not None else None + spans: list[EntitySpan] = [] + + for rule in _RULES: + if allowed_labels is not None and rule.label not in allowed_labels: + continue + for match in rule.pattern.finditer(text): + start, end = match.span(rule.group) + if start < 0 or end <= start: + continue + value = text[start:end] + value, end = _trim_rule_value(label=rule.label, value=value, end=end) + if not value: + continue + spans.append( + EntitySpan( + entity_id=_build_rule_entity_id(label=rule.label, start=start, end=end), + value=value, + label=rule.label, + start_position=start, + end_position=end, + score=_RULE_SCORE, + source=_RULE_SOURCE, + ) + ) + + return resolve_overlaps(_deduplicate(spans)) + + +def _trim_rule_value(*, label: str, value: str, end: int) -> tuple[str, int]: + if label != "http_cookie": + return value, end + trimmed = value.rstrip(".,") + return trimmed, end - (len(value) - len(trimmed)) + + +def _deduplicate(entities: list[EntitySpan]) -> list[EntitySpan]: + seen: set[tuple[str, int, int]] = set() + deduplicated: list[EntitySpan] = [] + for entity in entities: + key = (entity.label, entity.start_position, entity.end_position) + if key in seen: + continue + seen.add(key) + deduplicated.append(entity) + return deduplicated + + +def _build_rule_entity_id(*, label: str, start: int, end: int) -> str: + return f"{label}_{start}_{end}" diff --git a/tests/engine/test_detection_rules.py b/tests/engine/test_detection_rules.py new file mode 100644 index 00000000..d7640ed5 --- /dev/null +++ b/tests/engine/test_detection_rules.py @@ -0,0 +1,318 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections import Counter +from unittest.mock import Mock + +import pandas as pd +import pytest + +from anonymizer.engine.constants import COL_DETECTED_ENTITIES, COL_TAGGED_TEXT, COL_TEXT +from anonymizer.engine.detection.detection_workflow import EntityDetectionWorkflow +from anonymizer.engine.detection.rules import ( + STRUCTURED_RULE_FAST_LANE_LABELS, + SUPPORTED_RULE_LABELS, + detect_high_confidence_entities, +) +from anonymizer.engine.schemas import EntitiesSchema + +SHELL_TEXT = """$ curl -H 'Authorization: Bearer sk-test-AAAAAAAAAAAAAAAAAAAAAAAA' https://internal.example.test/api +$ export AWS_ACCESS_KEY_ID=AKIATEST1234567890FAKE +$ export AWS_SECRET_ACCESS_KEY=fakeSecretValue1234567890! +$ docker run -e DATABASE_URL='postgres://app_user:fakeDbPass123!@db.example.test:5432/app' -e API_KEY=ghp_FAKEtoken1234567890abcdef myapp:latest +$ ssh jane.doe@example.test@host-01.example.test +Password: fakeSshPass123! 
+""" + + +def test_detect_high_confidence_entities_extracts_shell_secret_values() -> None: + entities = detect_high_confidence_entities( + SHELL_TEXT, + labels=["api_key", "password", "email", "url"], + ) + + assert Counter(entity.label for entity in entities) == { + "api_key": 3, + "password": 2, + "email": 1, + "url": 2, + } + values_by_label = {(entity.label, entity.value) for entity in entities} + assert ("api_key", "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA") in values_by_label + assert ("api_key", "AKIATEST1234567890FAKE") in values_by_label + assert ("api_key", "ghp_FAKEtoken1234567890abcdef") in values_by_label + assert ("password", "fakeSecretValue1234567890!") in values_by_label + assert ("password", "fakeSshPass123!") in values_by_label + assert ("email", "jane.doe@example.test") in values_by_label + assert ("url", "https://internal.example.test/api") in values_by_label + assert ("url", "postgres://app_user:fakeDbPass123!@db.example.test:5432/app") in values_by_label + + values = [entity.value for entity in entities] + assert all(not value.startswith(("Authorization", "Bearer", "API_KEY=", "Password:")) for value in values) + + +def test_detect_high_confidence_entities_extracts_email_before_sentence_punctuation() -> None: + entities = detect_high_confidence_entities( + "Email alice@example.com. 
Then contact bob@example.co.uk, if needed.", + labels=["email"], + ) + + assert [entity.value for entity in entities] == ["alice@example.com", "bob@example.co.uk"] + + +def test_detect_high_confidence_entities_excludes_config_url_separators() -> None: + text = ( + "DATABASE_URL=postgres://svc_user:DbSecretPass2026!@db.example.test:5432/app; " + "endpoint: https://internal.example.test/admin;" + ) + + entities = detect_high_confidence_entities(text, labels=["url"]) + + assert [entity.value for entity in entities] == [ + "postgres://svc_user:DbSecretPass2026!@db.example.test:5432/app", + "https://internal.example.test/admin", + ] + + +def test_supported_rule_labels_match_detected_label_families() -> None: + assert SUPPORTED_RULE_LABELS == { + "api_key", + "date_of_birth", + "email", + "http_cookie", + "organization_name", + "password", + "pin", + "religious_belief", + "street_address", + "unique_id", + "url", + "user_name", + } + + +def test_structured_rule_fast_lane_excludes_narrow_prose_labels() -> None: + assert STRUCTURED_RULE_FAST_LANE_LABELS == { + "api_key", + "email", + "http_cookie", + "password", + "pin", + "unique_id", + "url", + "user_name", + } + assert {"date_of_birth", "organization_name", "religious_belief", "street_address"}.isdisjoint( + STRUCTURED_RULE_FAST_LANE_LABELS + ) + + +def test_detect_high_confidence_entities_respects_label_filter() -> None: + entities = detect_high_confidence_entities(SHELL_TEXT, labels=["password"]) + + assert Counter(entity.label for entity in entities) == {"password": 3} + assert {entity.value for entity in entities} == { + "fakeSecretValue1234567890!", + "fakeDbPass123!", + "fakeSshPass123!", + } + + +def test_detect_high_confidence_entities_extracts_sudo_stdin_password() -> None: + text = '$ echo "P@ssw0rd-local-2026!" 
| sudo -S systemctl restart nginx' + + entities = detect_high_confidence_entities(text, labels=["password"]) + + assert [(entity.label, entity.value) for entity in entities] == [("password", "P@ssw0rd-local-2026!")] + + +def test_detect_high_confidence_entities_does_not_treat_generic_echo_as_password() -> None: + text = '$ echo "P@ssw0rd-local-2026!" | grep local' + + assert detect_high_confidence_entities(text, labels=["password"]) == [] + + +def test_detect_high_confidence_entities_does_not_emit_secret_false_positives_for_prose() -> None: + prose = ( + "Alice Johnson filed Case No. 2025-CV-12345 in Superior Court. " + "The opinion cites Section 10(b), Exhibit A-17, and docket trace order_390974. " + "A biography says Jordan Patel joined NVIDIA in 2021 and later moved to Seattle." + ) + + entities = detect_high_confidence_entities(prose, labels=["api_key", "password", "email", "url"]) + + assert entities == [] + + +def test_detect_high_confidence_entities_extracts_contextual_date_of_birth() -> None: + text = "The applicant was born in 1978 and later moved to Berlin. Another report cites 2024." + + entities = detect_high_confidence_entities(text, labels=["date_of_birth"]) + + assert [(entity.label, entity.value) for entity in entities] == [("date_of_birth", "1978")] + + +def test_detect_high_confidence_entities_ignores_standalone_year_for_date_of_birth() -> None: + text = "The report cites filings from 1978, 2021, and 2024." + + assert detect_high_confidence_entities(text, labels=["date_of_birth"]) == [] + + +def test_detect_high_confidence_entities_extracts_narrow_prose_patterns() -> None: + text = ( + "After graduation he spent three years at NASA's Goddard Space Flight Center before joining a lab. " + "Idilio describes himself as secular and leans progressive on most political issues. " + "Outside the lab, Idilio shares a modest house on West Roberts Drive with his wife." 
+ ) + + entities = detect_high_confidence_entities( + text, + labels=["organization_name", "religious_belief", "street_address"], + ) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("organization_name", "NASA's Goddard Space Flight Center"), + ("religious_belief", "secular"), + ("street_address", "West Roberts Drive"), + ] + + +def test_detect_high_confidence_entities_avoids_generic_prose_belief_false_positive() -> None: + text = "Jordan describes himself as careful and later worked at a local lab near Roberts Drive." + + assert ( + detect_high_confidence_entities( + text, + labels=["organization_name", "religious_belief", "street_address"], + ) + == [] + ) + + +def test_detect_high_confidence_entities_returns_sorted_non_overlapping_spans() -> None: + entities = detect_high_confidence_entities( + "token=sk-test-BBBBBBBBBBBBBBBBBBBBBBBB and Auth: ignored\nPassword: fakePass123!", + labels=["api_key", "password"], + ) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", "sk-test-BBBBBBBBBBBBBBBBBBBBBBBB"), + ("password", "fakePass123!"), + ] + assert entities[0].end_position < entities[1].start_position + + +def test_detect_high_confidence_entities_extracts_session_id_assignments() -> None: + text = "Cookie: session_id=abc123xyz; auth_token=xoxb-STRUCTURED-Slack-Token-000000" + + entities = detect_high_confidence_entities(text, labels=["api_key"]) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", "abc123xyz"), + ("api_key", "xoxb-STRUCTURED-Slack-Token-000000"), + ] + + +def test_detect_high_confidence_entities_extracts_structured_identifier_labels() -> None: + text = ( + "POST /audit HTTP/1.1\n" + "Cookie: session_id=abc123xyz; user_id=26762; auth_token=token-abcdef\n" + "trace-id: req_KA5k78XNwT0yUNZkPpwq\n" + "pin=97294\n" + "user_name=sloanenguy217\n" + ) + + entities = detect_high_confidence_entities( + text, + labels=["http_cookie", "pin", "unique_id", "user_name"], + ) + 
+ assert [(entity.label, entity.value) for entity in entities] == [ + ("http_cookie", "session_id=abc123xyz; user_id=26762; auth_token=token-abcdef"), + ("unique_id", "req_KA5k78XNwT0yUNZkPpwq"), + ("pin", "97294"), + ("user_name", "sloanenguy217"), + ] + + +def test_detect_high_confidence_entities_extracts_quoted_structured_identifier_keys() -> None: + text = '{"user": "avery_khan", "pin": "4921", "callback": "https://internal.example.test/admin"}' + + entities = detect_high_confidence_entities(text, labels=["pin", "url", "user_name"]) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("user_name", "avery_khan"), + ("pin", "4921"), + ("url", "https://internal.example.test/admin"), + ] + + +def test_detect_high_confidence_entities_excludes_cookie_sentence_punctuation() -> None: + text = "Cookie: session_id=abc123xyz; auth_token=token-abcdef. Recovery flow starts." + + entities = detect_high_confidence_entities(text, labels=["http_cookie"]) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("http_cookie", "session_id=abc123xyz; auth_token=token-abcdef"), + ] + + +def test_detect_high_confidence_entities_extracts_service_principal_user_and_tenant_id() -> None: + text = "$ az login --service-principal -u skylerlee985 -p fakePass123! --tenant trace-1b7278d77a73ef4e" + + entities = detect_high_confidence_entities(text, labels=["user_name", "unique_id"]) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("user_name", "skylerlee985"), + ("unique_id", "trace-1b7278d77a73ef4e"), + ] + + +def test_detect_high_confidence_entities_extracts_audit_user_and_trace_id() -> None: + text = "Audit record: user skylerlee985 opened session with trace-id req_KA5k78XNwT0yUNZkPpwq." 
+ + entities = detect_high_confidence_entities(text, labels=["user_name", "unique_id"]) + + assert [(entity.label, entity.value) for entity in entities] == [ + ("user_name", "skylerlee985"), + ("unique_id", "req_KA5k78XNwT0yUNZkPpwq"), + ] + + +def test_detect_high_confidence_entities_does_not_extract_structured_identifiers_from_generic_prose() -> None: + text = "The order_390974 filing mentions user research, cookie recipes, and a five digit docket page." + + assert detect_high_confidence_entities(text, labels=["http_cookie", "pin", "unique_id", "user_name"]) == [] + + +def test_workflow_can_detect_with_high_confidence_rules_without_adapter_calls() -> None: + adapter = Mock() + workflow = EntityDetectionWorkflow(adapter=adapter) + + result = workflow.detect_with_high_confidence_rules( + pd.DataFrame({COL_TEXT: ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA\nPassword: fakePass123!"]}), + entity_labels=["api_key", "password"], + ) + + adapter.run_workflow.assert_not_called() + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"), + ("password", "fakePass123!"), + ] + tagged_text = result.dataframe[COL_TAGGED_TEXT].iloc[0] + assert "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" in tagged_text + assert "fakePass123!" 
in tagged_text + assert result.failed_records == [] + + +def test_workflow_rule_detection_rejects_unsupported_labels() -> None: + workflow = EntityDetectionWorkflow(adapter=Mock()) + + with pytest.raises(ValueError, match="unsupported high-confidence rule labels.*person"): + workflow.detect_with_high_confidence_rules( + pd.DataFrame({COL_TEXT: ["Alice has token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}), + entity_labels=["api_key", "person"], + ) diff --git a/tests/tools/test_benchmark_output_analysis.py b/tests/tools/test_benchmark_output_analysis.py index 6d620dbe..9f309c73 100644 --- a/tests/tools/test_benchmark_output_analysis.py +++ b/tests/tools/test_benchmark_output_analysis.py @@ -199,7 +199,7 @@ def test_analyze_benchmark_output_joins_measurements_and_detection_artifacts(tmp }, { "record_type": "record", - "run_id": "shell__native-local__r000", + "run_id": "shell__rules-only__r000", "text_length_tokens": 750, "final_entity_count": 8, "replacement_count": 8, @@ -214,12 +214,12 @@ def test_analyze_benchmark_output_joins_measurements_and_detection_artifacts(tmp "run_tags": { "suite_id": "suite", "workload_id": "shell", - "config_id": "native-local", - "experimental_detection_strategy": "native_single_pass", + "config_id": "rules-only", + "experimental_detection_strategy": "rules_only", "experimental_replacement_strategy": "local_structured_substitute", "dd_parser_compat": "raw_json", "repetition": 0, - "case_id": "shell__native-local__r000", + "case_id": "shell__rules-only__r000", }, }, ], @@ -259,23 +259,23 @@ def test_analyze_benchmark_output_joins_measurements_and_detection_artifacts(tmp { "suite_id": "suite", "workload_id": "shell", - "config_id": "native-local", + "config_id": "rules-only", "repetition": 0, - "case_id": "shell__native-local__r000", - "run_id": "shell__native-local__r000", - "workflow_name": "native-single-pass", + "case_id": "shell__rules-only__r000", + "run_id": "shell__rules-only__r000", + "workflow_name": "rules-only", "seed_entity_count": 
8, "seed_validation_candidate_count": 0, "augmented_entity_count": 0, "augmented_new_final_value_count": 0, "final_entity_count": 8, - "final_source_counts": {"augmenter": 8}, + "final_source_counts": {"rule": 8}, "final_entity_signature_hashes": ["shell-hash-a"], "final_entity_signature_labels": {"shell-hash-a": "api_key"}, "final_entity_signature_details": { "shell-hash-a": { "label": "api_key", - "source": "native", + "source": "rule", "row_index": 0, "start_position": 12, "end_position": 32, @@ -422,27 +422,25 @@ def test_analyze_benchmark_output_joins_measurements_and_detection_artifacts(tmp } } assert cases["bio__default__r000"].artifact_final_entity_signature_count == 2 - assert cases["shell__native-local__r000"].observed_total_requests == 0 - assert cases["shell__native-local__r000"].experimental_replacement_strategy == "local_structured_substitute" - assert cases["shell__native-local__r000"].observed_failed_request_rate is None - assert cases["shell__native-local__r000"].observed_bridge_fallback_requests is None - assert cases["shell__native-local__r000"].observed_non_bridge_failed_requests is None - assert cases["shell__native-local__r000"].final_entity_count == 8 - assert cases["shell__native-local__r000"].replacement_missing_final_entity_count == 0 - assert cases["shell__native-local__r000"].replacement_missing_final_entity_label_counts == {} - assert cases["shell__native-local__r000"].replacement_missing_final_value_count == 0 - assert cases["shell__native-local__r000"].replacement_synthetic_original_collision_count == 0 - assert cases["shell__native-local__r000"].replacement_synthetic_original_collision_label_counts == {} - assert cases["shell__native-local__r000"].replacement_synthetic_original_collision_value_count == 0 - assert cases["shell__native-local__r000"].original_value_leak_count == 1 - assert cases["shell__native-local__r000"].original_value_leak_record_count == 1 - assert cases["shell__native-local__r000"].original_value_leak_label_counts 
== {"api_key": 1} - assert cases["shell__native-local__r000"].artifact_final_augmenter_entity_count == 8 - assert cases["shell__native-local__r000"].artifact_final_entity_signature_hashes == ["shell-hash-a"] - assert cases["shell__native-local__r000"].artifact_final_entity_signature_labels == {"shell-hash-a": "api_key"} - assert ( - cases["shell__native-local__r000"].artifact_final_entity_signature_details["shell-hash-a"]["source"] == "native" - ) + assert cases["shell__rules-only__r000"].observed_total_requests == 0 + assert cases["shell__rules-only__r000"].experimental_replacement_strategy == "local_structured_substitute" + assert cases["shell__rules-only__r000"].observed_failed_request_rate is None + assert cases["shell__rules-only__r000"].observed_bridge_fallback_requests is None + assert cases["shell__rules-only__r000"].observed_non_bridge_failed_requests is None + assert cases["shell__rules-only__r000"].final_entity_count == 8 + assert cases["shell__rules-only__r000"].replacement_missing_final_entity_count == 0 + assert cases["shell__rules-only__r000"].replacement_missing_final_entity_label_counts == {} + assert cases["shell__rules-only__r000"].replacement_missing_final_value_count == 0 + assert cases["shell__rules-only__r000"].replacement_synthetic_original_collision_count == 0 + assert cases["shell__rules-only__r000"].replacement_synthetic_original_collision_label_counts == {} + assert cases["shell__rules-only__r000"].replacement_synthetic_original_collision_value_count == 0 + assert cases["shell__rules-only__r000"].original_value_leak_count == 1 + assert cases["shell__rules-only__r000"].original_value_leak_record_count == 1 + assert cases["shell__rules-only__r000"].original_value_leak_label_counts == {"api_key": 1} + assert cases["shell__rules-only__r000"].artifact_final_rule_entity_count == 8 + assert cases["shell__rules-only__r000"].artifact_final_entity_signature_hashes == ["shell-hash-a"] + assert 
cases["shell__rules-only__r000"].artifact_final_entity_signature_labels == {"shell-hash-a": "api_key"} + assert cases["shell__rules-only__r000"].artifact_final_entity_signature_details["shell-hash-a"]["source"] == "rule" model_rows = {row.model_name: row for row in result.model_usage} assert model_rows["nvidia/gliner-pii"].observed_failed_requests == 0 assert model_rows["nvidia/gliner-pii"].observed_failed_request_rate == 0 @@ -523,7 +521,7 @@ def test_analyze_benchmark_output_counts_generic_model_workflow_records(tmp_path { "record_type": "model_workflow", "run_id": "bio__native__r000", - "workflow_name": "entity-detection-native-single-pass", + "workflow_name": "entity-detection-native-rules-router", "elapsed_sec": 0.25, "observed_total_requests": 3, "observed_successful_requests": 3, @@ -552,7 +550,7 @@ def test_analyze_benchmark_output_counts_generic_model_workflow_records(tmp_path "suite_id": "suite", "workload_id": "bio", "config_id": "native", - "experimental_detection_strategy": "native_single_pass", + "experimental_detection_strategy": "native_rules_router", "experimental_replacement_strategy": "default", "dd_parser_compat": "raw_json", "repetition": 0, @@ -570,7 +568,7 @@ def test_analyze_benchmark_output_counts_generic_model_workflow_records(tmp_path "suite_id": "suite", "workload_id": "bio", "config_id": "native", - "experimental_detection_strategy": "native_single_pass", + "experimental_detection_strategy": "native_rules_router", "experimental_replacement_strategy": "default", "dd_parser_compat": "raw_json", "repetition": 0, @@ -589,7 +587,7 @@ def test_analyze_benchmark_output_counts_generic_model_workflow_records(tmp_path assert case.observed_failed_request_rate == 0 assert result.model_usage_count == 1 model_row = result.model_usage[0] - assert model_row.workflow_name == "entity-detection-native-single-pass" + assert model_row.workflow_name == "entity-detection-native-rules-router" assert model_row.model_alias == "native-direct" assert 
model_row.model_name == "nvidia/nemotron-3-super" assert model_row.observed_total_tokens == 42 @@ -692,21 +690,21 @@ def test_write_analysis_tables_exports_case_and_group_tables(tmp_path: Path) -> tool.CaseAnalysisRow( suite_id="suite", workload_id="shell", - config_id="native", - experimental_detection_strategy="native_single_pass", + config_id="rules", + experimental_detection_strategy="rules_only", experimental_replacement_strategy="local_structured_substitute", dd_parser_compat="raw_json", repetition=0, - case_id="shell__native__r000", - run_id="shell__native__r000", + case_id="shell__rules__r000", + run_id="shell__rules__r000", final_entity_count=8, ) ], groups=[ tool.GroupAnalysisRow( workload_id="shell", - config_id="native", - experimental_detection_strategy="native_single_pass", + config_id="rules", + experimental_detection_strategy="rules_only", experimental_replacement_strategy="local_structured_substitute", case_count=1, median_final_entity_count=8, @@ -715,17 +713,18 @@ def test_write_analysis_tables_exports_case_and_group_tables(tmp_path: Path) -> median_observed_output_tokens=0, median_observed_failed_request_rate=0, median_artifact_final_entity_count=8, + median_artifact_final_rule_entity_count=8, ) ], model_usage=[ tool.ModelUsageAnalysisRow( workload_id="shell", - config_id="native", - experimental_detection_strategy="native_single_pass", + config_id="rules", + experimental_detection_strategy="rules_only", experimental_replacement_strategy="local_structured_substitute", dd_parser_compat="raw_json", - case_id="shell__native__r000", - run_id="shell__native__r000", + case_id="shell__rules__r000", + run_id="shell__rules__r000", workflow_name="entity-detection", model_name="nvidia/gliner-pii", observed_total_requests=1, @@ -736,8 +735,8 @@ def test_write_analysis_tables_exports_case_and_group_tables(tmp_path: Path) -> model_usage_groups=[ tool.ModelUsageGroupAnalysisRow( workload_id="shell", - config_id="native", - 
experimental_detection_strategy="native_single_pass", + config_id="rules", + experimental_detection_strategy="rules_only", experimental_replacement_strategy="local_structured_substitute", dd_parser_compat="raw_json", workflow_name="entity-detection", @@ -756,7 +755,7 @@ def test_write_analysis_tables_exports_case_and_group_tables(tmp_path: Path) -> output_dir = tmp_path / "tables" tool.write_analysis_tables(result, output_dir, tool.ExportFormat.csv) - assert pd.read_csv(output_dir / "case_analysis.csv")["case_id"].tolist() == ["shell__native__r000"] + assert pd.read_csv(output_dir / "case_analysis.csv")["case_id"].tolist() == ["shell__rules__r000"] assert pd.read_csv(output_dir / "case_analysis.csv")["experimental_replacement_strategy"].tolist() == [ "local_structured_substitute" ] @@ -814,7 +813,7 @@ def test_analyze_benchmark_output_groups_replacement_strategies_separately(tmp_p "run_tags": { "workload_id": "secrets", "config_id": "candidate", - "experimental_detection_strategy": "native_single_pass", + "experimental_detection_strategy": "rules_covered_or_default", "experimental_replacement_strategy": "default", "case_id": "secrets__candidate__r000", }, @@ -826,7 +825,7 @@ def test_analyze_benchmark_output_groups_replacement_strategies_separately(tmp_p "run_tags": { "workload_id": "secrets", "config_id": "candidate", - "experimental_detection_strategy": "native_single_pass", + "experimental_detection_strategy": "rules_covered_or_default", "experimental_replacement_strategy": "local_structured_substitute", "case_id": "secrets__candidate__r001", }, @@ -843,6 +842,57 @@ def test_analyze_benchmark_output_groups_replacement_strategies_separately(tmp_p } +def test_analyze_benchmark_output_surfaces_route_counts(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_route_counts", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + 
benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "model_workflow", + "run_id": "mixed__router__r000", + "workflow_name": "entity-detection-rules-covered-router", + "status": "completed", + "input_row_count": 2, + "output_row_count": 2, + "failed_record_count": 0, + "elapsed_sec": 0.01, + "observed_total_requests": 0, + "observed_successful_requests": 0, + "observed_failed_requests": 0, + "observed_input_tokens": 0, + "observed_output_tokens": 0, + "observed_total_tokens": 0, + "route_total_row_count": 2, + "route_rule_row_count": 1, + "route_fallback_row_count": 1, + "run_tags": { + "workload_id": "mixed", + "config_id": "router", + "experimental_detection_strategy": "rules_covered_or_default", + "experimental_replacement_strategy": "default", + "case_id": "mixed__router__r000", + }, + } + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + case = result.cases[0] + assert case.route_total_row_count == 2 + assert case.route_rule_row_count == 1 + assert case.route_fallback_row_count == 1 + group = result.groups[0] + assert group.median_route_total_row_count == 2 + assert group.median_route_rule_row_count == 1 + assert group.median_route_fallback_row_count == 1 + + def test_analyze_benchmark_output_surfaces_failed_cases(tmp_path: Path) -> None: tool = load_tool( "measurement_benchmark_output_analysis_failures", @@ -862,7 +912,7 @@ def test_analyze_benchmark_output_surfaces_failed_cases(tmp_path: Path) -> None: "run_tags": { "workload_id": "shell", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_guardrail_detector_only", "repetition": 0, "case_id": "shell__candidate__r000", }, @@ -876,7 +926,7 @@ def test_analyze_benchmark_output_surfaces_failed_cases(tmp_path: Path) -> None: "run_tags": { "workload_id": "shell", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_guardrail_detector_only", 
"repetition": 1, "case_id": "shell__candidate__r001", }, @@ -890,7 +940,7 @@ def test_analyze_benchmark_output_surfaces_failed_cases(tmp_path: Path) -> None: "run_tags": { "workload_id": "shell", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_guardrail_detector_only", "repetition": 1, "case_id": "shell__candidate__r001", }, diff --git a/tests/tools/test_compare_strategy_pairs.py b/tests/tools/test_compare_strategy_pairs.py index ad0ab436..be9f455a 100644 --- a/tests/tools/test_compare_strategy_pairs.py +++ b/tests/tools/test_compare_strategy_pairs.py @@ -46,8 +46,8 @@ def test_compare_case_analysis_by_strategy_reports_safety_and_cost_deltas() -> N }, { "workload_id": "shell-1", - "config_id": "shell-native", - "experimental_detection_strategy": "native_candidate_validate_no_augment", + "config_id": "shell-filter", + "experimental_detection_strategy": "rules_filter_guardrail_no_augment", "experimental_replacement_strategy": "local_structured_substitute", "case_id": "shell__candidate", "pipeline_elapsed_sec": 0.8, @@ -58,7 +58,7 @@ def test_compare_case_analysis_by_strategy_reports_safety_and_cost_deltas() -> N "seed_validation_candidate_count": 0, "augmented_entity_count": 0, "augmented_new_final_value_count": 0, - "artifact_final_augmenter_entity_count": 8, + "artifact_final_rule_entity_count": 8, }, { "workload_id": "legal-1", @@ -76,8 +76,8 @@ def test_compare_case_analysis_by_strategy_reports_safety_and_cost_deltas() -> N }, { "workload_id": "legal-1", - "config_id": "legal-native", - "experimental_detection_strategy": "native_candidate_validate_no_augment", + "config_id": "legal-filter", + "experimental_detection_strategy": "rules_filter_guardrail_no_augment", "experimental_replacement_strategy": "local_structured_substitute", "case_id": "legal__candidate", "pipeline_elapsed_sec": 20.9, @@ -94,7 +94,7 @@ def test_compare_case_analysis_by_strategy_reports_safety_and_cost_deltas() -> N 
rows = tool.compare_case_analysis( table, baseline_strategy="no_augment", - candidate_strategy="native_candidate_validate_no_augment", + candidate_strategy="rules_filter_guardrail_no_augment", ) by_workload = {row.workload_id: row for row in rows} @@ -111,11 +111,11 @@ def test_compare_case_analysis_by_strategy_reports_safety_and_cost_deltas() -> N assert shell.baseline_augmented_new_final_value_count == 1 assert shell.candidate_augmented_new_final_value_count == 0 assert shell.augmented_new_final_value_count_delta == -1 - assert shell.candidate_augmenter_entity_count == 8 + assert shell.candidate_rule_entity_count == 8 assert shell.safety_verdict == "review" assert shell.performance_verdict == "improved" assert shell.candidate_verdict == "review" - assert shell.flags == ["no_candidate_detector_entities"] + assert shell.flags == ["no_candidate_detector_entities", "candidate_uses_rule_entities"] legal = by_workload["legal-1"] assert legal.baseline_replacement_strategy == "default" @@ -151,14 +151,14 @@ def test_compare_case_analysis_rejects_ambiguous_strategy_selector() -> None: { "workload_id": "shell-1", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_only", "case_id": "c", }, ] ) with pytest.raises(ValueError, match="baseline selector matched multiple configs"): - tool.compare_case_analysis(table, baseline_strategy="no_augment", candidate_strategy="detector_only") + tool.compare_case_analysis(table, baseline_strategy="no_augment", candidate_strategy="rules_only") def test_compare_case_analysis_rejects_candidate_synthetic_original_collisions() -> None: @@ -279,8 +279,8 @@ def test_compare_case_tables_allows_candidate_from_separate_run() -> None: [ { "workload_id": "legal-5", - "config_id": "legal-native-validate", - "experimental_detection_strategy": "detector_native_validate_no_augment", + "config_id": "legal-rules-guardrail", + "experimental_detection_strategy": 
"rules_guardrail_no_augment", "case_id": "legal__candidate", "observed_total_tokens": 55805, "final_entity_count": 193, @@ -293,13 +293,13 @@ def test_compare_case_tables_allows_candidate_from_separate_run() -> None: baseline, candidate, baseline_strategy="no_augment", - candidate_strategy="detector_native_validate_no_augment", + candidate_strategy="rules_guardrail_no_augment", ) assert len(rows) == 1 assert rows[0].workload_id == "legal-5" assert rows[0].baseline_config_id == "legal-no-augment" - assert rows[0].candidate_config_id == "legal-native-validate" + assert rows[0].candidate_config_id == "legal-rules-guardrail" assert rows[0].observed_total_tokens_delta == 15 assert rows[0].flags == ["token_increase"] @@ -348,7 +348,7 @@ def test_compare_case_analysis_preserves_augmentation_contribution_deltas() -> N assert row.candidate_augmenter_entity_count == 0 -def test_compare_case_analysis_review_gates_detector_only_candidate_shell_case() -> None: +def test_compare_case_analysis_review_gates_detector_only_candidates() -> None: tool = load_tool( "measurement_compare_strategy_pairs_detector_only", REPO_ROOT / "tools/measurement/compare_strategy_pairs.py", @@ -392,9 +392,9 @@ def test_compare_case_analysis_review_gates_detector_only_candidate_shell_case() assert row.flags == ["candidate_skips_llm_validation"] -def test_compare_case_analysis_review_gates_detector_only_candidates() -> None: +def test_compare_case_analysis_review_gates_rule_detector_only_candidates() -> None: tool = load_tool( - "measurement_compare_strategy_pairs_detector_only", + "measurement_compare_strategy_pairs_rule_detector_only", REPO_ROOT / "tools/measurement/compare_strategy_pairs.py", ) table = pd.DataFrame( @@ -413,14 +413,15 @@ def test_compare_case_analysis_review_gates_detector_only_candidates() -> None: }, { "workload_id": "shell-1", - "config_id": "detector-only", - "experimental_detection_strategy": "detector_only", + "config_id": "rule-detector-only", + 
"experimental_detection_strategy": "rules_guardrail_detector_only", "case_id": "shell__candidate", "pipeline_elapsed_sec": 1, "observed_total_requests": 1, "observed_total_tokens": 200, "final_entity_count": 2, - "artifact_final_detector_entity_count": 2, + "artifact_final_detector_entity_count": 1, + "artifact_final_rule_entity_count": 1, "artifact_final_entity_signature_hashes": ["a", "b"], }, ] @@ -429,7 +430,7 @@ def test_compare_case_analysis_review_gates_detector_only_candidates() -> None: rows = tool.compare_case_analysis( table, baseline_strategy="default", - candidate_strategy="detector_only", + candidate_strategy="rules_guardrail_detector_only", ) assert len(rows) == 1 @@ -437,19 +438,19 @@ def test_compare_case_analysis_review_gates_detector_only_candidates() -> None: assert row.safety_verdict == "review" assert row.performance_verdict == "improved" assert row.candidate_verdict == "review" - assert row.flags == ["candidate_skips_llm_validation"] + assert row.flags == ["candidate_uses_rule_entities", "candidate_skips_llm_validation"] -def test_compare_case_analysis_review_gates_non_detector_sources_when_signatures_match() -> None: +def test_compare_case_analysis_review_gates_rules_covered_or_default_when_signatures_match() -> None: tool = load_tool( - "measurement_compare_strategy_pairs_non_detector_sources", + "measurement_compare_strategy_pairs_rules_covered_or_default", REPO_ROOT / "tools/measurement/compare_strategy_pairs.py", ) table = pd.DataFrame( [ { "workload_id": "shell-1", - "config_id": "native-source-default", + "config_id": "rule-labels-default", "experimental_detection_strategy": "default", "case_id": "shell__default", "pipeline_elapsed_sec": 21.4, @@ -462,14 +463,14 @@ def test_compare_case_analysis_review_gates_non_detector_sources_when_signatures }, { "workload_id": "shell-1", - "config_id": "native-source-candidate", - "experimental_detection_strategy": "native_single_pass", + "config_id": "rule-labels-covered-or-default", + 
"experimental_detection_strategy": "rules_covered_or_default", "case_id": "shell__candidate", "pipeline_elapsed_sec": 0.001, "observed_total_requests": 0, "observed_total_tokens": 0, "final_entity_count": 2, - "artifact_final_augmenter_entity_count": 2, + "artifact_final_rule_entity_count": 2, "artifact_final_entity_signature_hashes": ["a", "b"], "artifact_final_entity_signature_labels": {"a": "api_key", "b": "password"}, }, @@ -478,8 +479,8 @@ def test_compare_case_analysis_review_gates_non_detector_sources_when_signatures rows = tool.compare_case_analysis( table, - baseline_config="native-source-default", - candidate_config="native-source-candidate", + baseline_config="rule-labels-default", + candidate_config="rule-labels-covered-or-default", ) assert len(rows) == 1 @@ -492,7 +493,7 @@ def test_compare_case_analysis_review_gates_non_detector_sources_when_signatures assert row.safety_verdict == "review" assert row.performance_verdict == "improved" assert row.candidate_verdict == "review" - assert row.flags == ["no_candidate_detector_entities"] + assert row.flags == ["no_candidate_detector_entities", "candidate_uses_rule_entities"] def test_compare_case_analysis_flags_signature_loss_even_when_counts_match() -> None: @@ -586,20 +587,20 @@ def test_compare_case_analysis_treats_baseline_subspan_as_candidate_covered() -> }, { "workload_id": "structured-identifiers", - "config_id": "native-local", - "experimental_detection_strategy": "native_single_pass", + "config_id": "rules-local", + "experimental_detection_strategy": "rules_covered_or_default", "case_id": "candidate-r0", "pipeline_elapsed_sec": 0.01, "observed_total_requests": 0, "observed_total_tokens": 0, "final_entity_count": 2, - "artifact_final_augmenter_entity_count": 2, + "artifact_final_rule_entity_count": 2, "artifact_final_entity_signature_hashes": ["cookie", "pin"], "artifact_final_entity_signature_labels": {"cookie": "http_cookie", "pin": "pin"}, "artifact_final_entity_signature_details": { "cookie": { 
"label": "http_cookie", - "source": "native", + "source": "rule", "row_index": 0, "start_position": 30, "end_position": 80, @@ -608,7 +609,7 @@ def test_compare_case_analysis_treats_baseline_subspan_as_candidate_covered() -> }, "pin": { "label": "pin", - "source": "native", + "source": "rule", "row_index": 0, "start_position": 90, "end_position": 95, @@ -620,7 +621,7 @@ def test_compare_case_analysis_treats_baseline_subspan_as_candidate_covered() -> ] ) - rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="native-local") + rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="rules-local") assert len(rows) == 1 row = rows[0] @@ -743,20 +744,20 @@ def test_compare_case_analysis_treats_high_overlap_candidate_span_as_covered() - }, { "workload_id": "structured-identifiers", - "config_id": "native-local", - "experimental_detection_strategy": "native_single_pass", + "config_id": "rules-local", + "experimental_detection_strategy": "rules_covered_or_default", "case_id": "candidate-r0", "pipeline_elapsed_sec": 0.01, "observed_total_requests": 0, "observed_total_tokens": 0, "final_entity_count": 1, - "artifact_final_augmenter_entity_count": 1, + "artifact_final_rule_entity_count": 1, "artifact_final_entity_signature_hashes": ["token-value"], "artifact_final_entity_signature_labels": {"token-value": "api_key"}, "artifact_final_entity_signature_details": { "token-value": { "label": "api_key", - "source": "native", + "source": "rule", "row_index": 0, "start_position": 26, "end_position": 69, @@ -768,7 +769,7 @@ def test_compare_case_analysis_treats_high_overlap_candidate_span_as_covered() - ] ) - rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="native-local") + rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="rules-local") assert len(rows) == 1 row = rows[0] @@ -819,20 +820,20 @@ def 
test_compare_case_analysis_treats_small_assignment_prefix_gap_as_boundary_ov }, { "workload_id": "structured-identifiers", - "config_id": "native-local", - "experimental_detection_strategy": "native_single_pass", + "config_id": "rules-local", + "experimental_detection_strategy": "rules_covered_or_default", "case_id": "candidate-r0", "pipeline_elapsed_sec": 0.01, "observed_total_requests": 0, "observed_total_tokens": 0, "final_entity_count": 1, - "artifact_final_augmenter_entity_count": 1, + "artifact_final_rule_entity_count": 1, "artifact_final_entity_signature_hashes": ["login-value"], "artifact_final_entity_signature_labels": {"login-value": "user_name"}, "artifact_final_entity_signature_details": { "login-value": { "label": "user_name", - "source": "native", + "source": "rule", "row_index": 0, "start_position": 26, "end_position": 38, @@ -844,7 +845,7 @@ def test_compare_case_analysis_treats_small_assignment_prefix_gap_as_boundary_ov ] ) - rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="native-local") + rows = tool.compare_case_analysis(table, baseline_config="default", candidate_config="rules-local") assert len(rows) == 1 row = rows[0] @@ -956,7 +957,7 @@ def test_compare_case_analysis_rejects_candidate_original_value_leaks() -> None: { "workload_id": "structured-secrets", "config_id": "candidate", - "experimental_detection_strategy": "native_single_pass", + "experimental_detection_strategy": "rules_covered_or_default", "case_id": "structured__candidate", "pipeline_elapsed_sec": 1, "observed_total_tokens": 0, @@ -1154,7 +1155,7 @@ def test_compare_case_analysis_rejects_candidate_case_failures() -> None: { "workload_id": "shell-5", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_guardrail_detector_only", "case_id": "candidate-r0", "pipeline_elapsed_sec": 2, "case_failed": False, @@ -1163,7 +1164,7 @@ def 
test_compare_case_analysis_rejects_candidate_case_failures() -> None: { "workload_id": "shell-5", "config_id": "candidate", - "experimental_detection_strategy": "detector_only", + "experimental_detection_strategy": "rules_guardrail_detector_only", "case_id": "candidate-r1", "pipeline_elapsed_sec": 0.2, "case_failed": True, @@ -1385,7 +1386,7 @@ def test_compare_strategy_pairs_writes_csv(tmp_path: Path) -> None: baseline_final_entity_count=4, candidate_final_entity_count=8, final_entity_count_delta=4, - flags=["candidate_skips_llm_validation"], + flags=["candidate_uses_rule_entities"], ) ] @@ -1396,7 +1397,7 @@ def test_compare_strategy_pairs_writes_csv(tmp_path: Path) -> None: assert exported["workload_id"].tolist() == ["shell-1"] assert exported["candidate_replacement_strategy"].tolist() == ["local_structured_substitute"] assert exported["final_entity_count_delta"].tolist() == [4] - assert exported["flags"].tolist() == ['["candidate_skips_llm_validation"]'] + assert exported["flags"].tolist() == ['["candidate_uses_rule_entities"]'] def test_compare_strategy_pairs_summarizes_candidate_verdicts() -> None: @@ -1432,7 +1433,7 @@ def test_compare_strategy_pairs_summarizes_candidate_verdicts() -> None: tool.ComparisonRow( workload_id="shell-1", baseline_config_id="shell-default", - candidate_config_id="shell-detector-only", + candidate_config_id="shell-rules", baseline_case_count=1, candidate_case_count=1, value_protection_verdict="review", diff --git a/tests/tools/test_detection_strategies.py b/tests/tools/test_detection_strategies.py index 5ef9affd..984bb025 100644 --- a/tests/tools/test_detection_strategies.py +++ b/tests/tools/test_detection_strategies.py @@ -13,15 +13,24 @@ from unittest.mock import Mock import pandas as pd +import pytest from anonymizer.engine.constants import ( + COL_AUGMENTED_ENTITIES, COL_DETECTED_ENTITIES, + COL_FINAL_ENTITIES, + COL_INITIAL_TAGGED_TEXT, + COL_MERGED_ENTITIES, COL_RAW_DETECTED, COL_SEED_ENTITIES, COL_SEED_ENTITIES_JSON, + 
COL_SEED_VALIDATION_CANDIDATES, COL_TAG_NOTATION, COL_TAGGED_TEXT, COL_TEXT, + COL_VALIDATED_ENTITIES, + COL_VALIDATED_SEED_ENTITIES, + COL_VALIDATION_DECISIONS, ) from anonymizer.engine.detection.detection_workflow import EntityDetectionWorkflow from anonymizer.engine.ndd.model_loader import load_default_model_selection @@ -42,6 +51,280 @@ def load_tool(module_name: str, path: Path) -> ModuleType: return module +def test_rules_only_strategy_detects_rule_spans_and_restores_workflow_method() -> None: + tool = load_tool("measurement_detection_strategies", REPO_ROOT / "tools/measurement/detection_strategies.py") + original = EntityDetectionWorkflow.detect_and_validate_entities + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_only): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA\nPassword: fakePass123!"]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["api_key", "password"], + ) + + assert EntityDetectionWorkflow.detect_and_validate_entities is original + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"), + ("password", "fakePass123!"), + ] + + +def test_rules_only_strategy_rejects_unsupported_labels_at_runtime() -> None: + tool = load_tool( + "measurement_detection_strategies_runtime_guard", REPO_ROOT / "tools/measurement/detection_strategies.py" + ) + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_only): + workflow = EntityDetectionWorkflow(adapter=Mock()) + with pytest.raises(ValueError, match="unsupported high-confidence rule labels.*person"): + 
workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice has token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["api_key", "person"], + ) + + +def test_rules_covered_or_default_short_circuits_structured_fast_lane_labels() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_covered_short_circuit", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_covered_or_default): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA\nPassword: fakePass123!"]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["api_key", "password"], + ) + + adapter.run_workflow.assert_not_called() + assert "_anonymizer_row_order" not in result.dataframe.columns + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"), + ("password", "fakePass123!"), + ] + + +def test_rules_covered_or_default_falls_back_for_uncovered_structured_assignments() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_covered_row_fallback", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + calls: list[pd.DataFrame] = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + calls.append(dataframe.copy()) + output = dataframe.copy() + 
output[COL_DETECTED_ENTITIES] = [ + EntitiesSchema(entities=[]).model_dump(mode="json") for _ in range(len(output)) + ] + output[COL_TAGGED_TEXT] = output[COL_TEXT] + return SimpleNamespace(dataframe=output, failed_records=[]) + + rows = [ + "token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA", + '{"password": "SecretNoRule123!"}', + ] + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_covered_or_default): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: rows}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["api_key", "password"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert len(calls) == 1 + assert calls[0][COL_TEXT].tolist() == ['{"password": "SecretNoRule123!"}'] + assert "_anonymizer_row_order" not in result.dataframe.columns + assert result.dataframe[COL_TEXT].tolist() == rows + first_entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + second_entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[1]).entities + assert [(entity.label, entity.value) for entity in first_entities] == [ + ("api_key", "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA") + ] + assert second_entities == [] + + +def test_rules_covered_or_default_records_route_counts() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_covered_route_counts", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + output = 
dataframe.copy() + output[COL_DETECTED_ENTITIES] = [ + EntitiesSchema(entities=[]).model_dump(mode="json") for _ in range(len(output)) + ] + output[COL_TAGGED_TEXT] = output[COL_TEXT] + return SimpleNamespace(dataframe=output, failed_records=[]) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + collector = MeasurementCollector(record_hash_key="test-key") + try: + with measurement_session(collector): + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.rules_covered_or_default + ): + workflow = EntityDetectionWorkflow(adapter=Mock()) + workflow.detect_and_validate_entities( + pd.DataFrame( + { + COL_TEXT: [ + "token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA", + '{"password": "SecretNoRule123!"}', + ] + } + ), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["api_key", "password"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + records = [record for record in collector.records if record["record_type"] == "model_workflow"] + assert len(records) == 1 + record = records[0] + assert record["workflow_name"] == "entity-detection-rules-covered-router" + assert record["route_total_row_count"] == 2 + assert record["route_rule_row_count"] == 1 + assert record["route_fallback_row_count"] == 1 + assert record["observed_total_requests"] == 0 + assert record["observed_total_tokens"] == 0 + + +def test_native_rules_router_strategy_runs_staged_detection_without_data_designer() -> None: + tool = load_tool( + "measurement_detection_strategies_native_rules_router", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + + class SequencedClient: + def __init__(self) -> None: + self.prompts: list[str] = [] + self.outputs = [ + '{"entities": [{"value": "Alice", "label": "first_name", "reason": "name"}]}', + 
'{"decisions": [{"id": "first_name_0_5", "decision": "keep", "reason": "real name"}]}', + '{"entities": [{"value": "NVIDIA", "label": "organization_name", "reason": "employer"}]}', + ] + + def complete(self, request): # type: ignore[no-untyped-def] + self.prompts.append(request.prompt) + return SimpleNamespace(content=self.outputs.pop(0), elapsed_sec=0.1, usage={}) + + adapter = Mock() + client = SequencedClient() + + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.native_rules_router, + native_client=client, + ): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice works at NVIDIA."]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["first_name", "organization_name"], + ) + + adapter.run_workflow.assert_not_called() + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [ + ("first_name", "Alice", "direct_seed"), + ("organization_name", "NVIDIA", "augmenter"), + ] + assert len(client.prompts) == 3 + + +def test_native_rules_router_strategy_records_direct_model_usage() -> None: + tool = load_tool( + "measurement_detection_strategies_native_rules_router_usage", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + + class SequencedClient: + def __init__(self) -> None: + self.outputs = [ + '{"entities": [{"value": "Alice", "label": "first_name", "reason": "name"}]}', + '{"decisions": [{"id": "first_name_0_5", "decision": "keep", "reason": "real name"}]}', + '{"entities": [{"value": "NVIDIA", "label": "organization_name", "reason": "employer"}]}', + ] + + def complete(self, _request): # type: ignore[no-untyped-def] + return SimpleNamespace( + content=self.outputs.pop(0), + elapsed_sec=0.1, + 
usage={"prompt_tokens": 10, "completion_tokens": 4, "total_tokens": 14}, + ) + + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.native_rules_router, + native_client=SequencedClient(), + native_runtime=tool.NativeDetectionRuntime(model="test/native", provider="test-provider"), + ): + workflow = EntityDetectionWorkflow(adapter=Mock()) + workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice works at NVIDIA."]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["first_name", "organization_name"], + ) + + records = [record for record in collector.records if record["record_type"] == "model_workflow"] + assert len(records) == 1 + record = records[0] + assert record["workflow_name"] == "entity-detection-native-rules-router" + assert record["observed_total_requests"] == 3 + assert record["observed_successful_requests"] == 3 + assert record["observed_failed_requests"] == 0 + assert record["observed_input_tokens"] == 30 + assert record["observed_output_tokens"] == 12 + assert record["observed_total_tokens"] == 42 + assert record["model_usage"]["native-direct"]["model_name"] == "test/native" + assert record["model_usage"]["native-direct"]["model_provider_name"] == "test-provider" + + def test_native_candidate_validate_no_augment_strategy_skips_data_designer_and_augmentation() -> None: tool = load_tool( "measurement_detection_strategies_native_candidate_validate", @@ -898,9 +1181,9 @@ def complete(self, _request): # type: ignore[no-untyped-def] assert record["observed_total_tokens"] == 27 -def test_native_single_pass_strategy_uses_only_native_spans() -> None: +def test_native_single_pass_strategy_adds_non_overlapping_rule_spans() -> None: tool = load_tool( - 
"measurement_detection_strategies_native_single_pass_native_spans", + "measurement_detection_strategies_native_single_pass_rule_guardrail", REPO_ROOT / "tools/measurement/detection_strategies.py", ) text = "Alice logged in.\nPassword: SuperSecret123!\n" @@ -929,6 +1212,7 @@ def complete(self, _request): # type: ignore[no-untyped-def] entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities assert [(entity.label, entity.value, entity.source) for entity in entities] == [ ("person", "Alice", "direct_single_pass"), + ("password", "SuperSecret123!", "rule"), ] @@ -975,66 +1259,9 @@ def complete(self, _request): # type: ignore[no-untyped-def] assert record["observed_failed_requests"] == 0 -def test_detector_only_strategy_finalizes_gliner_spans_without_validation_or_augmentation() -> None: - tool = load_tool( - "measurement_detection_strategies_detector_only", - REPO_ROOT / "tools/measurement/detection_strategies.py", - ) - adapter = Mock() - text = "Alice met Alice at the lab." 
- start = text.index("Alice") - - def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **kwargs: object) -> SimpleNamespace: - assert [column.name for column in columns] == [ - COL_RAW_DETECTED, - COL_SEED_ENTITIES, - COL_SEED_ENTITIES_JSON, - COL_DETECTED_ENTITIES, - ] - assert kwargs["workflow_name"] == "entity-detection-detector-only" - row = { - COL_TEXT: dataframe[COL_TEXT].iloc[0], - COL_RAW_DETECTED: json.dumps( - { - "entities": [ - { - "text": "Alice", - "label": "person", - "start": start, - "end": start + len("Alice"), - "score": 0.99, - } - ] - } - ), - } - row = columns[1].generator_function(row) - row = columns[2].generator_function(row) - seed_entities = json.loads(row[COL_SEED_ENTITIES_JSON]) - assert [(entity["label"], entity["value"]) for entity in seed_entities] == [("person", "Alice")] - row = columns[3].generator_function(row) - return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) - - adapter.run_workflow.side_effect = fake_run_workflow - - with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.detector_only): - workflow = EntityDetectionWorkflow(adapter=adapter) - result = workflow.detect_and_validate_entities( - pd.DataFrame({COL_TEXT: [text]}), - model_configs=[], - selected_models=load_default_model_selection().detection, - gliner_detection_threshold=0.3, - entity_labels=["person"], - ) - - entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities - assert [(entity.label, entity.value) for entity in entities] == [("person", "Alice"), ("person", "Alice")] - assert "Alice" in result.dataframe[COL_TAGGED_TEXT].iloc[0] - - -def test_compact_validation_strategy_disables_full_text_single_chunk_validation() -> None: +def test_rules_covered_or_default_uses_default_pipeline_for_contextual_labels() -> None: tool = load_tool( - "measurement_detection_strategies_compact_validation", + "measurement_detection_strategies_rules_covered_default_fallback", 
REPO_ROOT / "tools/measurement/detection_strategies.py", ) original = EntityDetectionWorkflow.detect_and_validate_entities @@ -1060,34 +1287,867 @@ def fake_original( EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] try: - with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.compact_validation): + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_covered_or_default): workflow = EntityDetectionWorkflow(adapter=Mock()) - workflow.detect_and_validate_entities( - pd.DataFrame({COL_TEXT: ["Alice works at Acme."]}), + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice has token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}), model_configs=[], selected_models=load_default_model_selection().detection, gliner_detection_threshold=0.3, - entity_labels=["first_name"], + validation_single_chunk_full_text=False, + entity_labels=["api_key", "person"], ) finally: EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] - assert EntityDetectionWorkflow.detect_and_validate_entities is original + assert len(calls) == 1 + assert calls[0]["entity_labels"] == ["api_key", "person"] assert calls[0]["validation_single_chunk_full_text"] is False + assert EntityDetectionWorkflow.detect_and_validate_entities is original + assert EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities == [] -def test_prose_augment_focus_extends_and_restores_augment_prompt() -> None: +def test_rules_covered_or_default_uses_default_pipeline_for_narrow_prose_rule_labels() -> None: tool = load_tool( - "measurement_detection_strategies_prose_augment_focus", + "measurement_detection_strategies_rules_covered_prose_rule_fallback", REPO_ROOT / "tools/measurement/detection_strategies.py", ) - before = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) - - with 
tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.prose_augment_focus): - inside = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) + original = EntityDetectionWorkflow.detect_and_validate_entities + calls = [] - after = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) - assert "Contextual prose recall focus" not in before - assert "Contextual prose recall focus" in inside + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + calls.append(kwargs) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: EntitiesSchema(entities=[]).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_covered_or_default): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Jordan worked at Acme Research Center and lived on Maple Street."]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + validation_single_chunk_full_text=False, + entity_labels=["organization_name", "street_address"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert len(calls) == 1 + assert calls[0]["entity_labels"] == ["organization_name", "street_address"] + assert EntityDetectionWorkflow.detect_and_validate_entities is original + assert EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities == [] + + +def test_detector_only_strategy_finalizes_gliner_spans_without_validation_or_augmentation() -> None: + tool = 
load_tool( + "measurement_detection_strategies_detector_only", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + text = "Alice met Alice at the lab." + start = text.index("Alice") + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **kwargs: object) -> SimpleNamespace: + assert [column.name for column in columns] == [ + COL_RAW_DETECTED, + COL_SEED_ENTITIES, + COL_SEED_ENTITIES_JSON, + COL_DETECTED_ENTITIES, + ] + assert kwargs["workflow_name"] == "entity-detection-detector-only" + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": "Alice", + "label": "person", + "start": start, + "end": start + len("Alice"), + "score": 0.99, + } + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + seed_entities = json.loads(row[COL_SEED_ENTITIES_JSON]) + assert [(entity["label"], entity["value"]) for entity in seed_entities] == [("person", "Alice")] + row = columns[3].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.detector_only): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["person"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [("person", "Alice"), ("person", "Alice")] + assert "Alice" in result.dataframe[COL_TAGGED_TEXT].iloc[0] + + +def test_rules_guardrail_detector_only_adds_rule_spans_without_validation_or_augmentation() -> None: + tool = load_tool( + 
"measurement_detection_strategies_rules_guardrail_detector_only", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + token = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + text = f"Alice exported token={token}" + alice_start = text.index("Alice") + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **kwargs: object) -> SimpleNamespace: + assert [column.name for column in columns] == [ + COL_RAW_DETECTED, + COL_SEED_ENTITIES, + COL_SEED_ENTITIES_JSON, + COL_DETECTED_ENTITIES, + ] + assert kwargs["workflow_name"] == "entity-detection-rules-guardrail-detector-only" + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": "Alice", + "label": "person", + "start": alice_start, + "end": alice_start + len("Alice"), + "score": 0.99, + } + ] + } + ), + } + for column in columns[1:]: + row = column.generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_guardrail_detector_only): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["api_key", "person"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [ + ("person", "Alice", "detector"), + ("api_key", token, "rule"), + ] + + +def test_rules_guardrail_keeps_default_pipeline_and_adds_rule_spans() -> None: + tool = load_tool( + "measurement_detection_strategies_default_rules_guardrail", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = 
EntityDetectionWorkflow.detect_and_validate_entities + text = "The applicant was born in 1978 and later moved to Berlin." + calls = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **_: object, + ) -> object: + calls.append(self) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: EntitiesSchema(entities=[]).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_guardrail): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["date_of_birth"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert len(calls) == 1 + assert EntityDetectionWorkflow.detect_and_validate_entities is original + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [("date_of_birth", "1978", "rule")] + + +def test_compact_validation_strategy_disables_full_text_single_chunk_validation() -> None: + tool = load_tool( + "measurement_detection_strategies_compact_validation", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + calls = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + calls.append(kwargs) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: 
EntitiesSchema(entities=[]).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.compact_validation): + workflow = EntityDetectionWorkflow(adapter=Mock()) + workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["Alice works at Acme."]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["first_name"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert EntityDetectionWorkflow.detect_and_validate_entities is original + assert calls[0]["validation_single_chunk_full_text"] is False + + +def test_rules_guardrail_compact_validation_combines_rule_guardrail_and_compact_validation() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_guardrail_compact_validation", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + text = "The applicant was born in 1978 and later moved to Berlin." 
+ calls = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + calls.append(kwargs) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: EntitiesSchema(entities=[]).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.rules_guardrail_compact_validation + ): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["date_of_birth"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert calls[0]["validation_single_chunk_full_text"] is False + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [("date_of_birth", "1978", "rule")] + + +def test_rules_guardrail_prefers_rule_label_for_exact_span_overlap() -> None: + tool = load_tool( + "measurement_detection_strategies_default_rules_guardrail_exact_overlap", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + text = "Idilio describes himself as secular and leans progressive on most political issues." 
+ start = text.index("secular") + calls = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **_: object, + ) -> object: + calls.append(self) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: EntitiesSchema( + entities=[ + { + "id": "political_view_0", + "value": "secular", + "label": "political_view", + "start_position": start, + "end_position": start + len("secular"), + "score": 1.0, + "source": "detector", + } + ] + ).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_guardrail): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["political_view", "religious_belief"], + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert len(calls) == 1 + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [ + ("religious_belief", "secular", "rule") + ] + + +def test_rules_guardrail_can_apply_explicit_rule_labels_outside_model_labels() -> None: + tool = load_tool( + "measurement_detection_strategies_default_rules_guardrail_rule_labels", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + original = EntityDetectionWorkflow.detect_and_validate_entities + text = "Outside the lab, Idilio shares a modest house on West Roberts Drive with his wife." 
+ calls = [] + + def fake_original( + self: EntityDetectionWorkflow, + dataframe: pd.DataFrame, + **kwargs: object, + ) -> object: + calls.append(kwargs["entity_labels"]) + return SimpleNamespace( + dataframe=pd.DataFrame( + [ + { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_DETECTED_ENTITIES: EntitiesSchema(entities=[]).model_dump(mode="json"), + } + ] + ), + failed_records=[], + ) + + EntityDetectionWorkflow.detect_and_validate_entities = fake_original # type: ignore[method-assign] + try: + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.rules_guardrail, + rule_labels=["street_address"], + ): + workflow = EntityDetectionWorkflow(adapter=Mock()) + result = workflow.run( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["first_name"], + privacy_goal=None, + tag_latent_entities=False, + ) + finally: + EntityDetectionWorkflow.detect_and_validate_entities = original # type: ignore[method-assign] + + assert calls == [["first_name"]] + detected_entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + final_entities = EntitiesSchema.from_raw(result.dataframe[COL_FINAL_ENTITIES].iloc[0]).entities + expected = [("street_address", "West Roberts Drive", "rule")] + assert [(entity.label, entity.value, entity.source) for entity in detected_entities] == expected + assert [(entity.label, entity.value, entity.source) for entity in final_entities] == expected + + +def test_prose_augment_focus_extends_and_restores_augment_prompt() -> None: + tool = load_tool( + "measurement_detection_strategies_prose_augment_focus", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + before = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) + + with 
tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.prose_augment_focus): + inside = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) + + after = tool.dw._get_augment_prompt(data_summary=None, labels=["organization_name"], strict_labels=True) + assert "Contextual prose recall focus" not in before + assert "Contextual prose recall focus" in inside assert "organization and institution names" in inside assert after == before + + +def test_rules_guardrail_no_augment_adds_rule_spans_after_validation() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_guardrail", REPO_ROOT / "tools/measurement/detection_strategies.py" + ) + adapter = Mock() + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + assert [column.name for column in columns][-3:] == [ + COL_AUGMENTED_ENTITIES, + COL_MERGED_ENTITIES, + COL_DETECTED_ENTITIES, + ] + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_MERGED_ENTITIES: EntitiesSchema(entities=[]).model_dump(mode="json"), + COL_VALIDATED_ENTITIES: {"decisions": []}, + } + row = columns[-1].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_guardrail_no_augment): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA\nPassword: fakePass123!"]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["api_key", "password"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [ + ("api_key", 
"sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"), + ("password", "fakePass123!"), + ] + assert adapter.run_workflow.call_args.kwargs["workflow_name"] == "entity-detection-rules-guardrail-no-augment" + + +def test_rules_filter_guardrail_no_augment_filters_rule_spans_before_validation() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_filter_guardrail", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + token = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + text = f"Alice used token={token}" + alice_start = text.index("Alice") + token_start = text.index(token) + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": "Alice", + "label": "person", + "start": alice_start, + "end": alice_start + len("Alice"), + "score": 0.99, + }, + { + "text": token, + "label": "api_key", + "start": token_start, + "end": token_start + len(token), + "score": 0.99, + }, + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + seed_entities = EntitiesSchema.from_raw(row[COL_SEED_ENTITIES]).entities + assert [(entity.label, entity.value) for entity in seed_entities] == [("person", "Alice")] + assert [candidate["label"] for candidate in row[COL_SEED_VALIDATION_CANDIDATES]["candidates"]] == ["person"] + row[COL_MERGED_ENTITIES] = EntitiesSchema(entities=[]).model_dump(mode="json") + row[COL_VALIDATED_ENTITIES] = {"decisions": []} + row = columns[-1].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context( + tool.ExperimentalDetectionStrategy.rules_filter_guardrail_no_augment + ): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + 
pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["api_key", "person"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [("api_key", token)] + assert ( + adapter.run_workflow.call_args.kwargs["workflow_name"] == "entity-detection-rules-filter-guardrail-no-augment" + ) + + +def test_rules_filter_guardrail_keeps_augmentation_but_skips_rule_validation() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_filter_guardrail_with_augmentation", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + token = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + text = f"Alice used token={token}" + alice_start = text.index("Alice") + token_start = text.index(token) + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + assert [column.name for column in columns] == [ + COL_RAW_DETECTED, + COL_SEED_ENTITIES, + COL_SEED_VALIDATION_CANDIDATES, + COL_VALIDATION_DECISIONS, + COL_VALIDATED_ENTITIES, + COL_SEED_ENTITIES_JSON, + COL_AUGMENTED_ENTITIES, + COL_MERGED_ENTITIES, + COL_DETECTED_ENTITIES, + ] + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": "Alice", + "label": "person", + "start": alice_start, + "end": alice_start + len("Alice"), + "score": 0.99, + }, + { + "text": token, + "label": "api_key", + "start": token_start, + "end": token_start + len(token), + "score": 0.99, + }, + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + seed_entities = EntitiesSchema.from_raw(row[COL_SEED_ENTITIES]).entities + assert [(entity.label, entity.value) for entity in seed_entities] == [("person", "Alice")] + assert [candidate["label"] for candidate in 
row[COL_SEED_VALIDATION_CANDIDATES]["candidates"]] == ["person"] + + row[COL_VALIDATION_DECISIONS] = { + "decisions": [ + { + "id": "person_0_5", + "decision": "keep", + "proposed_label": "", + "reason": "test keep", + } + ] + } + row = columns[4].generator_function(row) + row = columns[5].generator_function(row) + validated_seed = EntitiesSchema.from_raw(row[COL_VALIDATED_SEED_ENTITIES]).entities + assert [(entity.label, entity.value, entity.source) for entity in validated_seed] == [ + ("person", "Alice", "detector"), + ("api_key", token, "rule"), + ] + assert f"{token}" in row[COL_INITIAL_TAGGED_TEXT] + + row[COL_AUGMENTED_ENTITIES] = {"entities": []} + row = columns[7].generator_function(row) + row = columns[8].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_filter_guardrail): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["api_key", "person"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value, entity.source) for entity in entities] == [ + ("person", "Alice", "detector"), + ("api_key", token, "rule"), + ] + assert adapter.run_workflow.call_args.kwargs["workflow_name"] == "entity-detection-rules-filter-guardrail" + + +def test_rules_filter_guardrail_preserves_different_label_rule_overlap() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_filter_guardrail_preserve_contextual_overlap", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + phrase = "Christian Democrat" + rule_value = "Christian" + text = f"He 
identifies as a {phrase}." + phrase_start = text.index(phrase) + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": phrase, + "label": "political_view", + "start": phrase_start, + "end": phrase_start + len(phrase), + "score": 0.99, + }, + { + "text": rule_value, + "label": "religious_belief", + "start": phrase_start, + "end": phrase_start + len(rule_value), + "score": 0.99, + }, + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + seed_entities = EntitiesSchema.from_raw(row[COL_SEED_ENTITIES]).entities + assert [(entity.label, entity.value) for entity in seed_entities] == [("political_view", phrase)] + assert row[COL_SEED_VALIDATION_CANDIDATES]["candidates"][0]["label"] == "political_view" + + row[COL_VALIDATION_DECISIONS] = { + "decisions": [ + { + "id": f"political_view_{phrase_start}_{phrase_start + len(phrase)}", + "decision": "keep", + "proposed_label": "", + "reason": "test keep", + } + ] + } + row = columns[4].generator_function(row) + row = columns[5].generator_function(row) + validated_seed = EntitiesSchema.from_raw(row[COL_VALIDATED_SEED_ENTITIES]).entities + assert any(entity.label == "political_view" and entity.value == phrase for entity in validated_seed) + assert f"{phrase}" in row[COL_INITIAL_TAGGED_TEXT] + + row[COL_AUGMENTED_ENTITIES] = {"entities": []} + row = columns[7].generator_function(row) + row = columns[8].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_filter_guardrail): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + 
model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["political_view", "religious_belief"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert any(entity.label == "political_view" and entity.value == phrase for entity in entities) + + +def test_rules_filter_guardrail_preserves_longer_same_label_detector_overlap() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_filter_guardrail_preserve_longer_same_label", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + rule_value = "Great Health" + detector_value = f"{rule_value} and Mountain Timber" + text = f"After apprenticeships with {detector_value}, Darwin started his own shop." + detector_start = text.index(detector_value) + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": detector_value, + "label": "organization_name", + "start": detector_start, + "end": detector_start + len(detector_value), + "score": 0.99, + } + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + seed_entities = EntitiesSchema.from_raw(row[COL_SEED_ENTITIES]).entities + assert [(entity.label, entity.value) for entity in seed_entities] == [("organization_name", detector_value)] + assert row[COL_SEED_VALIDATION_CANDIDATES]["candidates"][0]["value"] == detector_value + + row[COL_VALIDATION_DECISIONS] = { + "decisions": [ + { + "id": f"organization_name_{detector_start}_{detector_start + len(detector_value)}", + "decision": "keep", + "proposed_label": "", + "reason": "test keep", + } + ] + } + row = columns[4].generator_function(row) + row = columns[5].generator_function(row) + row[COL_AUGMENTED_ENTITIES] = {"entities": []} + row = 
columns[7].generator_function(row) + row = columns[8].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_filter_guardrail): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["organization_name"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [("organization_name", detector_value)] + + +def test_rules_filter_guardrail_does_not_shadow_validated_different_label_exact_span() -> None: + tool = load_tool( + "measurement_detection_strategies_rules_filter_guardrail_no_shadow_exact_span", + REPO_ROOT / "tools/measurement/detection_strategies.py", + ) + adapter = Mock() + value = "Bowdoin College" + text = f"He completed his MLS at {value}, and his early career followed." 
+ start = text.index(value) + + def fake_run_workflow(dataframe: pd.DataFrame, *, columns: list, **_: object) -> SimpleNamespace: + row = { + COL_TEXT: dataframe[COL_TEXT].iloc[0], + COL_RAW_DETECTED: json.dumps( + { + "entities": [ + { + "text": value, + "label": "university", + "start": start, + "end": start + len(value), + "score": 0.99, + } + ] + } + ), + } + row = columns[1].generator_function(row) + row = columns[2].generator_function(row) + assert row[COL_SEED_VALIDATION_CANDIDATES]["candidates"][0]["label"] == "university" + + row[COL_VALIDATION_DECISIONS] = { + "decisions": [ + { + "id": f"university_{start}_{start + len(value)}", + "decision": "keep", + "proposed_label": "", + "reason": "test keep", + } + ] + } + row = columns[4].generator_function(row) + row = columns[5].generator_function(row) + validated_seed = EntitiesSchema.from_raw(row[COL_VALIDATED_SEED_ENTITIES]).entities + assert [(entity.label, entity.value) for entity in validated_seed] == [("university", value)] + + row[COL_AUGMENTED_ENTITIES] = {"entities": []} + row = columns[7].generator_function(row) + row = columns[8].generator_function(row) + return SimpleNamespace(dataframe=pd.DataFrame([row]), failed_records=[]) + + adapter.run_workflow.side_effect = fake_run_workflow + + with tool.experimental_detection_strategy_context(tool.ExperimentalDetectionStrategy.rules_filter_guardrail): + workflow = EntityDetectionWorkflow(adapter=adapter) + result = workflow.detect_and_validate_entities( + pd.DataFrame({COL_TEXT: [text]}), + model_configs=[], + selected_models=load_default_model_selection().detection, + gliner_detection_threshold=0.3, + entity_labels=["organization_name", "university"], + ) + + entities = EntitiesSchema.from_raw(result.dataframe[COL_DETECTED_ENTITIES].iloc[0]).entities + assert [(entity.label, entity.value) for entity in entities] == [("university", value)] diff --git a/tests/tools/test_extract_signature_deltas.py b/tests/tools/test_extract_signature_deltas.py index 
922d73eb..5e442932 100644 --- a/tests/tools/test_extract_signature_deltas.py +++ b/tests/tools/test_extract_signature_deltas.py @@ -113,21 +113,21 @@ def test_extract_signature_deltas_masks_candidate_only_context(tmp_path: Path) - assert "[ORGANIZATION_NAME:" in row.masked_context -def test_extract_signature_deltas_recovers_artifact_detail_context(tmp_path: Path) -> None: +def test_extract_signature_deltas_recovers_guardrail_rule_context(tmp_path: Path) -> None: analyzer = load_tool( - "measurement_detection_artifact_context_builder", REPO_ROOT / "tools/measurement/analyze_detection_artifacts.py" + "measurement_detection_artifact_rule_builder", REPO_ROOT / "tools/measurement/analyze_detection_artifacts.py" ) tool = load_tool( - "measurement_extract_signature_deltas_context", REPO_ROOT / "tools/measurement/extract_signature_deltas.py" + "measurement_extract_signature_deltas_rule", REPO_ROOT / "tools/measurement/extract_signature_deltas.py" ) baseline_root = tmp_path / "baseline" candidate_root = tmp_path / "candidate" baseline = _write_artifact_case(baseline_root, analyzer, [], "The applicant was born in 1990.") candidate = _write_artifact_case(candidate_root, analyzer, [], "The applicant was born in 1990.") - detail_entity = analyzer.EntitySchema( + rule_entity = analyzer.EntitySchema( value="1990", label="date_of_birth", start_position=26, end_position=30, source="rule" ) - detail_row = analyzer.build_detection_artifact_row_from_entities( + rule_row = analyzer.build_detection_artifact_row_from_entities( workflow_name="entity-detection", batch_file="entity-detection/parquet-files/batch_00000.parquet", row_index=0, @@ -135,9 +135,9 @@ def test_extract_signature_deltas_recovers_artifact_detail_context(tmp_path: Pat seed_validation_candidate_count=0, merged_validation_candidate_count=0, augmented_entities=[], - final_entities=[detail_entity], + final_entities=[rule_entity], ).model_dump() - pd.json_normalize([{**_case_metadata(), **detail_row}], 
sep=".").to_json(candidate, orient="records", lines=True) + pd.json_normalize([{**_case_metadata(), **rule_row}], sep=".").to_json(candidate, orient="records", lines=True) result = tool.extract_signature_deltas( baseline, @@ -150,7 +150,7 @@ def test_extract_signature_deltas_recovers_artifact_detail_context(tmp_path: Pat row = result.rows[0] assert row.label == "date_of_birth" assert row.source == "rule" - assert row.resolution == "artifact_details" + assert row.resolution == "rule" assert "1990" not in row.masked_context assert "[DATE_OF_BIRTH:" in row.masked_context diff --git a/tests/tools/test_measurement_tools.py b/tests/tools/test_measurement_tools.py index 99642535..5d1f5a79 100644 --- a/tests/tools/test_measurement_tools.py +++ b/tests/tools/test_measurement_tools.py @@ -98,6 +98,152 @@ def test_benchmark_exports_detection_artifact_analysis(tmp_path: Path) -> None: assert "Alice" not in output_path.read_text(encoding="utf-8") +def test_benchmark_exports_rules_only_synthetic_detection_artifacts(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_only_synthetic_artifacts", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + secret = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + pd.DataFrame({"text": [f"export API_KEY={secret}"]}).to_csv(input_path, index=False) + config = tool.ConfigSpec( + id="rules-only-redact", + replace="redact", + detect={"entity_labels": ["api_key", "email", "password", "url"]}, + experimental_detection_strategy="rules_only", + ) + case = tool.BenchmarkCase( + suite_id="rules-suite", + workload_id="input", + config_id="rules-only-redact", + repetition=0, + case_id="input__rules-only-redact__r000", + ) + output_path = tmp_path / "raw" / "input__rules-only-redact__r000.detection-artifacts.jsonl" + + result = tool.export_rules_only_case_detection_artifacts( + config, + tool.AnonymizerInput(source=str(input_path), text_column="text"), + output_path, + case=case, + ) + + assert 
result == output_path + text = output_path.read_text(encoding="utf-8") + assert secret not in text + row = json.loads(text) + assert row["workflow_name"] == "entity-detection-rules-only" + assert row["final_entity_count"] == 1 + assert row["final_entity_signature_count"] == 1 + assert row["final_label_counts.api_key"] == 1 + assert row["final_source_counts.rule"] == 1 + assert any(key.startswith("final_entity_signature_labels.") for key in row) + + +def test_benchmark_exports_rules_covered_or_default_synthetic_artifacts_for_structured_fast_lane_labels( + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_covered_synthetic_artifacts", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + secret = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + pd.DataFrame({"text": [f"export API_KEY={secret}"]}).to_csv(input_path, index=False) + config = tool.ConfigSpec( + id="rules-covered-redact", + replace="redact", + detect={"entity_labels": ["api_key", "email", "password", "url"]}, + experimental_detection_strategy="rules_covered_or_default", + ) + case = tool.BenchmarkCase( + suite_id="rules-suite", + workload_id="input", + config_id="rules-covered-redact", + repetition=0, + case_id="input__rules-covered-redact__r000", + ) + output_path = tmp_path / "raw" / "input__rules-covered-redact__r000.detection-artifacts.jsonl" + + result = tool.export_rules_only_case_detection_artifacts( + config, + tool.AnonymizerInput(source=str(input_path), text_column="text"), + output_path, + case=case, + ) + + assert result == output_path + row = json.loads(output_path.read_text(encoding="utf-8")) + assert row["workflow_name"] == "entity-detection-rules-only" + assert row["final_entity_count"] == 1 + assert row["final_label_counts.api_key"] == 1 + + +def test_benchmark_does_not_export_rules_covered_or_default_artifacts_for_contextual_labels(tmp_path: Path) -> None: + tool = load_tool( + 
"measurement_benchmark_tool_rules_covered_contextual_artifacts", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice has token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + config = tool.ConfigSpec( + id="rules-covered-redact", + replace="redact", + detect={"entity_labels": ["api_key", "person"]}, + experimental_detection_strategy="rules_covered_or_default", + ) + case = tool.BenchmarkCase( + suite_id="rules-suite", + workload_id="input", + config_id="rules-covered-redact", + repetition=0, + case_id="input__rules-covered-redact__r000", + ) + + result = tool.export_rules_only_case_detection_artifacts( + config, + tool.AnonymizerInput(source=str(input_path), text_column="text"), + tmp_path / "raw" / "input__rules-covered-redact__r000.detection-artifacts.jsonl", + case=case, + ) + + assert result is None + + +def test_benchmark_does_not_export_rules_covered_artifacts_for_narrow_prose_rule_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_covered_prose_rule_artifacts", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Jordan worked at Acme Research Center and lived on Maple Street."]}).to_csv( + input_path, + index=False, + ) + config = tool.ConfigSpec( + id="rules-covered-redact", + replace="redact", + detect={"entity_labels": ["organization_name", "street_address"]}, + experimental_detection_strategy="rules_covered_or_default", + ) + case = tool.BenchmarkCase( + suite_id="rules-suite", + workload_id="input", + config_id="rules-covered-redact", + repetition=0, + case_id="input__rules-covered-redact__r000", + ) + + result = tool.export_rules_only_case_detection_artifacts( + config, + tool.AnonymizerInput(source=str(input_path), text_column="text"), + tmp_path / "raw" / "input__rules-covered-redact__r000.detection-artifacts.jsonl", + case=case, + ) + + assert 
result is None + + def test_benchmark_detection_artifact_analysis_ignores_stale_artifacts(tmp_path: Path) -> None: tool = load_tool("measurement_benchmark_tool_artifact_delta", REPO_ROOT / "tools/measurement/run_benchmarks.py") artifact_root = tmp_path / "artifacts" @@ -264,6 +410,89 @@ def test_benchmark_patches_detection_artifacts_from_final_trace_dataframe(tmp_pa assert "final_entity_signature_labels.stale" not in row +def test_rules_covered_or_default_detection_artifacts_use_final_trace_dataframe(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_covered_trace_artifacts", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + case = tool.BenchmarkCase( + suite_id="suite-a", + workload_id="input", + config_id="rules-covered", + repetition=0, + case_id="input__rules-covered__r000", + ) + config = tool.ConfigSpec( + id="rules-covered", + replace="redact", + detect={"entity_labels": ["api_key", "password"]}, + experimental_detection_strategy="rules_covered_or_default", + ) + trace_dataframe = pd.DataFrame( + { + COL_FINAL_ENTITIES: [ + { + "entities": [ + { + "value": "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA", + "label": "api_key", + "start_position": 6, + "end_position": 38, + "source": "rule", + } + ] + }, + { + "entities": [ + { + "value": "SecretNoRule123!", + "label": "password", + "start_position": 14, + "end_position": 30, + "source": "detector", + } + ] + }, + ] + } + ) + paths = tool._CaseRunPaths( + raw_path=tmp_path / "raw" / "case.jsonl", + artifact_output_path=tmp_path / "raw" / "case.detection-artifacts.jsonl", + trace_path=None, + artifact_snapshot={}, + ) + tool.write_detection_artifact_payloads([_stale_detection_artifact_payload()], paths.artifact_output_path) + contexts = {"artifact_path": tmp_path / "artifacts"} + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + execution = tool._CaseExecution( + 
input_data=tool.AnonymizerInput(source=str(input_path), text_column="text"), + trace_dataframe=trace_dataframe, + ) + + result = tool._case_detection_artifact_path( + contexts, + paths, + case=case, + config=config, + execution=execution, + ) + + assert result == paths.artifact_output_path + rows = [json.loads(line) for line in paths.artifact_output_path.read_text(encoding="utf-8").splitlines()] + assert len(rows) == 2 + assert [row["workflow_name"] for row in rows] == [ + "entity-detection-final-trace", + "entity-detection-final-trace", + ] + assert [row["row_index"] for row in rows] == [0, 1] + assert [row["final_source_counts.rule"] for row in rows] == [1.0, None] + assert [row["final_source_counts.detector"] for row in rows] == [None, 1.0] + assert "sk-test" not in paths.artifact_output_path.read_text(encoding="utf-8") + assert "SecretNoRule123!" not in paths.artifact_output_path.read_text(encoding="utf-8") + + def test_run_suite_records_detection_artifact_analysis_path( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -1150,6 +1379,16 @@ def test_benchmark_config_accepts_experimental_detection_strategy() -> None: "measurement_benchmark_tool_detection_strategy_config", REPO_ROOT / "tools/measurement/run_benchmarks.py" ) + config = tool.ConfigSpec( + id="rules-only", + replace="redact", + experimental_detection_strategy="rules_only", + ) + + assert config.experimental_detection_strategy == tool.ExperimentalDetectionStrategy.rules_only + anonymizer_config = tool.build_anonymizer_config(config) + assert not hasattr(anonymizer_config.detect, "experimental_detection_strategy") + detector_only = tool.ConfigSpec( id="detector-only", replace="redact", @@ -1157,8 +1396,22 @@ def test_benchmark_config_accepts_experimental_detection_strategy() -> None: ) assert detector_only.experimental_detection_strategy == tool.ExperimentalDetectionStrategy.detector_only - anonymizer_config = tool.build_anonymizer_config(detector_only) - assert not hasattr(anonymizer_config.detect, 
"experimental_detection_strategy") + + rules_covered = tool.ConfigSpec( + id="rules-covered", + replace="redact", + experimental_detection_strategy="rules_covered_or_default", + ) + + assert rules_covered.experimental_detection_strategy == tool.ExperimentalDetectionStrategy.rules_covered_or_default + + native_rules_router = tool.ConfigSpec( + id="native-rules-router", + replace="redact", + experimental_detection_strategy="native_rules_router", + ) + + assert native_rules_router.experimental_detection_strategy == tool.ExperimentalDetectionStrategy.native_rules_router native_candidate_validate = tool.ConfigSpec( id="native-candidate-validate", @@ -1257,6 +1510,30 @@ def test_benchmark_config_accepts_experimental_detection_strategy() -> None: ) +def test_benchmark_config_accepts_experimental_rule_labels() -> None: + tool = load_tool("measurement_benchmark_tool_rule_labels_config", REPO_ROOT / "tools/measurement/run_benchmarks.py") + + config = tool.ConfigSpec( + id="rules-guardrail", + replace="redact", + experimental_detection_strategy="rules_guardrail", + experimental_rule_labels=["street_address"], + ) + + assert config.experimental_rule_labels == ["street_address"] + anonymizer_config = tool.build_anonymizer_config(config) + assert not hasattr(anonymizer_config.detect, "experimental_rule_labels") + + detector_only = tool.ConfigSpec( + id="rules-guardrail-detector-only", + replace="redact", + experimental_detection_strategy="rules_guardrail_detector_only", + experimental_rule_labels=["api_key"], + ) + + assert detector_only.experimental_rule_labels == ["api_key"] + + def test_benchmark_spec_accepts_dd_parser_compat() -> None: tool = load_tool( "measurement_benchmark_tool_dd_parser_compat_config", REPO_ROOT / "tools/measurement/run_benchmarks.py" @@ -1272,6 +1549,200 @@ def test_benchmark_spec_accepts_dd_parser_compat() -> None: assert spec.dd_parser_compat == tool.DDParserCompatMode.raw_json +def 
test_benchmark_preflight_rejects_rules_only_without_explicit_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_only_without_labels", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rules-only-no-labels +workloads: + - id: input + source: input.csv +configs: + - id: rules-only-redact + experimental_detection_strategy: rules_only + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="requires explicit detect.entity_labels"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_rejects_rules_only_unsupported_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_only_unsupported_labels", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rules-only-unsupported-labels +workloads: + - id: input + source: input.csv +configs: + - id: rules-only-redact + experimental_detection_strategy: rules_only + detect: + entity_labels: [api_key, person] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="unsupported high-confidence rule labels.*person"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_accepts_rules_only_supported_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_only_supported_labels", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": 
["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rules-only-supported-labels +workloads: + - id: input + source: input.csv +configs: + - id: rules-only-redact + experimental_detection_strategy: rules_only + detect: + entity_labels: [api_key, email, http_cookie, password, pin, unique_id, url, user_name] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_accepts_rules_covered_or_default_contextual_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rules_covered_contextual_labels", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice has token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rules-covered-contextual-labels +workloads: + - id: input + source: input.csv +configs: + - id: rules-covered-redact + experimental_detection_strategy: rules_covered_or_default + detect: + entity_labels: [api_key, person] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_rejects_experimental_rule_labels_for_non_rule_strategy(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rule_labels_non_rule_strategy", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rule-labels-non-rule-strategy +workloads: + - id: input + source: input.csv +configs: + - id: redact + experimental_detection_strategy: prose_augment_focus + 
experimental_rule_labels: [street_address] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="experimental_rule_labels requires a rule-backed strategy"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_accepts_experimental_rule_labels_for_compact_rule_guardrail( + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_benchmark_tool_rule_labels_compact_rule_guardrail", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice lives on West Roberts Drive."]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rule-labels-compact-rule-guardrail +workloads: + - id: input + source: input.csv +configs: + - id: redact + experimental_detection_strategy: rules_guardrail_compact_validation + experimental_rule_labels: [street_address] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_rejects_unsupported_experimental_rule_labels(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_rule_labels_unsupported", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: rule-labels-unsupported +workloads: + - id: input + source: input.csv +configs: + - id: redact + experimental_detection_strategy: rules_guardrail + experimental_rule_labels: [person] + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="unsupported experimental_rule_labels.*person"): + tool.preflight_suite(spec, spec_path=spec_path) + + def 
test_benchmark_case_enters_experimental_detection_strategy_context( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -1282,7 +1753,7 @@ def test_benchmark_case_enters_experimental_detection_strategy_context( captured_measurements: list[Any] = [] captured_parser_compat: list[Any] = [] captured_strategies: list[Any] = [] - captured_context_kwargs: list[dict[str, Any]] = [] + captured_rule_labels: list[Any] = [] @contextmanager def fake_measurement_session(config: Any) -> Iterator[None]: @@ -1290,9 +1761,9 @@ def fake_measurement_session(config: Any) -> Iterator[None]: yield None @contextmanager - def fake_detection_strategy_context(strategy: Any, **kwargs: Any) -> Iterator[None]: + def fake_detection_strategy_context(strategy: Any, *, rule_labels: list[str] | None = None) -> Iterator[None]: captured_strategies.append(strategy) - captured_context_kwargs.append(kwargs) + captured_rule_labels.append(rule_labels) yield None @contextmanager @@ -1310,36 +1781,32 @@ def run(self, *, config: Any, data: Any) -> None: monkeypatch.setattr(tool, "experimental_detection_strategy_context", fake_detection_strategy_context) spec = tool.BenchmarkSpec( - suite_id="native-suite", + suite_id="rules-suite", dd_parser_compat="raw_json", - native_runtime=tool.NativeRuntimeSpec( - runtime_id="native-test", - endpoint="http://runtime.example/v1", - model="test-model", - ), workloads=[tool.WorkloadSpec(id="input", source="input.csv")], configs=[ tool.ConfigSpec( - id="native-single-pass-redact", + id="rules-only-redact", replace="redact", - experimental_detection_strategy="native_single_pass", + experimental_detection_strategy="rules_only", + experimental_rule_labels=["api_key"], ) ], ) pd.DataFrame({"text": ["token=sk-test-AAAAAAAAAAAAAAAAAAAAAAAA"]}).to_csv(tmp_path / "input.csv", index=False) case = tool.BenchmarkCase( - suite_id="native-suite", + suite_id="rules-suite", workload_id="input", - config_id="native-single-pass-redact", + config_id="rules-only-redact", repetition=0, - 
case_id="input__native-single-pass-redact__r000", + case_id="input__rules-only-redact__r000", ) tool._execute_case( FakeAnonymizer(), spec.workloads[0], spec.configs[0], - raw_path=tmp_path / "raw" / "input__native-single-pass-redact__r000.jsonl", + raw_path=tmp_path / "raw" / "input__rules-only-redact__r000.jsonl", trace_path=None, case=case, spec=spec, @@ -1349,9 +1816,8 @@ def run(self, *, config: Any, data: Any) -> None: ) assert captured_parser_compat == [tool.DDParserCompatMode.raw_json] - assert captured_strategies == [tool.ExperimentalDetectionStrategy.native_single_pass] - assert captured_context_kwargs[0]["native_runtime"].endpoint == "http://runtime.example/v1" - assert captured_context_kwargs[0]["native_runtime"].model == "test-model" - assert captured_measurements[0].run_tags["experimental_detection_strategy"] == "native_single_pass" - assert captured_measurements[0].run_tags["native_runtime_id"] == "native-test" + assert captured_strategies == [tool.ExperimentalDetectionStrategy.rules_only] + assert captured_rule_labels == [["api_key"]] + assert captured_measurements[0].run_tags["experimental_detection_strategy"] == "rules_only" + assert captured_measurements[0].run_tags["experimental_rule_labels"] == ["api_key"] assert captured_measurements[0].run_tags["dd_parser_compat"] == "raw_json" diff --git a/tests/tools/test_screen_strategy_comparisons.py b/tests/tools/test_screen_strategy_comparisons.py index 8b2fa237..95d1c3f2 100644 --- a/tests/tools/test_screen_strategy_comparisons.py +++ b/tests/tools/test_screen_strategy_comparisons.py @@ -35,9 +35,9 @@ def test_screen_strategy_comparisons_reads_comparison_csvs_only(tmp_path: Path) { "workload_id": "shell-3", "baseline_config_id": "default", - "candidate_config_id": "detector-only", + "candidate_config_id": "rules-only", "baseline_strategy": "default", - "candidate_strategy": "detector_only", + "candidate_strategy": "rules_only", "baseline_replacement_strategy": "default", 
"candidate_replacement_strategy": "local_structured_substitute", "baseline_case_count": 3, @@ -91,7 +91,7 @@ def test_screen_strategy_comparisons_reads_comparison_csvs_only(tmp_path: Path) "source_path": "analysis/default-vs-candidates.csv", "workload_id": "shell-3", "baseline_config_id": "default", - "candidate_config_id": "detector-only", + "candidate_config_id": "rules-only", "safety_verdict": "review", "performance_verdict": "improved", "candidate_verdict": "review", @@ -121,13 +121,13 @@ def test_screen_strategy_comparisons_reads_comparison_csvs_only(tmp_path: Path) assert shell.baseline_case_count == 3 assert shell.candidate_case_count == 3 assert shell.shared_stable_final_entity_signature_count == 12 - detector_local = next( + rules_local = next( group for group in result.groups - if group.group_key == "strategy:detector_only|replacement:local_structured_substitute" + if group.group_key == "strategy:rules_only|replacement:local_structured_substitute" ) - assert detector_local.candidate_replacement_strategy == "local_structured_substitute" - assert detector_local.row_count == 1 + assert rules_local.candidate_replacement_strategy == "local_structured_substitute" + assert rules_local.row_count == 1 no_augment = next(group for group in result.groups if group.group_key == "strategy:no_augment") assert no_augment.row_count == 1 assert no_augment.reject_count == 1 @@ -141,10 +141,10 @@ def test_screen_strategy_comparisons_writes_csv(tmp_path: Path) -> None: ) rows = [ tool.ScreenRow( - source_path="analysis/default-vs-detector-only.csv", + source_path="analysis/default-vs-rules.csv", workload_id="shell", baseline_config_id="default", - candidate_config_id="detector-only", + candidate_config_id="rules", baseline_replacement_strategy="default", candidate_replacement_strategy="local_structured_substitute", safety_verdict="review", @@ -248,8 +248,8 @@ def test_screen_strategy_comparisons_surfaces_candidate_original_value_leaks(tmp { "workload_id": "structured-secrets", 
"baseline_config_id": "default", - "candidate_config_id": "native-single-pass", - "candidate_strategy": "native_single_pass", + "candidate_config_id": "rules-covered", + "candidate_strategy": "rules_covered_or_default", "safety_verdict": "fail", "performance_verdict": "improved", "candidate_verdict": "reject", @@ -291,6 +291,78 @@ def test_screen_strategy_comparisons_surfaces_candidate_original_value_leaks(tmp assert "collision_labels=date:1" in rendered +def test_screen_strategy_comparisons_marks_rule_fast_lane_review_when_only_provenance_flags_remain( + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_screen_strategy_comparisons_fast_lane_review", + REPO_ROOT / "tools/measurement/screen_strategy_comparisons.py", + ) + pd.DataFrame( + [ + { + "workload_id": "structured-secrets", + "baseline_config_id": "default", + "candidate_config_id": "rules-covered", + "candidate_strategy": "rules_covered_or_default", + "safety_verdict": "review", + "performance_verdict": "improved", + "candidate_verdict": "review", + "candidate_original_value_leak_count": 0, + "candidate_original_value_leak_record_count": 0, + "flags": '["no_candidate_detector_entities", "candidate_uses_rule_entities"]', + } + ] + ).to_csv(tmp_path / "comparison.csv", index=False) + + result = tool.screen_comparison_paths([tmp_path]) + + assert result.groups[0].recommendation == "fast_lane_review" + + +def test_screen_strategy_comparisons_treats_covered_boundary_deltas_as_fast_lane_review( + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_screen_strategy_comparisons_fast_lane_boundary_review", + REPO_ROOT / "tools/measurement/screen_strategy_comparisons.py", + ) + pd.DataFrame( + [ + { + "workload_id": "structured-identifiers", + "baseline_config_id": "default", + "candidate_config_id": "rules-local", + "candidate_strategy": "rules_covered_or_default", + "candidate_replacement_strategy": "local_structured_substitute", + "safety_verdict": "review", + "performance_verdict": 
"improved", + "candidate_verdict": "review", + "candidate_original_value_leak_count": 0, + "candidate_original_value_leak_record_count": 0, + "flags": ( + '["entity_count_loss", "span_boundary_mismatch", ' + '"no_candidate_detector_entities", "candidate_uses_rule_entities"]' + ), + "baseline_only_final_entity_signature_label_counts.api_key": 2, + "baseline_only_final_entity_signature_label_counts.http_cookie": 3, + "baseline_only_candidate_covered_signature_label_counts.api_key": 2, + "baseline_only_candidate_covered_signature_label_counts.http_cookie": 3, + "baseline_only_candidate_overlapping_signature_label_counts.http_cookie": 1, + "baseline_only_candidate_uncovered_signature_count": 0, + "baseline_stable_candidate_uncovered_signature_count": 0, + } + ] + ).to_csv(tmp_path / "comparison.csv", index=False) + + result = tool.screen_comparison_paths([tmp_path]) + + row = result.rows[0] + assert row.baseline_only_label_counts == {} + assert result.groups[0].baseline_only_label_counts == {} + assert result.groups[0].recommendation == "fast_lane_review" + + def test_screen_strategy_comparisons_surfaces_label_policy_review(tmp_path: Path) -> None: tool = load_tool( "measurement_screen_strategy_comparisons_label_policy_review", @@ -496,18 +568,18 @@ def test_screen_strategy_comparisons_groups_default_detection_by_replacement_str assert tool.group_base_for_row(row, config_aliases={}) == "replacement:local_structured_substitute" -def test_screen_strategy_comparisons_keeps_generic_review_without_leak_metrics() -> None: +def test_screen_strategy_comparisons_keeps_stale_rule_review_generic_without_leak_metrics() -> None: tool = load_tool( - "measurement_screen_strategy_comparisons_generic_review", + "measurement_screen_strategy_comparisons_fast_lane_review_stale", REPO_ROOT / "tools/measurement/screen_strategy_comparisons.py", ) group = tool.ScreenGroup( - group_key="strategy:detector_only", - candidate_strategy="detector_only", + 
group_key="strategy:rules_covered_or_default", + candidate_strategy="rules_covered_or_default", row_count=1, review_count=1, performance_verdict_counts={"improved": 1}, - flag_counts={"candidate_skips_llm_validation": 1}, + flag_counts={"candidate_uses_rule_entities": 1, "no_candidate_detector_entities": 1}, ) assert tool.group_recommendation(group) == "review_only" @@ -608,8 +680,8 @@ def test_screen_strategy_comparisons_groups_candidate_strategy_conflicts(tmp_pat { "workload_id": "shell", "baseline_config_id": "default", - "candidate_config_id": "detector-only", - "candidate_strategy": "detector_only", + "candidate_config_id": "rules", + "candidate_strategy": "rules_only", "safety_verdict": "review", "performance_verdict": "improved", "candidate_verdict": "review", @@ -624,7 +696,7 @@ def test_screen_strategy_comparisons_groups_candidate_strategy_conflicts(tmp_pat result = tool.screen_comparison_paths([tmp_path]) groups = {group.group_key: group for group in result.groups} - assert list(groups) == ["strategy:detector_only", "strategy:no_augment"] + assert list(groups) == ["strategy:rules_only", "strategy:no_augment"] no_augment = groups["strategy:no_augment"] assert no_augment.row_count == 2 assert no_augment.viable_count == 1 @@ -764,20 +836,20 @@ def test_screen_strategy_comparisons_can_group_by_strategy_and_workload_family(t { "workload_id": "shell-secrets-3", "baseline_config_id": "default", - "candidate_config_id": "detector-only-shell", - "candidate_strategy": "detector_only", + "candidate_config_id": "rules-only-shell", + "candidate_strategy": "rules_only", "safety_verdict": "review", "performance_verdict": "improved", "candidate_verdict": "review", "pipeline_elapsed_sec_delta_pct": -99.9, "observed_total_tokens_delta": -11000, - "flags": '["candidate_skips_llm_validation"]', + "flags": '["candidate_uses_rule_entities"]', }, { "workload_id": "biographies-r5-offset5", "baseline_config_id": "default", - "candidate_config_id": "detector-only-bio", - 
"candidate_strategy": "detector_only", + "candidate_config_id": "rules-only-bio", + "candidate_strategy": "rules_only", "safety_verdict": "fail", "performance_verdict": "improved", "candidate_verdict": "reject", @@ -793,11 +865,11 @@ def test_screen_strategy_comparisons_can_group_by_strategy_and_workload_family(t result = tool.screen_comparison_paths([tmp_path], group_by=tool.GroupBy.strategy_workload_family) groups = {group.group_key: group for group in result.groups} - assert list(groups) == ["strategy:detector_only|family:shell-secrets", "strategy:detector_only|family:biographies"] - assert groups["strategy:detector_only|family:shell-secrets"].recommendation == "review_only" - assert groups["strategy:detector_only|family:shell-secrets"].workload_families == ["shell-secrets"] - assert groups["strategy:detector_only|family:biographies"].recommendation == "reject" - assert groups["strategy:detector_only|family:biographies"].baseline_only_label_counts == {"first_name": 4} + assert list(groups) == ["strategy:rules_only|family:shell-secrets", "strategy:rules_only|family:biographies"] + assert groups["strategy:rules_only|family:shell-secrets"].recommendation == "review_only" + assert groups["strategy:rules_only|family:shell-secrets"].workload_families == ["shell-secrets"] + assert groups["strategy:rules_only|family:biographies"].recommendation == "reject" + assert groups["strategy:rules_only|family:biographies"].baseline_only_label_counts == {"first_name": 4} def test_workload_family_normalizes_slice_and_offset_suffixes() -> None: diff --git a/tests/tools/test_staged_detection_output_analysis.py b/tests/tools/test_staged_detection_output_analysis.py index 4c791501..f2ccb7f9 100644 --- a/tests/tools/test_staged_detection_output_analysis.py +++ b/tests/tools/test_staged_detection_output_analysis.py @@ -44,12 +44,13 @@ def test_analyze_staged_detection_output_summarizes_native_detection_probe(tmp_p "record_type": "staged_detection_case", "case_id": "shell-row-0", 
"row_index": 0, - "seed_source": "gliner", + "seed_source": "rules_router", "status": "completed", "elapsed_sec": 0.002, "model_elapsed_sec": 0.0, "model_phase_count": 0, "model_request_count": 0, + "rule_covered_label_set": True, "final_entity_count": 5, "final_entity_signature_count": 5, "final_label_counts": {"api_key": 2, "email": 1, "password": 1, "url": 1}, @@ -67,12 +68,13 @@ def test_analyze_staged_detection_output_summarizes_native_detection_probe(tmp_p "record_type": "staged_detection_case", "case_id": "bio-row-0", "row_index": 0, - "seed_source": "direct_llm", + "seed_source": "rules_plus_direct_llm", "status": "completed", "elapsed_sec": 10.0, "model_elapsed_sec": 9.5, "model_phase_count": 3, "model_request_count": 3, + "rule_covered_label_set": False, "final_entity_count": 3, "final_entity_signature_count": 3, "final_label_counts": {"person": 2, "api_key": 1}, @@ -90,12 +92,13 @@ def test_analyze_staged_detection_output_summarizes_native_detection_probe(tmp_p "record_type": "staged_detection_case", "case_id": "bio-row-1", "row_index": 1, - "seed_source": "direct_llm", + "seed_source": "rules_plus_direct_llm", "status": "error", "elapsed_sec": 1.0, "model_elapsed_sec": 0.8, "model_phase_count": 1, "model_request_count": 1, + "rule_covered_label_set": False, "final_entity_count": 0, "final_entity_signature_count": 0, "total_usage": {"prompt_tokens": 10, "completion_tokens": 2, "total_tokens": 12}, @@ -110,32 +113,89 @@ def test_analyze_staged_detection_output_summarizes_native_detection_probe(tmp_p assert result.case_count == 3 assert result.group_count == 2 groups = {row.seed_source: row for row in result.groups} - assert groups["gliner"].case_count == 1 - assert groups["gliner"].completed_case_count == 1 - assert groups["gliner"].model_elapsed_sec_sum == 0.0 - assert groups["gliner"].model_request_count_sum == 0 - assert groups["gliner"].baseline_shared_signature_rate == 1.0 - assert groups["direct_llm"].case_count == 2 - assert 
groups["direct_llm"].completed_case_count == 1 - assert groups["direct_llm"].error_case_count == 1 - assert groups["direct_llm"].elapsed_sec_sum == pytest.approx(11.0) - assert groups["direct_llm"].model_elapsed_sec_sum == pytest.approx(10.3) - assert groups["direct_llm"].model_request_count_sum == 4 - assert groups["direct_llm"].total_tokens_sum == 132 - assert groups["direct_llm"].baseline_final_entity_signature_count_sum == 4 - assert groups["direct_llm"].shared_final_entity_signature_count_sum == 2 - assert groups["direct_llm"].baseline_only_final_entity_signature_count_sum == 2 - assert groups["direct_llm"].direct_only_final_entity_signature_count_sum == 1 - assert groups["direct_llm"].baseline_shared_signature_rate == pytest.approx(0.5) + assert groups["rules_router"].case_count == 1 + assert groups["rules_router"].completed_case_count == 1 + assert groups["rules_router"].model_elapsed_sec_sum == 0.0 + assert groups["rules_router"].model_request_count_sum == 0 + assert groups["rules_router"].rule_covered_case_count == 1 + assert groups["rules_router"].baseline_shared_signature_rate == 1.0 + assert groups["rules_router"].fast_lane_verdict == "review" + assert groups["rules_router"].flags == ["too_few_cases"] + assert groups["rules_plus_direct_llm"].case_count == 2 + assert groups["rules_plus_direct_llm"].completed_case_count == 1 + assert groups["rules_plus_direct_llm"].error_case_count == 1 + assert groups["rules_plus_direct_llm"].elapsed_sec_sum == pytest.approx(11.0) + assert groups["rules_plus_direct_llm"].model_elapsed_sec_sum == pytest.approx(10.3) + assert groups["rules_plus_direct_llm"].model_request_count_sum == 4 + assert groups["rules_plus_direct_llm"].total_tokens_sum == 132 + assert groups["rules_plus_direct_llm"].baseline_final_entity_signature_count_sum == 4 + assert groups["rules_plus_direct_llm"].shared_final_entity_signature_count_sum == 2 + assert groups["rules_plus_direct_llm"].baseline_only_final_entity_signature_count_sum == 2 + assert 
groups["rules_plus_direct_llm"].direct_only_final_entity_signature_count_sum == 1 + assert groups["rules_plus_direct_llm"].baseline_shared_signature_rate == pytest.approx(0.5) + assert groups["rules_plus_direct_llm"].fast_lane_verdict == "reject" + assert groups["rules_plus_direct_llm"].flags == [ + "too_few_cases", + "case_errors", + "baseline_signature_loss", + "uses_model", + "not_fully_rule_covered", + ] label_deltas = {(row.seed_source, row.delta_type, row.label): row.count for row in result.label_deltas} assert label_deltas == { - ("direct_llm", "baseline_only", "city"): 1, - ("direct_llm", "baseline_only", "person"): 1, - ("direct_llm", "direct_only", "api_key"): 1, + ("rules_plus_direct_llm", "baseline_only", "city"): 1, + ("rules_plus_direct_llm", "baseline_only", "person"): 1, + ("rules_plus_direct_llm", "direct_only", "api_key"): 1, } +def test_staged_detection_output_analysis_requires_repeated_cases_for_fast_lane(tmp_path: Path) -> None: + tool = load_tool( + "measurement_staged_detection_output_analysis_repeated_gate", + REPO_ROOT / "tools/measurement/analyze_staged_detection_output.py", + ) + output_dir = tmp_path / "staged" + output_dir.mkdir() + _write_jsonl( + output_dir / "staged-detection-cases.jsonl", + [ + { + "record_type": "staged_detection_case", + "case_id": f"shell-row-{index}", + "row_index": index, + "seed_source": "rules_router", + "status": "completed", + "elapsed_sec": 0.002, + "model_elapsed_sec": 0.0, + "model_phase_count": 0, + "model_request_count": 0, + "rule_covered_label_set": True, + "final_entity_count": 5, + "final_entity_signature_count": 5, + "total_usage": {}, + "comparison": { + "baseline_final_entity_signature_count": 5, + "shared_final_entity_signature_count": 5, + "baseline_only_final_entity_signature_count": 0, + "direct_only_final_entity_signature_count": 0, + "baseline_only_label_counts": {}, + "direct_only_label_counts": {}, + }, + } + for index in range(3) + ], + ) + + result = 
tool.analyze_staged_detection_output(output_dir) + + group = result.groups[0] + assert group.seed_source == "rules_router" + assert group.case_count == 3 + assert group.fast_lane_verdict == "fast_lane_candidate" + assert group.flags == [] + + def test_staged_detection_output_analysis_writes_csv_tables(tmp_path: Path) -> None: tool = load_tool( "measurement_staged_detection_output_analysis_export", @@ -148,7 +208,7 @@ def test_staged_detection_output_analysis_writes_csv_tables(tmp_path: Path) -> N { "case_id": "case-0", "row_index": 0, - "seed_source": "gliner", + "seed_source": "rules_router", "status": "completed", "elapsed_sec": 0.01, "model_elapsed_sec": 0.0, diff --git a/tests/tools/test_staged_detection_probe.py b/tests/tools/test_staged_detection_probe.py index 29987f90..c380cba2 100644 --- a/tests/tools/test_staged_detection_probe.py +++ b/tests/tools/test_staged_detection_probe.py @@ -507,6 +507,86 @@ def test_staged_detection_augmentation_prompt_discourages_grouped_person_and_sur assert "also return the surname substring as last_name" in prompt +def test_staged_detection_can_seed_from_rules_without_llm_seed_prompt() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_seed", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient( + tool, + [ + '{"decisions": [{"id": "email_6_23", "decision": "keep", "reason": "email address"}]}', + '{"entities": [{"value": "NVIDIA", "label": "organization_name", "reason": "employer"}]}', + ], + ) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text="Email alice@example.com at NVIDIA.", + labels=["email", "organization_name"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules, + ) + + assert result.status == tool.CaseStatus.completed + assert result.seed_source == tool.SeedSource.rules + assert result.phase_usage.seed == {} + assert result.phase_model_work == 
tool.PhaseModelWork(seed=False, validation=True, augmentation=True) + assert result.phase_skip_reasons.seed == "deterministic_rules" + assert result.phase_skip_reasons.validation is None + assert result.model_phase_count == 2 + assert result.phase_model_requests == tool.PhaseModelRequests(seed=0, validation=1, augmentation=1) + assert result.model_request_count == 2 + assert result.seed_suggestion_count == 1 + assert result.seed_entity_count == 1 + assert result.final_label_counts == {"email": 1, "organization_name": 1} + assert result.artifact.final_source_counts == {"augmenter": 1, "rule": 1} + assert result.total_usage == {"prompt_tokens": 20, "completion_tokens": 10, "total_tokens": 30} + assert len(llm_client.prompts) == 2 + + +def test_staged_detection_can_add_rules_to_direct_llm_seed_without_validating_rules() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_plus_direct_seed", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient( + tool, + [ + '{"entities": [{"value": "NVIDIA", "label": "organization_name", "reason": "employer"}]}', + '{"decisions": [{"id": "organization_name_27_33", "decision": "keep", "reason": "employer"}]}', + '{"entities": []}', + ], + ) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text="Email alice@example.com at NVIDIA.", + labels=["email", "organization_name"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules_plus_direct_llm, + ) + + assert result.status == tool.CaseStatus.completed + assert result.seed_source == tool.SeedSource.rules_plus_direct_llm + assert result.seed_suggestion_count == 2 + assert result.seed_entity_count == 2 + assert result.validation_candidate_count == 1 + assert result.validation_decision_count == 1 + assert result.final_label_counts == {"email": 1, "organization_name": 1} + assert result.artifact.final_source_counts == {"direct_seed": 1, "rule": 1} + assert 
result.phase_model_requests == tool.PhaseModelRequests(seed=1, validation=1, augmentation=1) + assert result.model_request_count == 3 + assert '"label":"email"' not in llm_client.prompts[1] + assert '"label":"organization_name"' in llm_client.prompts[1] + + def test_staged_detection_baseline_comparison_skips_rows_without_signature_hashes() -> None: tool = load_tool( "measurement_staged_detection_probe_missing_baseline_signatures", @@ -535,6 +615,159 @@ def test_staged_detection_baseline_comparison_skips_rows_without_signature_hashe assert compared.comparison is None +def test_staged_detection_can_trust_rules_without_validation_prompt() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_trusted_seed", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient(tool, ['{"entities": []}']) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text=( + "$ docker run -e DATABASE_URL='postgres://app_user:fakeDbPass123!@db.example.test:5432/app' " + "-e API_KEY=ghp_FAKEtoken1234567890abcdef myapp:latest\nPassword: fakeLoginPass!" 
+ ), + labels=["api_key", "password", "email", "url"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules_trusted, + ) + + assert result.status == tool.CaseStatus.completed + assert result.seed_source == tool.SeedSource.rules_trusted + assert result.phase_usage.seed == {} + assert result.phase_usage.validation == {} + assert result.phase_model_work == tool.PhaseModelWork(seed=False, validation=False, augmentation=True) + assert result.phase_skip_reasons.seed == "deterministic_rules" + assert result.phase_skip_reasons.validation == "trusted_rules" + assert result.phase_skip_reasons.augmentation is None + assert result.model_phase_count == 1 + assert result.phase_model_requests == tool.PhaseModelRequests(seed=0, validation=0, augmentation=1) + assert result.model_request_count == 1 + assert result.rule_covered_label_set is True + assert result.validation_decision_count == 3 + assert result.final_label_counts == {"api_key": 1, "password": 1, "url": 1} + assert result.artifact.final_source_counts == {"rule": 3} + assert result.total_usage == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + assert len(llm_client.prompts) == 1 + + +def test_staged_detection_can_skip_augmentation_when_all_labels_are_rule_covered() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_trusted_no_augment", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient(tool, []) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text="Email alice@example.com", + labels=["email"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules_trusted, + skip_augmentation_when_rule_covered=True, + ) + + assert result.status == tool.CaseStatus.completed + assert result.phase_usage.augmentation == {} + assert result.phase_model_work == tool.PhaseModelWork(seed=False, validation=False, augmentation=False) + assert result.phase_skip_reasons 
== tool.PhaseSkipReasons( + seed="deterministic_rules", + validation="trusted_rules", + augmentation="rule_covered_labels", + ) + assert result.model_phase_count == 0 + assert result.phase_model_requests == tool.PhaseModelRequests(seed=0, validation=0, augmentation=0) + assert result.model_request_count == 0 + assert result.rule_covered_label_set is True + assert result.augmented_suggestion_count == 0 + assert result.final_label_counts == {"email": 1} + assert result.total_usage == {} + assert len(llm_client.prompts) == 0 + + +def test_staged_detection_rules_router_short_circuits_rule_covered_labels() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_router_short_circuit", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient(tool, []) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text="Email alice@example.com and token ghp_FAKEtoken1234567890abcdef", + labels=["email", "api_key"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules_router, + ) + + assert result.status == tool.CaseStatus.completed + assert result.seed_source == tool.SeedSource.rules_router + assert result.phase_model_work == tool.PhaseModelWork(seed=False, validation=False, augmentation=False) + assert result.phase_skip_reasons == tool.PhaseSkipReasons( + seed="deterministic_rules", + validation="trusted_rules", + augmentation="rule_covered_labels", + ) + assert result.model_phase_count == 0 + assert result.phase_model_requests == tool.PhaseModelRequests(seed=0, validation=0, augmentation=0) + assert result.model_request_count == 0 + assert result.elapsed_sec is not None and result.elapsed_sec > 0.0 + assert result.model_elapsed_sec == 0.0 + assert result.rule_covered_label_set is True + assert result.final_label_counts == {"api_key": 1, "email": 1} + assert result.artifact.final_source_counts == {"rule": 2} + assert result.total_usage == {} + assert 
len(llm_client.prompts) == 0 + + +def test_staged_detection_rules_router_uses_direct_seed_for_contextual_labels() -> None: + tool = load_tool( + "measurement_staged_detection_probe_rules_router_mixed_labels", + REPO_ROOT / "tools/measurement/staged_detection_probe.py", + ) + llm_client = SequencedClient( + tool, + [ + '{"entities": [{"value": "Alice", "label": "first_name", "reason": "person name"}]}', + '{"decisions": [{"id": "first_name_0_5", "decision": "keep", "reason": "person name"}]}', + '{"entities": []}', + ], + ) + + result = tool.run_staged_detection_case( + tool.StagedDetectionRequest( + case_id="case-1", + text="Alice emails alice@example.com.", + labels=["email", "first_name"], + row_index=0, + ), + client=llm_client, + seed_source=tool.SeedSource.rules_router, + ) + + assert result.status == tool.CaseStatus.completed + assert result.seed_source == tool.SeedSource.rules_router + assert result.rule_covered_label_set is False + assert result.phase_model_work == tool.PhaseModelWork(seed=True, validation=True, augmentation=True) + assert result.phase_skip_reasons == tool.PhaseSkipReasons() + assert result.phase_model_requests == tool.PhaseModelRequests(seed=1, validation=1, augmentation=1) + assert result.model_request_count == 3 + assert result.final_label_counts == {"email": 1, "first_name": 1} + assert result.artifact.final_source_counts == {"direct_seed": 1, "rule": 1} + assert '"label":"email"' not in llm_client.prompts[1] + assert '"label":"first_name"' in llm_client.prompts[1] + + def test_staged_detection_can_chunk_validation_into_local_excerpts() -> None: tool = load_tool( "measurement_staged_detection_probe_chunked_validation", diff --git a/tools/measurement/README.md b/tools/measurement/README.md index 69a017af..f0cc8ffe 100644 --- a/tools/measurement/README.md +++ b/tools/measurement/README.md @@ -1,31 +1,30 @@ -# Measurement tools +# Measurement Tools -This directory contains developer tools for measuring Anonymizer runs, exporting 
-measurement JSONL to tables, and comparing benchmark strategies. Run the tools -inside the project environment, either with an activated venv or through -`uv run`. +`export_measurements.py` converts Anonymizer measurement JSONL into one table +per `record_type`. + +Run these tools inside the project environment, either with an activated venv +or through `uv run`. ```bash uv run python tools/measurement/export_measurements.py measurements.jsonl --output tables ``` -By default, `export_measurements.py` writes Parquet files plus -`manifest.json`: +By default it writes Parquet files plus `manifest.json`: - `run.parquet` - `stage.parquet` - `record.parquet` -- `ndd_workflow.parquet` when DataDesigner adapter records are present -- `model_workflow.parquet` when direct model workflow records are present +- `ndd_workflow.parquet` when adapter records are present +- `model_workflow.parquet` when non-DataDesigner model workflow records are + present Use `--format csv` or `--format jsonl` for non-Parquet output, and `--overwrite` to replace existing output files. -## Benchmark runner - `run_benchmarks.py` runs repeatable Anonymizer workloads and writes the same measurement JSONL format, one raw file per benchmark case plus a combined `measurements.jsonl`. @@ -38,7 +37,7 @@ uv run python tools/measurement/run_benchmarks.py suite.yaml \ --dd-trace last-message ``` -The repo-data smoke suite can be run with DataDesigner traces enabled: +To rerun the repo-data smoke suite with DataDesigner traces enabled: ```bash bash tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh @@ -46,7 +45,7 @@ bash tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh The script writes to `/tmp/anonymizer-repo-data-smoke-dd-traces` by default. 
Pass a different output directory as the first argument, or set -`DD_TRACE_MODE=all-messages` when full chat history is needed: +`DD_TRACE_MODE=all-messages` when you need full chat history: ```bash DD_TRACE_MODE=all-messages \ @@ -64,7 +63,7 @@ Benchmark suites are YAML files with three parts: Example: ```yaml -suite_id: biography-smoke +suite_id: shell-and-biography-smoke model_configs: ./model-configs.yaml model_providers: ./providers.yaml dd_parser_compat: none @@ -110,29 +109,47 @@ for URL-like sources because the runner cannot safely materialize a local subset without downloading the whole dataset first. Relative paths in suite files are resolved from the suite file's directory. -The runner refuses to write into a non-empty output directory unless -`--overwrite` is set. By default it also exports Parquet tables into `tables/`; -pass `--no-export` when only raw measurement JSONL is needed. - -Before starting a real run, the benchmark runner performs cheap preflight -checks: suite/config parsing, local dataset existence, CSV/Parquet text-column -metadata, provider YAML shape, native runtime requirements, and active -model-alias references. `--dry-run` runs those same checks, expands the planned -matrix, and skips output-dir writes and model work. Use `case_retries` and `case_retry_backoff_sec` for long-running suites on shared model endpoints. Retries are disabled by default. When enabled, a failed case is retried with the same `case_id` and output paths; the final case still records `attempt_count` and `attempt_errors` in `summary.json`. `--fail-fast` -remains fail-fast and bypasses retries. +remains fail-fast and bypasses retries. Treat retried cases as reliability +signals during analysis, especially when failures come from provider health +checks or rate limits. 
-## Benchmark-only detection strategies +Configs may also set `experimental_detection_strategy` for benchmark-only +pipeline probes: -Configs may set `experimental_detection_strategy` for benchmark-only pipeline -probes. These values are not public `Detect` config fields, and they should not -be treated as safe defaults across arbitrary data. +```yaml +configs: + - id: shell-rules-only + experimental_detection_strategy: rules_only + detect: + entity_labels: [api_key, email, http_cookie, password, pin, unique_id, url, user_name] + replace: + strategy: hash + digest_length: 12 +``` + +Native benchmark strategies require an explicit runtime. Set top-level +`native_runtime.endpoint` and `native_runtime.model`, set the standard +`ANONYMIZER_BENCH_NATIVE_ENDPOINT` and `ANONYMIZER_BENCH_NATIVE_MODEL` +environment variables, or override runtime fields per config with +`configs[].native_runtime`. GLiNER-seeded native strategies also require +`native_runtime.gliner_endpoint` and `native_runtime.gliner_model`, or the +standard `ANONYMIZER_BENCH_GLINER_ENDPOINT` and +`ANONYMIZER_BENCH_GLINER_MODEL` environment variables. The runner records +runtime id, alias, provider, model, and env-variable names as run tags; raw +endpoint URLs are not emitted into measurement tables. ```yaml +native_runtime: + runtime_id: local-vllm-json + endpoint_env: ANONYMIZER_BENCH_NATIVE_ENDPOINT + model_env: ANONYMIZER_BENCH_NATIVE_MODEL + provider: local-vllm + alias: native-direct configs: - id: native-single-pass experimental_detection_strategy: native_single_pass @@ -142,29 +159,71 @@ configs: Supported values: - `default`: run the normal Anonymizer detection pipeline. +- `rules_guardrail`: run the normal Anonymizer detection pipeline, then union + deterministic high-confidence rule spans into the final entity set. 
+- `rules_filter_guardrail`: remove GLiNER candidates that are fully covered by + same-label deterministic high-confidence rule spans before validation, add + non-overlapping rule spans back before augmentation so the augmenter sees them + as already tagged, then add non-overlapping rule spans into the final entity + set. Different-label overlaps and longer detector spans remain validation + candidates so contextual spans such as a multi-token political view, + university, or organization name are not shadowed by a shorter or differently + labeled rule span. - `no_augment`: run GLiNER detection and validation, but skip LLM augmentation. +- `rules_seed_no_augment`: add deterministic high-confidence secret spans to + the GLiNER seed set, validate those seeds, and skip LLM augmentation. +- `rules_guardrail_no_augment`: run GLiNER detection and validation, skip LLM + augmentation, then union deterministic high-confidence rule spans into the + final entity set. +- `rules_filter_guardrail_no_augment`: remove GLiNER candidates that are fully + covered by same-label deterministic high-confidence rule spans before + validation, skip LLM augmentation, then add non-overlapping rule spans into + the final entity set. +- `rules_guardrail_detector_only`: run only GLiNER detection and local + finalization, then union deterministic high-confidence rule spans into the + final entity set. - `detector_only`: run only GLiNER detection and local finalization. This skips LLM validation and LLM augmentation. +- `rules_only`: use only deterministic high-confidence rules for the detection + stage. +- `rules_covered_or_default`: if explicit `detect.entity_labels` are entirely + inside the structured-secret fast lane (`api_key`, `email`, `http_cookie`, + `password`, `pin`, `unique_id`, `url`, `user_name`), use deterministic rules + for rows whose structured assignments are covered and route suspicious + uncovered rows through the normal Anonymizer detection pipeline. 
Label sets + outside the fast lane always use normal detection. +- `native_rules_router`: run a benchmark-only native staged detector without + DataDesigner. Rule-covered label sets short-circuit through deterministic + rules with no model calls; other label sets use direct OpenAI-compatible + provider calls for seed extraction, validation, and augmentation. - `native_candidate_validate_no_augment`: run a benchmark-only native staged detector without DataDesigner using direct OpenAI-compatible calls for seed - extraction and validation, then skip augmentation. + extraction and validation, then skip augmentation. This isolates the cost and + recall impact of removing the augmentation phase from the native executor. - `detector_native_validate_no_augment`: run the normal GLiNER detector seed through Anonymizer/DataDesigner, then bypass DataDesigner validation and - augmentation with direct OpenAI-compatible validation calls. -- `detector_native_validate_native_augment`: run the normal GLiNER detector seed - through Anonymizer/DataDesigner, then bypass DataDesigner validation and + augmentation with direct OpenAI-compatible validation calls. This isolates + whether native validation can replace DataDesigner validation when candidate + quality is held closer to the default detector path. +- `detector_native_validate_native_augment`: run the normal GLiNER detector + seed through Anonymizer/DataDesigner, then bypass DataDesigner validation and augmentation with direct OpenAI-compatible validation and augmentation calls. + This keeps the default detector candidate source while testing whether direct + provider calls can replace the two downstream DataDesigner LLM phases. - `gliner_native_validate_no_augment`: run a direct hosted-GLiNER seed without DataDesigner, validate those detector candidates with direct - OpenAI-compatible calls, and skip augmentation. + OpenAI-compatible calls, and skip augmentation. 
This isolates DataDesigner + detector orchestration overhead while keeping a GLiNER-style candidate source. - `gliner_native_validate_native_augment`: run a direct hosted-GLiNER seed without DataDesigner, validate those detector candidates with direct - OpenAI-compatible calls, then run direct native augmentation. + OpenAI-compatible calls, then run direct native augmentation. This is the + fully staged no-DataDesigner detector/validator/augmenter lane for contextual + recall experiments. - `native_single_pass`: run a benchmark-only native detector without DataDesigner using one direct OpenAI-compatible provider call per row. The - model must return exact values plus `start`/`end` offsets; local code - validates offsets, resolves overlaps, and records parser/runtime failures as - `model_workflow` errors. + model must return exact values plus `start`/`end` offsets; local code validates + offsets, unions non-overlapping deterministic rule spans, resolves overlaps, + and records parser/runtime failures as `model_workflow` errors. - `native_single_pass_recall`: the same one-call native detector with a recall-oriented prompt that includes Anonymizer's label examples and stronger high-recall guidance. @@ -175,52 +234,88 @@ Supported values: - `native_single_pass_values_recall`: the value-only one-call detector with the recall-oriented prompt from `direct_detection_probe.py`. -Native benchmark strategies require an explicit runtime. Set top-level -`native_runtime.endpoint` and `native_runtime.model`, set the standard -`ANONYMIZER_BENCH_NATIVE_ENDPOINT` and `ANONYMIZER_BENCH_NATIVE_MODEL` -environment variables, or override runtime fields per config with -`configs[].native_runtime`. GLiNER-seeded native strategies also require -`native_runtime.gliner_endpoint` and `native_runtime.gliner_model`, or the -standard `ANONYMIZER_BENCH_GLINER_ENDPOINT` and -`ANONYMIZER_BENCH_GLINER_MODEL` environment variables. 
The runner records -runtime id, alias, provider, model, and env-variable names as run tags; raw -endpoint URLs are not emitted into measurement tables. - -```yaml -native_runtime: - runtime_id: local-vllm-json - endpoint_env: ANONYMIZER_BENCH_NATIVE_ENDPOINT - model_env: ANONYMIZER_BENCH_NATIVE_MODEL - provider: local-vllm - alias: native-direct -configs: - - id: native-single-pass - experimental_detection_strategy: native_single_pass - replace: redact -``` - -Use `detector_only` only as a lower-bound ablation. It skips the LLM validation -pass that drops false positives and reclassifies ambiguous spans. A faster run -that loses baseline signatures is a rejection. - -Use staged native strategies when the question is "can direct provider calls -replace part of DataDesigner orchestration?" They still need repeated signature, -leak, label-mismatch, parser, and reliability gates before any workload-specific -promotion. - -Use one-call native strategies for the more aggressive "collapse detection to -one call" experiment. They are often faster when the prompt works, but they are -more parser- and recall-sensitive. Any malformed JSON response becomes a failed -case in analysis, and any missed baseline signature should be treated as a -rejection rather than a latency win. - -## Replacement strategy probes +These strategies exist to compare performance options. They are not public +`Detect` config fields, and they should not be treated as safe defaults across +arbitrary data. The rule-backed strategies only cover deterministic +high-confidence spans for `api_key`, `date_of_birth`, `email`, +`http_cookie`, `organization_name`, `password`, `pin`, `religious_belief`, +`street_address`, `unique_id`, `url`, and `user_name`; they will not replace +contextual detection for prose identifiers such as names in biographies or +legal documents. 
The prose rules (`date_of_birth`, `organization_name`, +`religious_belief`, and `street_address`) are narrow contextual patterns and +are not enough to opt into `rules_covered_or_default`; those labels fall back +to default detection unless `rules_only` is explicitly selected. The structured +identifier rules require keyed or command-style syntax such as +`Cookie:`, `pin=`, `trace-id:`, `user_name=`, or service-principal flags. They +are not general entity recognizers. `detector_only` is also unsafe as a default +because it skips the LLM validation pass that drops false positives and +reclassifies ambiguous spans. `rules_only` requires explicit `entity_labels`, +and every label must be covered by those deterministic rules. Use +`rules_covered_or_default` when a benchmark suite may include both fully +structured-secret scans and contextual workloads; it keeps the no-DataDesigner +short-circuit for the former and falls back to the default pipeline for prose +or legal labels. + +Use `native_rules_router` when you want the same routing shape without +DataDesigner orchestration. It uses the resolved native runtime endpoint/model +from `native_runtime` or the standard benchmark runtime environment variables. +Treat it as a native-executor prototype: it can prove that DataDesigner overhead +is avoidable, but it must be compared against baseline signatures and +original-value leak metrics before any workload-specific promotion decision. + +Use `native_candidate_validate_no_augment` when you want a narrower native +executor diagnostic: direct seed candidates plus direct validation, with no +augmentation. It is useful for proving how much speed comes from removing a +phase, but a faster run that loses baseline signatures is still a rejection. + +Use `detector_native_validate_no_augment` when you want to keep the production +detector seed while testing a direct-provider validation path. 
It is not a +no-DataDesigner strategy because the detector seed still runs through the +adapter, but it tells you whether DataDesigner validation/augmentation is the +load-bearing part of a workload. The native validation shim preserves +`date_of_birth` over broader `date` reclassifications only when the local +candidate context contains birth/DOB language; generic filing or event dates can +still be reclassified to `date`. + +Use `detector_native_validate_native_augment` for the same detector-seed +question when augmentation recall is expected to be load-bearing. This arm still +uses DataDesigner for the detector seed, but direct provider calls own both +validation and augmentation. + +Use `gliner_native_validate_no_augment` or +`gliner_native_validate_native_augment` when the question is specifically +"what if GLiNER did not run through DataDesigner?" These strategies use the +staged direct executor's GLiNER seed client using +`native_runtime.gliner_endpoint`, `native_runtime.gliner_model`, or the standard +GLiNER runtime environment variables; the API key env var defaults to +`NVIDIA_API_KEY`. The no-augmentation arm is a lower-cost boundary; the +native-augmentation arm is the quality-oriented no-DataDesigner candidate. The +integrated benchmark strategies execute staged direct rows with bounded +parallelism so GLiNER and native validation/augmentation latency is not +serialized across records. These arms also normalize direct GLiNER `date` seeds +to `date_of_birth` only when the local seed context contains birth/DOB language. +Generic filing or event dates remain `date`. Both arms still need repeated +signature, leak, label-mismatch, and reliability gates before any +workload-specific promotion. + +Use `native_single_pass`, `native_single_pass_recall`, +`native_single_pass_values`, or `native_single_pass_values_recall` for the more +aggressive "collapse detection to one call" experiment. 
The first pair asks the +model for `start`/`end` offsets and validates them before falling back to exact +value matching. The value-only pair uses the standalone direct-probe prompt and +lets local code recover spans from exact returned values. Recall variants spend +more prompt tokens on label examples and high-recall guidance. All one-call +variants are expected to be faster than staged native detection when the prompt +works, but they are also more parser- and recall-sensitive. Any malformed JSON +response becomes a failed case in analysis, and any missed baseline signature +should be treated as a rejection rather than a latency win. Replacement-map generation has a separate benchmark-only knob: ```yaml configs: - id: structured-local-substitute + experimental_detection_strategy: rules_covered_or_default experimental_replacement_strategy: local_structured_substitute detect: entity_labels: [api_key, email, password, url] @@ -245,7 +340,14 @@ is deliberate. The local substitute map generator does not understand names, social relations, cultural consistency, or prose semantics; use the default DataDesigner-backed `Substitute` path for those workloads. -## DataDesigner traces +The runner refuses to write into a non-empty output directory unless +`--overwrite` is set. By default it also exports Parquet tables into +`tables/`; pass `--no-export` when you only want the raw measurement JSONL. +Before starting a real run, the benchmark runner performs cheap preflight +checks: suite/config parsing, local dataset existence, CSV/Parquet text-column +metadata, provider YAML shape, native runtime requirements, and active +model-alias references. `--dry-run` runs those same checks, expands the planned +matrix, and skips output-dir writes and model work. For debugging DataDesigner calls, pass `--dd-trace last-message` or `--dd-trace all-messages`. Trace records are written separately from sanitized @@ -258,8 +360,8 @@ values, replacement values, secrets, and PII. 
Treat them as debug artifacts: keep them out of shared benchmark bundles unless they have been reviewed or redacted. -Summarize traced calls without copying raw prompts or responses into analysis -output: +To summarize traced calls without copying raw prompts or responses into the +analysis output, run: ```bash uv run python tools/measurement/analyze_dd_traces.py \ @@ -271,65 +373,205 @@ uv run python tools/measurement/analyze_dd_traces.py \ This writes `trace_analysis.*` and `trace_group_analysis.*`. The row table captures run tags, workflow/model metadata, status, elapsed time, prompt and response lengths, token counts, and response-shape flags such as `raw_json`, -`fenced_json`, `embedded_json`, `text`, and `none`. +`fenced_json`, `embedded_json`, `text`, and `none`. The grouped table rolls those +fields up by workload, config, workflow, model, provider, status, error type, +and response shape. Use this when diagnosing local provider behavior, parser +compatibility, unexpected thinking text, or retry-heavy workflows. -## Direct probes +Some OpenAI-compatible local endpoints return raw JSON when their model config +uses `response_format: {type: json_object}`. DataDesigner structured recipes +currently prompt for markdown-fenced JSON, so those raw JSON responses can be +valid but still fail parsing. Set top-level `dd_parser_compat: raw_json` when a +benchmark suite needs this provider compatibility mode: -`direct_detection_probe.py` calls a local OpenAI-compatible endpoint directly -for a small slice of records. It is useful for prompt experiments before adding -a benchmark strategy. +```yaml +dd_parser_compat: raw_json +``` + +This is benchmark-only behavior. The runner patches DataDesigner structured +parser builders for the duration of a case, restores them afterward, and records +the mode in `run_tags.dd_parser_compat`. 
The fallback accepts either pure raw +JSON or a JSON object/array embedded after model reasoning text, then still +validates the extracted object against the requested schema. Keep the default +`none` unless a local provider or vLLM endpoint needs raw-JSON structured-output +compatibility. + +## DD-Free Direct Detection Probe + +Use `direct_detection_probe.py` to test a deliberately DD-free extraction path +against an OpenAI-compatible endpoint. This is a benchmark-only diagnostic: it +does not call DataDesigner, does not run GLiNER, and does not execute the +production detection graph. It sends one direct chat-completions request per +input row, then reuses Anonymizer's existing span postprocessing, occurrence +expansion, overlap resolution, and entity signature logic so results can be +compared against normal detection artifacts. + +Pass `--endpoint` and `--model`, or set `ANONYMIZER_BENCH_NATIVE_ENDPOINT` and +`ANONYMIZER_BENCH_NATIVE_MODEL`. + +Example biography probe: ```bash uv run python tools/measurement/direct_detection_probe.py \ docs/data/NVIDIA_synthetic_biographies.csv \ + --text-column biography \ + --labels age,city,company_name,degree,education_level,field_of_study,first_name,language,last_name,occupation,organization_name,place_name,political_view,race_ethnicity,religious_belief,state,university \ + --endpoint http://your-openai-compatible-endpoint/v1 \ + --model your-model-id \ + --baseline-artifacts "$BASELINE_ARTIFACTS" \ + --output /tmp/direct-detection-probe-biography \ + --overwrite \ + --json +``` + +Example legal probe: + +```bash +uv run python tools/measurement/direct_detection_probe.py \ + docs/data/TAB_legal_sample25.csv \ --text-column text \ - --endpoint http://gpu-dev-pod-serve-svc:8000/v1 \ - --model nvidia/nemotron-3-super \ - --labels person,email,api_key,password \ - --row-limit 5 \ - --output /tmp/direct-detection-probe + --labels application_number,city,country,date,date_of_birth,nationality,person \ + --endpoint 
http://your-openai-compatible-endpoint/v1 \ + --model your-model-id \ + --baseline-artifacts "$BASELINE_ARTIFACTS" \ + --output /tmp/direct-detection-probe-legal \ + --overwrite \ + --json ``` -`staged_detection_probe.py` runs a no-DataDesigner staged detector outside the -main benchmark harness. It can compare seed extraction, validation, and -augmentation boundaries before integrating a strategy into `run_benchmarks.py`. +The tool writes `direct-detection-cases.jsonl`, +`direct-detection-artifacts.jsonl`, and `summary.json`. Case rows include model +usage, elapsed time, raw/allowed suggestion counts, final label counts, final +signature hashes, and optional baseline comparison counts. Artifact rows use +the same opaque signature fields as `analyze_detection_artifacts.py` and omit +raw entity values. For baseline comparison, pass a per-case sidecar or another +artifact file with one row per `row_index`; duplicate row indexes are rejected +to avoid ambiguous comparisons, so a combined multi-case artifact cannot +silently select the wrong baseline. Treat the probe `summary.json` as a +sensitive debug artifact because it records the resolved endpoint/model runtime +used for the probe. + +When this probe shape is promising, move it into a normal benchmark suite with +`experimental_detection_strategy: native_single_pass_values` or +`native_single_pass_values_recall`. Those strategies use the same value-only +prompt family but run through `run_benchmarks.py`, measurement collection, case +retries, artifact capture, and pairwise strategy comparison. + +Interpret this probe as a lower-friction model-call experiment, not a safe +replacement for detection. A local one-row smoke against +`nvidia/nemotron-3-super` with vLLM JSON mode and thinking disabled produced: + +- Biography: 4.1s, 906 total tokens, 19 final signatures, 18/22 baseline + signatures shared; misses included `field_of_study` and `place_name`.
+- Legal: 4.9s, 1,308 total tokens, 21 final signatures, 19/22 baseline + signatures shared; misses included `date`, `date_of_birth`, and + `nationality`. + +That result makes a DD-free native executor worth exploring, but only if it +preserves the production safety decomposition (`GLiNER/rules -> validate -> +augment -> finalize`). The one-shot direct prompt is useful as a speed/quality +boundary, not as a production candidate. + +## DD-Free Staged Detection Probe + +Use `staged_detection_probe.py` to test a more conservative DD-free route. This +probe still avoids DataDesigner, but it does not collapse detection into one +model response. It can run direct LLM seed extraction, direct GLiNER seeding, +deterministic rule seeding, trusted deterministic rule seeding, or rule-routed +DD-free execution. It then runs direct validation and direct augmentation unless +trusted rules or the rule router short-circuit are selected, where rule spans +bypass validation. It reuses Anonymizer's existing row-level postprocessing +helpers for validation application, augmentation merge, occurrence expansion, +overlap resolution, and artifact signatures. 
+ +Example biography probe: ```bash uv run python tools/measurement/staged_detection_probe.py \ docs/data/NVIDIA_synthetic_biographies.csv \ - --text-column text \ - --endpoint http://gpu-dev-pod-serve-svc:8000/v1 \ - --model nvidia/nemotron-3-super \ - --labels person,email,api_key,password \ - --row-limit 5 \ - --output /tmp/staged-detection-probe + --text-column biography \ + --labels age,city,company_name,degree,education_level,field_of_study,first_name,language,last_name,occupation,organization_name,place_name,political_view,race_ethnicity,religious_belief,state,university \ + --endpoint http://your-openai-compatible-endpoint/v1 \ + --model your-model-id \ + --baseline-artifacts "$BASELINE_ARTIFACTS" \ + --output /tmp/staged-detection-probe-biography \ + --overwrite \ + --json ``` -Useful staged options: +Example legal probe: -- `--seed-source direct-llm`: use direct LLM seed extraction. -- `--seed-source gliner`: use direct hosted GLiNER seeding. -- `--skip-augmentation`: disable augmentation for any seed source. This is an - ablation for measuring how much recall the augmentation phase carries. -- `--validation-prompt-mode chunked-excerpt`: split seed validation candidates - into chunks of `--validation-max-entities-per-call` and send each chunk with a - tagged local excerpt bounded by `--validation-excerpt-window-chars`. 
+```bash +uv run python tools/measurement/staged_detection_probe.py \ + docs/data/TAB_legal_sample25.csv \ + --text-column text \ + --labels application_number,city,country,date,date_of_birth,nationality,person \ + --endpoint http://your-openai-compatible-endpoint/v1 \ + --model your-model-id \ + --baseline-artifacts "$BASELINE_ARTIFACTS" \ + --output /tmp/staged-detection-probe-legal \ + --overwrite \ + --json +``` -The staged tool writes `staged-detection-cases.jsonl`, +To replace the LLM seed phase with a direct GLiNER call, add +`--seed-source gliner` plus `--gliner-endpoint` and `--gliner-model`, or set +`ANONYMIZER_BENCH_GLINER_ENDPOINT` and `ANONYMIZER_BENCH_GLINER_MODEL`. The +probe reads the GLiNER API key from `--gliner-api-key-env`, which defaults to +`NVIDIA_API_KEY`. + +To replace the LLM seed phase with deterministic local rules, add +`--seed-source rules`. This still sends rule candidates through the validator. +Use `--seed-source rules-trusted` to bypass validation for high-confidence rule +spans and run only augmentation afterward. The trusted mode is a diagnostic for +rule-covered workloads; it is not a general prose/legal safety default. +Use `--seed-source rules-plus-direct-llm` to add deterministic rule spans to +direct LLM seed spans while validating only the direct LLM seed candidates. This +tests a mixed native path where obvious structured secrets are trusted locally +without giving up contextual model seeding for the rest of the record. +Use `--seed-source rules-router` to make that split explicit: if every requested +label is supported by deterministic rules, the probe runs trusted local rules +with no model calls; otherwise it falls back to `rules-plus-direct-llm`. +When the requested labels are all covered by deterministic rules, add +`--skip-augmentation-when-rule-covered` to measure a fully local short-circuit +with no model calls. +Use `--skip-augmentation` to disable augmentation for any seed source. 
This is +only a diagnostic for measuring how much recall the augmentation phase carries; +signature loss should reject the candidate even when latency improves. + +To test whether direct validation can preserve the phase boundary with less +prompt text, add `--validation-prompt-mode chunked-excerpt`. This splits seed +validation candidates into chunks of `--validation-max-entities-per-call` and +sends each chunk with a tagged local excerpt bounded by +`--validation-excerpt-window-chars`. The default remains `full-text`, which +keeps the prior one-call behavior. Treat this as a request-count/token tradeoff: +chunked excerpts can reduce prompt payload, but they also create more validator +requests and can remove context needed for labels such as legal roles, +education, demographics, or prose locations. + +The tool writes `staged-detection-cases.jsonl`, `staged-detection-artifacts.jsonl`, and `summary.json`. Case rows include per-phase usage for seed extraction, validation, and augmentation, true case -wall time in `elapsed_sec`, model-call time in `model_elapsed_sec`, +wall time in `elapsed_sec`, model-call time in `model_elapsed_sec`, plus `phase_model_work`, `phase_skip_reasons`, `phase_model_requests`, `model_phase_count`, `model_request_count`, total usage, and optional baseline -signature deltas. Treat `summary.json` as a sensitive debug artifact because it +signature deltas. Use these fields to distinguish local work, provider latency, +and a provider that returned no token accounting. +Treat the staged probe `summary.json` as a sensitive debug artifact because it records the resolved endpoint/model runtime used for the probe. +For example, a fully local rule-covered run should show `model_phase_count: 0`, +`model_request_count: 0`, `rule_covered_label_set: true`, and +`phase_skip_reasons.augmentation: "rule_covered_labels"`; `elapsed_sec` should +still capture the local rule/postprocess wall time while `model_elapsed_sec` +remains `0.0`. 
A chunked-excerpt validation run should usually keep +`model_phase_count` unchanged while raising `phase_model_requests.validation`. -Summarize staged probe outputs: +To summarize those staged probe outputs without hand-written `jq`, run: ```bash uv run python tools/measurement/analyze_staged_detection_output.py \ - /tmp/staged-detection-probe \ - --output /tmp/staged-detection-probe/analysis \ + /tmp/staged-detection-probe-biography \ + --output /tmp/staged-detection-probe-biography/analysis \ --format csv ``` @@ -338,13 +580,219 @@ The analyzer accepts either the staged output directory or the source group, and label-delta tables. Use `group_analysis.csv` for latency, token, request, and signature-overlap totals; use `label_delta_analysis.csv` to see which labels account for baseline-only misses or direct-only additions. The -analysis tables omit raw text and raw entity values. +analysis tables still omit raw text and raw entity values. + +The grouped table also includes a conservative `fast_lane_verdict`: + +- `fast_lane_candidate`: every case completed, every case was fully + rule-covered, the seed-source group has at least three cases, model requests + were zero, and baseline comparison found no missing signatures. +- `reject`: at least one case errored or the candidate lost any baseline + signature. +- `review`: baseline comparison is missing, fewer than three cases were + analyzed, the candidate still used model calls, or not every case was fully + rule-covered. + +Use `fast_lane_candidate` only as a workload-scoped promotion signal. It does +not prove that the same no-DataDesigner path is safe for prose/legal labels or +for data shapes outside the sampled suite. + +A refreshed local one-row smoke against `nvidia/nemotron-3-super` with vLLM JSON +mode and thinking disabled produced: + +- Biography: 13.7s, 4,550 total tokens, 24 final signatures, 20/22 baseline + signatures shared. 
The staged path recovered two signatures missed by the + one-shot direct probe, but still missed an `age` and a `place_name` signature + and added four direct-only signatures. +- Legal: 17.5s, 6,425 total tokens, 21 final signatures, 19/22 baseline + signatures shared. This did not improve signature overlap over the one-shot + direct probe and was materially slower. + +A direct hosted GLiNER seed smoke reached NVIDIA's endpoint but failed before +local validation with `DEGRADED function cannot be invoked` for +`nvidia/gliner-pii`. Keep the `--seed-source gliner` mode as a native executor +option, but do not treat hosted GLiNER availability as stable for local +performance conclusions. + +Rules seeding changed the tradeoff. On biography row 0, `rules` took 6.1s and +1,565 tokens but shared only 17/22 baseline signatures; `rules-trusted` took +5.2s and 1,019 tokens and shared 18/22. On legal row 0, `rules` took 7.1s and +2,213 tokens with 20/22 shared signatures; `rules-trusted` took 6.4s and 1,431 +tokens with the same 20/22 shared signatures. On the three-row shell-secrets +slice, `rules` exposed a validation regression: the validator reclassified a +database URL as a password, leaving row 1 with 2/3 shared baseline signatures. +`rules-trusted` preserved all shell baseline signatures and reduced each row to +one augmentation call, but that no-op augmentation still consumed 398-533 tokens +per row. With `--skip-augmentation-when-rule-covered`, the same trusted-rules +shell run preserved all 12 baseline signatures with zero model usage. Use this +as evidence for a native executor with rule-covered short circuiting, not as +evidence that trusted rules are safe for arbitrary text. + +Interpret this as evidence for native orchestration, not as a ready strategy. +The staged shape is closer to Anonymizer's safety model than one-shot +extraction, but the naive direct prompts spend too many tokens. 
The next useful +experiment is a native executor that preserves the same phase boundaries while +using compact production-equivalent prompts, direct provider clients, and a +cheap deterministic or detector-backed seed phase instead of LLM-seeded +extraction. + +## No-DataDesigner Strategy Pivot + +The strongest current performance signal comes from not invoking +DataDesigner at all for records whose requested labels and text shape are +covered by deterministic structured-secret extractors. On a local shell/structured-secret slice, +the staged `rules-router` path preserved every compared baseline signature with +zero model requests and millisecond-level elapsed time. In full Anonymizer +benchmarks, `rules_covered_or_default` plus `local_structured_substitute` +reduced structured substitute workloads by 38-99% wall time and removed most or +all observed model tokens, depending on whether the run still fell back to +default detection. + +The benchmark harness now has several integrated native strategies for that +next experiment. `native_rules_router` reuses the staged DD-free executor inside +Anonymizer's detection workflow, so benchmark cases still exercise the normal +replacement and measurement plumbing. `native_candidate_validate_no_augment` +removes augmentation to isolate the recall cost of that phase. +`detector_native_validate_no_augment` keeps the default detector seed and +switches only validation to direct provider calls. `native_single_pass` is the +more radical variant: it asks the local provider for all spans in one JSON +response and then lets Anonymizer validate offsets and finalize entities +locally. Use these arms to compare native provider calls against the +DataDesigner-backed `default` strategy on the same workloads. + +Treat that as a workload router, not a global replacement. 
The same DD-free +direct LLM approach on biography and legal prose still lost roughly a quarter +to a third of baseline signatures in repeated local probes, even though it +avoided DataDesigner. That is not an anonymization-safe trade by itself. The +current evidence points to three separate lanes: + +- **Structured fast lane:** if the explicit labels are all deterministic-rule + labels and rule extraction covers the workload, skip DataDesigner, skip model + calls, and use local redact/hash/substitute. This is the most promising path + for shell history, secrets, config files, audit logs, and similarly keyed + records. +- **Native model lane:** for prose or mixed records, preserve the production + detection decomposition but call providers directly: seed, validate, augment, + finalize. The prototype exists as `staged_detection_probe.py`, and the + benchmark harness includes detector-seeded and native-seeded variants, but + their current prompts are still research prompts and are too lossy/costly to + promote. +- **Single-pass model lane:** for a sharper boundary test, collapse prose or + mixed detection into one direct JSON span extraction call. This only becomes + interesting if it preserves baseline signatures; parser errors, invalid + offsets, or missed signatures should send the workload back to the default + pipeline. +- **Safety fallback:** route unsupported labels, uncertain text shapes, direct + parser failures, and signature-loss evidence back to the normal + DataDesigner-backed pipeline until a native executor proves equal or better + recall on repeated workload-specific comparisons. + +This changes the performance strategy from "make every DataDesigner phase +faster" to "avoid DataDesigner when the safety case is trivial, and use +DataDesigner as the fallback for hard cases." The benchmark interpretation +should therefore privilege signature coverage, original-value leak checks, +source provenance, and reliability flags over raw latency wins. 
A no-DD result +that is faster but loses baseline signatures remains a reject; a no-DD result +that is fully rule-covered, leak-free, and stable across repetitions is a +candidate for a production fast lane. + +## Output Layout + +A benchmark run writes one raw measurement file per case, then combines them: + +```text +benchmark-runs/suite-id/ + raw/ + inputs/ + biographies__redact-default__r000.csv + biographies__redact-default__r000.jsonl + biographies__redact-default__r000.detection-artifacts.jsonl + support__hash-agent-labels__r000.jsonl + artifacts/ + biographies__redact-default__r000/ + traces/ + biographies__redact-default__r000.jsonl + measurements.jsonl + summary.json + detection-artifacts.jsonl + tables/ + manifest.json + run.parquet + stage.parquet + record.parquet + ndd_workflow.parquet +``` + +Raw per-case JSONL files are streamed as measurement events are recorded, so a +long run leaves inspectable partial output before the case exits. The combined +`measurements.jsonl` is written after the completed and errored case files are +collected. + +Use `summary.json` to inspect case status, retry attempts, and errors. If a +case succeeds after retry, the combined `measurements.jsonl` contains the final +successful attempt while `summary.json` preserves the earlier failure messages. +Use `measurements.jsonl` when you need the original structured records. Use +`tables/` for analysis. +Use `traces/` only when `--dd-trace` was enabled and you need raw +DataDesigner message-level debugging. + +Treat `summary.json`, `raw/inputs/`, `artifacts/`, +`raw/*.detection-artifacts.jsonl`, and `traces/` as sensitive outputs. They can +contain source text, entity values, replacement values, prompts, model +responses, exception messages, or other PII-bearing debug data. The exported +measurement tables and detection signature ids are designed for analysis +without raw values, but debug sidecars are not sanitized bundles. 
+ +Detection workflow artifacts can be analyzed separately when you need to know +whether augmentation helped or only added cost. `run_benchmarks.py` writes +`detection-artifacts.jsonl` automatically when export is enabled and detection +artifacts are present. The automatic export analyzes each case immediately after +it runs, then combines per-case sidecars from `raw/`; rows include `suite_id`, +`workload_id`, `config_id`, `repetition`, `case_id`, and `run_id` so they can be +joined to `measurements.jsonl` and exported tables. `rules_only` cases do not +produce DataDesigner parquet artifacts, so the runner writes a synthetic +rules-only sidecar from the same deterministic rules. That sidecar includes +counts, source=`rule`, and opaque entity signatures, but not raw entity values. +Routed strategies whose final entity set can differ from raw DataDesigner +artifacts, including row-aware `rules_covered_or_default`, write sidecars from +the final trace dataframe so rule-routed and fallback-routed rows are both +represented. + +Row-aware routed strategies also emit sanitized route telemetry into +`measurements.jsonl`, and `analyze_benchmark_output.py` surfaces it in +`case_analysis.*` and `group_analysis.*`. Use `route_total_row_count`, +`route_rule_row_count`, and `route_fallback_row_count` to confirm how many rows +used the zero-model rules lane versus the normal detection fallback before +interpreting request, token, or latency deltas. +You can also run the analyzer by hand against an artifact directory: + +```bash +uv run python tools/measurement/analyze_detection_artifacts.py \ + benchmark-runs/suite-id/artifacts \ + --output benchmark-runs/suite-id/detection-artifacts.jsonl +``` + +The analyzer reads `entity-detection*` parquet artifacts and emits one row per +artifact row. 
It reports seed, augmentation, and final entity counts; duplicate +augmentation suggestions; new augmented values that survived into final +entities; final label/source counts; and weak `api_key` shape warnings. The +output intentionally omits raw entity values. -## Benchmark analysis +Use this alongside the exported measurement tables when comparing +`default` against `no_augment`: -`analyze_benchmark_output.py` joins `measurements.jsonl`, optional -DataDesigner traces, and detection artifact sidecars into richer case/group -tables: +- High `augmented_duplicate_seed_value_count` with low + `augmented_new_final_value_count` means augmentation probably added cost + without improving that case. +- High `augmented_new_final_value_count` means augmentation found spans that + the detector+validator path missed. +- High `weak_api_key_shape_count` usually means the label set is mismatched to + the workload. For example, legal prose constrained to + `[person, email, api_key, password]` can force dates or case identifiers into + `api_key` because better prose labels are unavailable. + +For a ready-made case and grouped summary that joins `measurements.jsonl` with +`detection-artifacts.jsonl`, use: ```bash uv run python tools/measurement/analyze_benchmark_output.py \ @@ -353,159 +801,1207 @@ uv run python tools/measurement/analyze_benchmark_output.py \ --format csv ``` -Important outputs: - -- `case_analysis.*`: one row per benchmark case. -- `group_analysis.*`: median and aggregate metrics grouped by workload/config. -- `model_usage.*`: one row per measured model usage entry. -- `model_usage_group_analysis.*`: model usage rolled up by workflow/model. +By default this joins `benchmark-runs/suite-id/measurements.jsonl` with +`benchmark-runs/suite-id/detection-artifacts.jsonl`. To use a refreshed or +relocated sidecar that still contains benchmark case metadata, pass it +explicitly: -Use `--detection-artifacts` to provide an explicit detection artifact JSONL -sidecar. 
Otherwise, the analyzer reads `detection-artifacts.jsonl` in the -benchmark directory when present. +```bash +uv run python tools/measurement/analyze_benchmark_output.py \ + benchmark-runs/suite-id \ + --detection-artifacts benchmark-runs/suite-id/current-analysis/detection-artifacts.jsonl \ + --output benchmark-runs/suite-id/current-analysis \ + --format csv +``` -`compare_strategy_pairs.py` compares baseline/candidate case rows: +The override sidecar must include `case_id` or `run_id` values that match the +measurement rows. A raw artifact scan produced from only the DataDesigner +parquet directory can summarize detection artifacts, but it cannot be safely +joined to benchmark measurements unless benchmark case metadata is preserved. + +This writes `case_analysis.*`, `group_analysis.*`, `model_analysis.*`, and +`model_group_analysis.*`. It keeps fully local cases with no model workflow +rows, such as rule-covered `rules_only` or `native_rules_router` cases, in the +comparison with zero observed requests/tokens. Native direct-call strategies +that bypass DataDesigner write `model_workflow` rows, so their provider request +and token counts still contribute to case, group, and model summaries. When the +benchmark was run with current sidecar export, `rules_only` also has +artifact-derived signatures and source counts; older runs may only have +record-level entity counts. The joined case/group tables include +successful/failed request counts, input/output token splits, record counts, +dataset input-token throughput, `seed_validation_candidate_count`, +`estimated_seed_validation_chunk_count`, and `observed_failed_request_rate`; +use these when testing +`detect.validation_max_entities_per_call` so you can distinguish a real chunk +count change from provider retry variance. The model tables split the same +usage by `workflow_name` and `model_name`, which is useful for separating local +detector cost from validator, augmenter, substitute, or rewrite model cost. 
+When record-level measurements include ground-truth entities, the joined tables +also expose exact and relaxed entity-quality metrics. The relaxed metrics count +span overlap, with small label-equivalence groups for common aliases such as +`user_name` / `username` and `api_key` / `auth_token`. Case and group tables +also count empty detections, including empty records that had ground-truth +entities. If your suite adds portable topology tags such as `endpoint_count`, +`gpu_count`, or `tensor_parallelism`, the analysis computes per-endpoint and +per-GPU input-token throughput; otherwise those normalized fields remain null. +The case/group tables also surface incomplete benchmark cases with +`case_failed`, `error_stage_count`, `error_ndd_workflow_count`, +`error_model_workflow_count`, `failed_case_count`, and `failed_case_rate`. +Check these before interpreting a fast candidate as a safe improvement; a +failed repetition can otherwise look like entity instability or a latency win. +The joined case/group tables also expose final entity source counts from +detection artifacts, including `artifact_final_detector_entity_count`, +`artifact_final_rule_entity_count`, and +`artifact_final_augmenter_entity_count`. Use these to verify whether a faster +strategy is still relying on contextual detector/validator spans, or whether it +has shifted a workload entirely onto deterministic rules. +They also include `artifact_final_entity_signature_count` and +`artifact_final_entity_signature_hashes`, which are opaque per-row identifiers +derived from the final entity label and offsets. They do not include raw or +normalized entity values. The companion +`artifact_final_entity_signature_labels` field maps each opaque hash to its +entity label. These fields do not expose raw entity values, but they let +analysis tools detect when two configs report the same entity count while +protecting different spans. 
+ +To compare a baseline and candidate strategy across common workloads, use: ```bash uv run python tools/measurement/compare_strategy_pairs.py \ benchmark-runs/suite-id/analysis/case_analysis.csv \ - --baseline-config default \ - --candidate-config native-single-pass \ - --output benchmark-runs/suite-id/analysis/default-vs-native-single-pass.csv + --baseline-strategy no_augment \ + --candidate-strategy rules_filter_guardrail_no_augment \ + --output benchmark-runs/suite-id/analysis/strategy_comparison.csv ``` -When one CSV does not contain both arms, pass `--candidate-case-analysis`: +If the candidate was run in a separate benchmark directory, pass a second case +analysis file: ```bash uv run python tools/measurement/compare_strategy_pairs.py \ - baseline/analysis/case_analysis.csv \ - --candidate-case-analysis candidate/analysis/case_analysis.csv \ - --baseline-strategy default \ - --candidate-strategy detector_native_validate_no_augment \ - --output comparison.csv + benchmark-runs/baseline-suite/analysis/case_analysis.csv \ + --candidate-case-analysis benchmark-runs/candidate-suite/analysis/case_analysis.csv \ + --baseline-strategy no_augment \ + --candidate-strategy rules_guardrail_no_augment ``` -`screen_strategy_comparisons.py` screens many comparison CSVs: +The comparison reports latency, request, token, entity-count, validation +candidate-count, augmentation-count, final source-count, and opaque +entity-signature deltas. It also reports original-value leak deltas from +`original_value_leak_count` and `original_value_leak_record_count`. The +`augmented_entity_count_delta` and +`augmented_new_final_value_count_delta` columns are especially useful for +no-augmentation and model-routing ablations: a faster candidate that removes +new final values from augmentation needs signature checks before promotion. +When signature labels are available, it also reports label counts for +baseline-only, candidate-only, and shared signatures. 
For repeated selector +runs, it also compares signatures that are stable across every repetition, +which catches cases where a candidate finds a sensitive span only +intermittently. It adds conservative flags such as +`baseline_case_failures`, `candidate_case_failures`, `entity_count_loss`, +`entity_signature_loss`, `span_boundary_mismatch`, +`covered_label_mismatch`, +`candidate_original_value_leak`, +`candidate_replacement_missing_final_entity`, +`candidate_duplicate_synthetic_replacement`, +`failed_request_increase`, `bridge_fallback_increase`, +`stable_entity_signature_loss`, `no_candidate_detector_entities`, +`candidate_uses_rule_entities`, `candidate_skips_llm_validation`, and +`replacement_only_detection_instability`, plus five verdict fields: + +- `value_protection_verdict`: `pass`, `review`, or `fail`. This axis focuses on + whether the candidate still protects the sensitive values. Candidate case + failures, candidate original-value leaks, missing replacement-map entries, + replacement collisions, and uncovered baseline signatures fail. Rule + provenance, validation skipping, + provider retry pressure, and covered boundary or label mismatches do not fail + this axis by themselves; they are represented in the semantic and overall + safety verdicts. +- `signature_parity_verdict`: `pass`, `review`, or `fail`. This axis focuses on + exact baseline signature semantics. Covered label or boundary mismatches stay + review-gated even when `value_protection_verdict` passes. +- `safety_verdict`: `pass`, `review`, or `fail`. Candidate case failures and + entity/signature loss fail. Candidate original-value leaks also fail, even + when entity signatures match. Baseline case failures, baseline + original-value leaks, rule-only, rule-heavy, or validation-skipping + candidates require review. Candidate provider failed-request increases or + bridge-fallback increases also require review: they are reliability signals, + not anonymization leaks. 
+- `performance_verdict`: `improved`, `mixed`, `regressed`, `unchanged`, or + `unknown`, based on available latency, request, and token deltas. +- `candidate_verdict`: `candidate_viable`, `review`, or `reject`. A candidate + is viable only when safety passes and measured performance improves. + +Use verdicts for triage, then inspect the underlying flags and label-count +deltas before promoting a strategy beyond benchmark experiments. +For replacement-only comparisons where the detection strategy is unchanged, +`replacement_only_detection_instability` means the candidate and baseline were +still run through independent detection passes and their detection artifacts +drifted. Treat that as a prompt to consult fixed-trace replacement replay before +blaming or promoting the replacement-map backend. +In fixed-trace replacement replay, +`candidate_duplicate_synthetic_replacement` means the local replacement backend +protected every original value but collapsed at least two replacements in the +same row to the same synthetic value. That is review-gated as a substitute +quality and relational-consistency concern rather than treated as an immediate +privacy leak. +When the replay CSV contains +`candidate_covers_baseline_replacement_missing_final_entity`, +`candidate_covers_baseline_original_value_leak`, or +`candidate_covers_baseline_replacement_synthetic_original_collision`, the +candidate removed a defect observed in the DataDesigner-backed substitute arm +on the same fixed detection trace. In that case `value_protection_verdict` can +pass while `signature_parity_verdict` remains review-gated, because the +candidate covered more of the final-entity set than the flawed baseline. 
+ +For `rules_covered_or_default`, compare rule-covered configs by config ID so +the zero-model lane is checked against the same explicit label set: ```bash -uv run python tools/measurement/screen_strategy_comparisons.py benchmark-runs/ \ - --output benchmark-runs/strategy-screen.csv +uv run python tools/measurement/compare_strategy_pairs.py \ + benchmark-runs/suite-id/analysis/case_analysis.csv \ + --baseline-config rule-labels-default \ + --candidate-config rule-labels-covered-or-default \ + --output benchmark-runs/suite-id/analysis/rules-covered-comparison.csv ``` -Use `--group-by strategy_workload_family` when the same candidate behaves -differently across workload families. Use `--config-aliases aliases.json` to -group related config IDs, such as temperature or validation-window variants of -the same strategy. +Promote the fast path only when +`baseline_only_candidate_uncovered_signature_count` is zero on the target +workload, `candidate_original_value_leak_count` is zero, `candidate_verdict` is +at least `review`, and the review flags are expected rule fast-lane flags such +as `candidate_uses_rule_entities`, `no_candidate_detector_entities`, +`entity_count_loss`, or `span_boundary_mismatch`. Exact +`baseline_only_final_entity_signature_count` can be nonzero when a candidate +protects the same sensitive value with a wider or slightly narrower keyed span; +use the covered/overlapping/uncovered columns to decide whether that is an +acceptable workload policy. A run that has uncovered signatures or leaks +original detected values should reject: +in the June 8, 2026 sudo-password smoke run, the pre-fix comparison rejected the +candidate with `lost_labels=password:1`; after the narrow sudo rule was added, +the same comparison had no baseline-only signatures and remained review-gated +only because the final spans were rule-sourced. 
+ +The command output also includes a rollup summary with verdict counts and the +workloads in each candidate-verdict bucket, which is useful for repeated runs +over larger suites. + +To screen many comparison CSVs from one or more benchmark directories, use: + +```bash +uv run python tools/measurement/screen_strategy_comparisons.py \ + benchmark-runs/ \ + --output benchmark-runs/strategy-screen.csv \ + --group-output benchmark-runs/strategy-groups.csv +``` -## Pandas patterns +When screening a scratch directory that contains older analysis outputs, filter +by source-path fragments: -Analysis tables are regular CSV/Parquet files. A typical local workflow: +```bash +uv run python tools/measurement/screen_strategy_comparisons.py \ + /tmp/anonymizer-benchmark-scratch \ + --source-include analysis-current-csv \ + --source-include analysis-failure-aware-csv \ + --output current-strategy-screen.csv \ + --group-output current-strategy-groups.csv +``` + +Use `--source-exclude` to omit known stale or exploratory subdirectories. +For example, if a scratch directory contains a pre-fix comparison and a rerun, +screen only current evidence by excluding the stale source-path fragment: + +```bash +uv run python tools/measurement/screen_strategy_comparisons.py \ + /tmp/anonymizer-perf-goal \ + --source-include comparison \ + --source-exclude before-sudo \ + --source-exclude structured-secrets-varied-comparison.csv \ + --output /tmp/anonymizer-perf-goal/strategy-screen-current.csv \ + --group-output /tmp/anonymizer-perf-goal/strategy-screen-current-groups.csv +``` + +The screen walks CSV files recursively, ignores non-comparison tables such as +`case_analysis.csv` and `group_analysis.csv`, and combines rows produced by +`compare_strategy_pairs.py`. It deduplicates exact repeated rows from copied +analysis directories, then sorts viable candidates first, then review and reject +rows, preserving latency/token deltas, flags, lost-label summaries, and +augmentation deltas. 
It also preserves baseline/candidate case counts, +baseline/candidate detection strategies, baseline/candidate replacement +strategies, stable-signature evidence counts, and candidate original-value leak +counts and labels. For DataDesigner-free experiments, it also preserves +`value_protection_verdict`, `signature_parity_verdict`, and label-mismatch +label counts, so one-off candidate rows are visible as weak evidence even before +opening the source comparison CSV. This is the quickest way to check whether a +benchmark directory contains any candidate worth rerunning on a larger workload +slice. + +Use the `evidence_level` column to separate current safety evidence from older +or weaker comparison rows. `split_verdicts` means the row has separate value +protection and signature-parity verdicts, `stable_signatures` means it has +stable-signature counts but not split verdicts, `signature_counts` means it only +has raw signature counts, and `legacy` means the screen can only use the older +aggregate verdict columns. The group output includes `evidence_level_counts` so +mixed scratch directories do not make a legacy row look as strong as a current +split-verdict rerun. + +The optional group output aggregates rows by candidate strategy when the +candidate used a non-default experimental strategy, or by candidate config +otherwise. This keeps ordinary config experiments, such as model routing or +prompt-parameter changes, from being collapsed under `strategy:default`. 
When +the same experiment used multiple config IDs, pass a JSON alias map: + +```json +{ + "biography-hybrid-augment-temp07": "biography-temp07-routing", + "biography-augment-temp07": "biography-temp07-routing" +} +``` + +```bash +uv run python tools/measurement/screen_strategy_comparisons.py \ + benchmark-runs/ \ + --group-by strategy_workload_family \ + --config-aliases config-aliases.json \ + --group-output benchmark-runs/strategy-family-groups.csv +``` + +Aliases only affect default-strategy, default-replacement config grouping. +Non-default experimental detection strategies still group by strategy; when a +candidate also uses a non-default replacement strategy, the group key appends +`replacement:<strategy>`. If detection is default and only replacement changes, +the group key is `replacement:<strategy>`. Use the group output to find +candidates with conflicting evidence, such as a no-augmentation candidate that +passes one slice and rejects on another. The +group table includes both best and worst latency, token, and request deltas so a +single fast slice does not hide a slower or unsafe repeat. It also includes +minimum baseline/candidate case counts and the minimum shared stable-signature +count observed in the group, plus summed candidate original-value leak counts +and leak labels.
The +`recommendation` column is deliberately conservative: +`single_slice_viable` means one viable row exists but needs repeat evidence, +`candidate_family_viable` requires two or more viable rows and no review or +reject rows, `promising_needs_review` means viable rows exist but review-gated +rows remain and at least one split-verdict row is also viable, +`needs_split_verdict_rerun` means viable-looking and review-gated rows exist but +the group has only older signature-count or stable-signature evidence, or a +review-only group mixes current split-verdict rows with older comparison rows +that should be rerun under the current verdict schema, +`needs_viable_split_verdict` means older viable rows exist and split-verdict +evidence exists, but every split-verdict row is still review- or reject-gated, +`replacement_replay_review` means an improved replacement-strategy group is +review-gated by detection artifact drift even though the detection strategy did +not change; use fixed-trace replacement replay to isolate replacement-map +behavior, +`reliability_review` means every row improved performance but one or more rows +are review-gated by provider reliability signals such as failed-request or +sync-bridge fallback increases, +`fast_lane_review` means a `rules_only` or +`rules_covered_or_default` group improved performance, had explicit zero +candidate original-value leaks, had no uncovered baseline signatures, and is +review-gated only by expected rule fast-lane provenance or span-boundary flags, +`label_policy_review` means every row improved performance, passed +`value_protection_verdict`, and was review-gated on `signature_parity_verdict` +because the candidate protected a baseline value under a different label, +`review_only` means the family has no failures, still needs manual review, and +every review-gated row is `improved`, +`review_mixed_performance` +means the family has no failures but has mixed performance evidence, +`no_performance_win` means 
review-gated rows exist without an improvement +signal, `reject` means no viable rows survived, and `conflicting_evidence` means +at least one viable row and at least one rejected row exist for the same +candidate family. + +When a strategy's safety depends on workload shape, group by workload family: + +```bash +uv run python tools/measurement/screen_strategy_comparisons.py \ + benchmark-runs/ \ + --group-by strategy_workload_family \ + --output benchmark-runs/strategy-screen.csv \ + --group-output benchmark-runs/strategy-family-groups.csv +``` + +This keeps evidence from families such as shell-secret command logs, legal +records, and biographies separate. Use this mode before claiming a broad +performance improvement from a strategy that may only be safe on rule-covered +secret workloads. Use `--group-by strategy_workload` for an even stricter +per-workload grouping. + +The exporter groups records by `record_type`: + +- `run`: one row per Anonymizer run, with sanitized config, workload, model, and + runtime metadata. +- `stage`: one row per measured pipeline stage, with elapsed time, row counts, + and throughput fields. +- `record`: one row per input row when record-level measurement is enabled, + with text-size buckets, entity counts, replacement counts, rewrite scores, + and estimated nominal LLM call counts. +- `ndd_workflow`: one row per DataDesigner adapter call, with model aliases, + elapsed time, row counts, failed-record counts, and observed token/request + usage when DataDesigner exposes it. +- `model_workflow`: one row per non-DataDesigner model-backed workflow, such as + `native_rules_router`, `native_candidate_validate_no_augment`, + `detector_native_validate_no_augment`, + `detector_native_validate_native_augment`, `native_single_pass`, and the + other `native_single_pass*` strategies, with the same sanitized usage fields + as `ndd_workflow`. + +The tables never store raw text, prompts, generated outputs, entity values, or +replacement maps. 
`record_hash` is a run-scoped HMAC, so it can join rows within +one run but should not be treated as a durable dataset identifier. + +## Analysis Patterns + +Start with these questions: + +- Which workload/config pair is fastest at the same quality target? +- Which stage dominates wall time: detection, replacement, rewrite, or a + DataDesigner sub-workflow? +- Does latency scale with text length, entity count, or rewrite repair work? +- Do token counts, request counts, and failed records explain latency outliers? +- Are quality metrics worse on one data shape, such as legal text, biographies, + support tickets, shell history, or mixed natural-language/code records? + +Most analyses join `stage`, `record`, `ndd_workflow`, and `model_workflow` back +to `run` through `run_id`, then group by run tags: + +- `run_tags.suite_id` +- `run_tags.workload_id` +- `run_tags.config_id` +- `run_tags.experimental_detection_strategy` +- `run_tags.experimental_replacement_strategy` +- `run_tags.dd_parser_compat` +- `run_tags.repetition` +- `run_tags.case_id` + +Prefer medians and percentiles over averages when comparing latency. LLM calls +usually have long tails, and one retry or provider stall can distort a mean. + +For staged DD-free detection probes, convert the probe output first: + +```bash +uv run python tools/measurement/analyze_staged_detection_output.py \ + /tmp/anonymizer-perf-goal/no-dd-rules-plus-direct-biography-r5-current \ + --output /tmp/anonymizer-perf-goal/no-dd-rules-plus-direct-biography-r5-current/analysis \ + --format csv +``` + +Then read `analysis/group_analysis.csv` to compare `elapsed_sec_sum`, +`model_elapsed_sec_sum`, `model_request_count_sum`, `total_tokens_sum`, +`baseline_shared_signature_rate`, and +`baseline_only_final_entity_signature_count_sum`. 
Use `fast_lane_verdict` as +the first gate: `reject` means stop and inspect losses before running larger +slices; `fast_lane_candidate` means the sampled workload is a plausible +zero-model rule-covered lane with repeated evidence; `review` means the output +is incomplete, has too few cases, or still uses model work. The staged analyzer +requires at least three cases in a seed-source group before a clean zero-model +run can become `fast_lane_candidate`; one-row smokes remain `review` even when +they preserve all compared signatures. Read +`analysis/label_delta_analysis.csv` when the shared-signature rate is low; it +shows which labels drove the baseline-only losses or direct-only additions. + +## Pandas Examples + +Load exported tables: ```python +from pathlib import Path + import pandas as pd -cases = pd.read_parquet("benchmark-runs/suite/analysis/case_analysis.parquet") -groups = pd.read_parquet("benchmark-runs/suite/analysis/group_analysis.parquet") - -cols = [ - "workload_id", - "config_id", - "experimental_detection_strategy", - "median_pipeline_elapsed_sec", - "median_observed_total_requests", - "median_observed_total_tokens", - "median_artifact_final_entity_signature_count", -] -print(groups[cols].sort_values(["workload_id", "median_pipeline_elapsed_sec"])) +tables = Path("benchmark-runs/shell-and-biography-smoke/tables") +run = pd.read_parquet(tables / "run.parquet") +stage = pd.read_parquet(tables / "stage.parquet") +record = pd.read_parquet(tables / "record.parquet") +ndd = pd.read_parquet(tables / "ndd_workflow.parquet") +``` + +Compare end-to-end stage latency by workload and config: -failures = cases[ - (cases["case_failed"]) | - (cases["observed_failed_requests"] > 0) | - (cases["dd_trace_error_count"] > 0) +```python +stage_group_cols = ["run_tags.workload_id", "run_tags.config_id", "stage"] + +stage_summary = ( + stage + .groupby(stage_group_cols) + .agg( + runs=("run_id", "nunique"), + median_sec=("elapsed_sec", "median"), + p95_sec=("elapsed_sec", 
lambda s: s.quantile(0.95)), + rows_per_sec=("rows_per_sec", "median"), + ) + .reset_index() + .sort_values(["run_tags.workload_id", "stage", "median_sec"]) +) + +print(stage_summary) +``` + +Find slow records and relate them to text size and entity count: + +```python +record_view = record[ + [ + "run_tags.workload_id", + "run_tags.config_id", + "record_hash", + "text_length_tokens", + "text_length_tokens_bucket", + "final_entity_count", + "nominal_llm_call_count", + "utility_score", + "leakage_mass", + ] +].copy() + +shape_group_cols = [ + "run_tags.workload_id", + "run_tags.config_id", + "text_length_tokens_bucket", ] -print(failures[["case_id", "config_id", "observed_failed_requests", "dd_trace_error_count"]]) + +by_shape = ( + record_view + .groupby(shape_group_cols) + .agg( + records=("record_hash", "count"), + median_entities=("final_entity_count", "median"), + median_nominal_calls=("nominal_llm_call_count", "median"), + median_utility=("utility_score", "median"), + median_leakage=("leakage_mass", "median"), + ) + .reset_index() +) + +print(by_shape) ``` -Compare a candidate against a baseline: +Summarize DataDesigner token and request usage: ```python -comparison = pd.read_csv("benchmark-runs/suite/analysis/default-vs-native.csv") -candidate_rows = comparison[ - ["workload_id", "candidate_verdict", "safety_verdict", "performance_verdict", "flags"] +workflow_group_cols = [ + "run_tags.workload_id", + "run_tags.config_id", + "run_tags.experimental_detection_strategy", + "run_tags.experimental_replacement_strategy", + "run_tags.dd_parser_compat", + "workflow_name", ] -print(candidate_rows) + +token_summary = ( + ndd + .groupby(workflow_group_cols) + .agg( + calls=("workflow_name", "count"), + median_sec=("elapsed_sec", "median"), + total_input_tokens=("observed_input_tokens", "sum"), + total_output_tokens=("observed_output_tokens", "sum"), + total_requests=("observed_total_requests", "sum"), + failed_records=("failed_record_count", "sum"), + ) + .reset_index() 
+ .sort_values(["run_tags.workload_id", "run_tags.config_id", "median_sec"]) +) + +print(token_summary) +``` + +Summarize provider usage by workflow and model: + +```python +model_usage = pd.read_csv("benchmark-runs/suite-id/analysis/model_group_analysis.csv") + +retry_sources = ( + model_usage + .sort_values( + ["sum_observed_failed_requests", "sum_observed_total_tokens"], + ascending=[False, False], + ) + [ + [ + "workload_id", + "config_id", + "workflow_name", + "model_name", + "sum_observed_total_requests", + "sum_observed_failed_requests", + "observed_failed_request_rate", + "sum_observed_total_tokens", + ] + ] +) + +print(retry_sources) ``` -Find candidate-specific misses: +Join run metadata to stage timing: ```python -loss_cols = [ - column for column in comparison.columns - if column.startswith("baseline_only_final_entity_signature_label_counts.") +run_meta = run[ + [ + "run_id", + "mode", + "strategy", + "detect.entity_label_count", + "detect.validation_max_entities_per_call", + ] ] -print(comparison[["workload_id", *loss_cols]].fillna(0)) + +stage_with_config = stage.merge(run_meta, on="run_id", how="left") + +config_group_cols = ["mode", "strategy", "detect.entity_label_count", "stage"] + +print(stage_with_config.groupby(config_group_cols)["elapsed_sec"].median()) ``` -## Metric interpretation +For quick interactive work, CSV can be easier than Parquet: + +```bash +uv run python tools/measurement/export_measurements.py \ + benchmark-runs/suite-id/measurements.jsonl \ + --output /tmp/suite-csv \ + --format csv \ + --overwrite +``` + +## Signature Delta Review + +Use `extract_signature_deltas.py` when a fast candidate has fewer, more, or +different final entity signatures than a higher-recall reference run. The tool +compares two `detection-artifacts.jsonl` files and recovers local context from +the DataDesigner artifact parquet files. 
Entity values are masked by default: +the output stores label, source, span offsets, value length, signature id, and a +small context window with the entity replaced by a placeholder. It does not +emit a hash derived from the raw entity value. + +Example: review spans found by a text/raw-parser reference but missed by a +hybrid candidate for one workload/config pair: + +```bash +uv run python tools/measurement/extract_signature_deltas.py \ + /tmp/reference/detection-artifacts.jsonl \ + /tmp/candidate/detection-artifacts.jsonl \ + --baseline-artifact-root /tmp/reference/artifacts \ + --candidate-artifact-root /tmp/candidate/artifacts \ + --baseline-config legal-default \ + --candidate-config legal-hybrid-rules-guardrail \ + --workload legal-r2 \ + --output /tmp/legal-signature-deltas.csv \ + --format csv +``` + +Interpretation: + +- `baseline_only` rows are spans the candidate missed relative to the + reference. +- `candidate_only` rows are spans the candidate found that the reference did + not. +- `resolution=parquet` means the span was recovered from DataDesigner's final + detection artifacts. +- `resolution=artifact_details` means the span was reconstructed from + sanitized final signature details plus the artifact row's source text. This + is common for benchmark-only strategies that patch final entities from an + in-memory dataframe after a seed-stage artifact is written. +- `resolution=rule` means the span was reconstructed from deterministic + rule-guardrail logic because it was added after DataDesigner wrote parquet. +- `resolution=metadata_only` means only the opaque signature metadata was + available; use this as a signal to rerun with trace/artifact capture if the + delta matters. + +## Current Local Findings + +These findings come from small local vLLM runs against +`nvidia/nemotron-3-super`; treat them as triage signals, not defaults. 
+ +| Strategy | Latest local result | Status | Implication | +| --- | --- | --- | --- | +| `rules_only` on the three-row shell-secrets slice | Preserved all 12 stable signatures; median latency moved from 7.2s to 0.004s, requests from 12 to 0, and tokens from 11,019 to 0. | Review | Viable only for bounded secret scans where every requested label is covered by deterministic rules. | +| `rules_guardrail_detector_only` on the same shell-secrets slice | Preserved stable signatures and reduced model work, but one candidate repetition failed during GLiNER health checks. | Review | Useful as a structured-secret diagnostic, but less attractive than `rules_only` when labels are fully rule-covered. | +| `rules_filter_guardrail` on the same shell-secrets slice | Retry-enabled rerun completed all 6 cases. It preserved all 12 signatures, reduced seed validation candidates from 11 to 0, median pipeline latency from 8.0s to 3.9s, requests from 12.5 to 7.0, and tokens from 10,966 to 3,647. | Review | Useful as a mixed-workload probe; keep it review-gated because final entity provenance is rule-only for this slice. | +| `rules_filter_guardrail` on a mixed biography/legal/shell probe | After changing rule filtering to preserve different-label overlaps, repeated two-row biography, one-row legal, and three-row shell runs had no stable or unstable signature loss. Median pipeline latency moved from 28.5s to 20.0s on biography, 19.4s to 18.2s on legal, and 8.7s to 6.1s on shell. | Review | Historical positive probe only; the larger five-row non-shell repeat below did not preserve this signal. | +| `rules_filter_guardrail` on offset biography/legal slices | After hardening rule filtering so only fully covered same-label spans are skipped and rule reinsertion is additive, the five-row biography offset slice had no signature loss but moved into review because requests increased slightly while tokens decreased. 
The richer two-row legal offset slice rejected: latency, requests, and tokens regressed and one repetition missed three `court_name` signatures while adding one rule-backed `date_of_birth`. | Mixed | The hardened strategy is safer than the first version, but it still needs per-workload gates and is not a broad legal/prose default. | +| `rules_filter_guardrail` on current five-row biography/legal repeats | Biography preserved stable and unstable signatures but regressed latency from 37.8s to 45.9s and requests from 20.5 to 22.5. Legal improved latency from 60.6s to 51.2s and tokens from 63,072 to 61,568, but lost five stable `date` signatures and made two stable `person` signatures unstable. | Reject | Do not promote this as a prose/legal default; the safety and latency tradeoff is workload-dependent and fails the legal signature gate. | +| `rules_guardrail` on a five-row legal slice | Same-suite repeated comparison against default preserved stable and unstable signatures, but latency regressed from 39.6s to 47.1s, requests rose from 20.0 to 20.5, and tokens were roughly flat at 60,998 to 60,757. | Mixed | Deterministic date guardrails can improve coverage without signature loss, but they are not a legal-prose performance win on this slice. | +| `detector_only` and `rules_guardrail_detector_only` on prose/legal slices | Faster on one-row smoke checks, but lost baseline signatures on biography and legal samples. A current detector-only isolation rerun moved biography 27.3s → 0.9s and 8,416 → 526 tokens, but lost two `first_name` and one `organization_name` signatures. Legal moved 52.0s → 1.0s and 14,095 → 1,078 tokens, kept `date_of_birth`, but still lost one `date` and one `nationality` signature while adding many extra spans. | Reject | Local finalization alone is not a safe replacement for validation and augmentation on contextual text. 
The legal rerun is useful diagnostically because raw detector output kept `date_of_birth`, so a later native-validation miss likely came from validation behavior rather than detector seeding. | +| One-shot DD-free direct detection on biography/legal row 0 | Biography completed in 5.1s with 902 tokens but shared only 18/22 baseline signatures. Legal completed in 5.8s with 1,308 tokens but shared only 19/22 baseline signatures. | Reject as replacement | This is a useful speed boundary and prompt experiment, but a single extraction prompt drops core detections on non-shell workloads. | +| Standalone direct-detection five-row probe | A fresh local probe compared one direct extraction call per row against the current staged direct reference. On legal, compact direct detection moved from 62.3s and 15 requests to 17.1s and 5 requests, but shared only 75/147 reference signatures and missed 72. Recall prompting improved legal to 31.1s, 109 final entities, 102 shared, and 45 missed. On biographies, compact direct detection moved from 85.7s and 15 requests to 21.2s and 5 requests, with 91/102 shared signatures, 11 missed, and no extras; recall prompting regressed to 62 shared and 40 missed. Outputs: `/tmp/anonymizer-perf-goal/direct-detection-legal-r5-compact-after-guard`, `/tmp/anonymizer-perf-goal/direct-detection-legal-r5-recall-after-guard`, `/tmp/anonymizer-perf-goal/direct-detection-biography-r5-compact-after-guard`, `/tmp/anonymizer-perf-goal/direct-detection-biography-r5-recall-after-guard`. The benchmark harness can now run this value-only prompt shape through `native_single_pass_values` and `native_single_pass_values_recall`. | Mixed diagnostic | The one-call path is the clearest lower-bound latency test, but it is not a general anonymization-safe replacement. Compact one-call extraction may deserve workload-specific follow-up for biographies; legal still needs augmentation or a stronger candidate source. Recall prompting is not monotonic across domains. 
| +| Staged DD-free detection on biography/legal row 0 | Biography improved to 20/22 shared signatures but took 13.7s and 4,550 tokens. Legal stayed at 19/22 shared signatures while taking 17.5s and 6,425 tokens. Hosted GLiNER seeding was unavailable due to a `DEGRADED function cannot be invoked` response for `nvidia/gliner-pii`. | Mixed diagnostic | A native no-DataDesigner executor is still plausible, but only if it preserves phase boundaries with much cheaper seed/validation prompts or deterministic code. Naive direct LLM phases are not enough. | +| Chunked-excerpt validation in staged DD-free detection | On current one-row reruns, biography preserved the same 20/22 shared signatures as full-text validation but moved from 10.8s, 4,527 tokens, and 3 model requests to 13.3s, 5,648 tokens, and 6 requests. Legal preserved the same 19/22 shared signatures but moved from 14.7s, 6,425 tokens, and 3 requests to 17.2s, 7,727 tokens, and 7 requests. | Reject | Splitting direct validation into local excerpts increases repeated instruction overhead and request count on these non-shell rows. Do not pursue validator excerpting as a standalone no-DD speedup unless longer records show a different request/token crossover. | +| Rules-seeded staged DD-free detection | `rules` improved biography/legal latency but still lost baseline signatures; legal row 0 reached 20/22 shared signatures at 7.1s and 2,213 tokens. On shell-secrets, validation reclassified a database URL as a password and lost one baseline URL signature. `rules-trusted` fixed that shell loss and preserved all 12 shell signatures with one augmentation call per row, but still missed biography/legal signatures. With `--skip-augmentation-when-rule-covered`, trusted rules preserved all 12 shell signatures with zero model usage. | Mixed diagnostic | Deterministic seed spans are useful, but rule-covered spans should not always go through LLM validation.
A native executor needs workload gates and should short-circuit locally when every requested label is rule-covered. | +| Rules + direct LLM staged DD-free detection | `rules-plus-direct-llm` preserved all 12 shell-secrets signatures while avoiding validation, but still used two model calls per row and 726-938 tokens because the direct seed and augmentation phases still ran. On row-0 smokes it looked like the most plausible mixed no-DD path: biography shared 20/22 signatures at 10.8s and 4,465 tokens, and legal shared 20/22 at 11.5s and 5,929 tokens. The five-row gate rejected it: biography shared only 80/114 baseline signatures, lost 34 signatures, and took 85.7s versus the DD baseline's 32.9-47.8s; legal shared 108/145, lost 37 signatures, and took 62.3s versus the DD baseline's ~39.5s. | Reject for contextual workloads | Trusting deterministic structured spans locally is still useful, but direct LLM seed/validation/augmentation is not a safe or faster replacement for DataDesigner-backed contextual detection on prose/legal slices. Keep no-DD promotion limited to fully rule-covered structured-secret lanes unless a new native executor passes repeated signature gates. | +| Rules router staged DD-free detection | `rules-router` preserved all 12 shell-secrets signatures with no seed, validation, or augmentation model calls. The mixed/contextual fall-through did not generalize: on five biographies it shared 96/114 default signatures and lost 18 baseline signatures; on five legal rows it shared 86/145 default signatures and lost 59 baseline signatures. The benchmark-safe expression of this result is `rules_covered_or_default`, which short-circuits only fully rule-covered label sets and otherwise runs default detection. | Mixed | Keep the router only for the zero-model rule-covered structured-secret lane. Do not use the direct local LLM fall-through as a prose/legal replacement; use default Anonymizer or another signature-gated strategy for contextual rows. 
| +| Integrated `native_rules_router` benchmark with corrected direct-call metering | A five-row benchmark-harness run on biography/legal confirmed the staged finding. Biography moved from 32.9s to 85.6s, requests from 20 to 15, and tokens from 43,354 to 26,644, but entities fell from 114 to 102 and 34 baseline signatures were uncovered. Legal moved from 54.3s to 62.3s, requests from 21 to 15, and tokens from 60,649 to 31,894, but 37 baseline signatures were uncovered. Both workloads rejected. | Reject for contextual workloads | Direct native calls can reduce request and token counts while still losing safety and wall-time. Treat lower token counts as insufficient evidence; contextual promotion requires signature preservation and latency improvement together. | +| Integrated `native_candidate_validate_no_augment` smoke | One-row biography/legal benchmark-harness smoke proved the no-augmentation native executor is much cheaper but unsafe. Biography moved from 24.8s to 5.9s, requests from 4 to 2, and tokens from 8,092 to 2,000, but entities fell from 15 to 12 and lost `age`, `first_name`, and `organization_name` signatures. Legal moved from 49.8s to 10.9s, requests from 4 to 2, and tokens from 13,791 to 3,823, but entities fell from 23 to 21 and lost `date`, `date_of_birth`, and `nationality` signatures. Both rows had zero original-value leaks. | Reject for contextual workloads | Removing augmentation from the native executor gives the expected speed boundary, but augmentation or a stronger candidate source remains load-bearing for contextual recall. Keep this arm as a diagnostic, not a promotion candidate. | +| Integrated `detector_native_validate_no_augment` smoke | Keeping the detector seed and replacing DataDesigner validation/augmentation with direct validation is much cheaper, but quality remains workload-dependent. 
Biography still rejects: latest one-row rerun moved 26.6s -> 6.7s and 8,398 -> 2,347 tokens, but entities fell 15 -> 14 and two augmenter-sourced child `first_name` signatures were uncovered. A focused one-row legal repeat improved median latency from 15.0s to 11.0s, requests from 4 to 3, and tokens from 9,516 to 4,150 with zero leaks. After row-parallel direct validation plus deterministic DOB-context label normalization, a wider three-row, two-repeat legal gate moved median elapsed from 40.6s to 21.3s, requests from 12.5 to 6.5, and tokens from 37,972 to 17,902 with zero original-value leaks. The split verdicts were `value_protection=pass`, `signature_parity=review`, and `performance=improved`: a filing-date span that baseline labeled `date_of_birth` was protected as `date`, while separate birth-context years were added as `date_of_birth`. | Mixed: biography reject, legal label-policy review | The promising shape is not "remove DataDesigner everywhere"; it is "keep DD as fallback, use deterministic fast lanes where provably covered, and only replace validation when a native validator preserves both coverage and label semantics across repeated gates." The legal repeats now show a real latency win, but a DD-free candidate may protect values correctly while disagreeing with DataDesigner label semantics. That should stay review-gated until label policy says whether such covered reclassification is acceptable. | +| Integrated `detector_native_validate_no_augment` substitute gate | A one-row legal substitute smoke first showed the same review shape as the redact gate: latency moved from 21.1s to 15.2s, requests from 5 to 3, and tokens from 12,192 to 6,871 with zero original-value leaks. 
The wider three-row, two-repeat substitute gate still improved performance, but rejected on safety: median pipeline latency moved from 44.0s to 33.4s, requests from 15.0 to 9.0, tokens from 47,958 to 28,465.5, and failed requests from 3.0 to 0.0, while both baseline and candidate leaked two original `date` values across two row-runs. Replacement-map coverage was complete; local replay showed the leak was a substitute collision where one synthetic date reused another protected original date in the same record. The candidate added 11 stable signatures, but had covered label mismatches including a stable `date_of_birth` -> `date` mismatch. | Reject for substitute promotion | Native validation reduces detection cost even when substitute still uses normal replacement-map generation, but speed cannot promote a substitute strategy while original values survive in replaced output. The leak appears in the default substitute arm too, so this is a baseline substitute safety issue separate from the native validator. | +| Integrated `gliner_native_validate_*` no-DataDesigner gate | A one-row biography/legal smoke tested direct hosted GLiNER seeding outside DataDesigner plus direct native validation, with and without direct native augmentation. Biography no-augment rejected despite improving latency/tokens because it lost two `first_name` signatures. Biography with native augmentation passed the one-row gate: latency 13.7s -> 10.2s, requests 4 -> 3, tokens 8,033 -> 5,035, entities 22 -> 24, zero leaks, and only candidate-only additions. After bounded per-row parallelism and targeted label-boundary guidance in the integrated no-DD executor, a repeat-3 five-row biography gate improved median wall time 40.7s -> 25.5s, requests 21 -> 15, and tokens 43,371 -> 27,643 with zero original-value leaks and no case failures. 
The guidance removed the earlier `first_name` label mismatches, but repeat comparison rejected the candidate: four baseline signatures were only covered with mismatched labels (`degree`: 1, `last_name`: 2, `place_name`: 1), and six stable baseline signatures became unstable (`degree`: 1, `last_name`: 2, `organization_name`: 2, `place_name`: 1). Legal improved latency/tokens in both one-row arms, but stayed review-gated because a generic filing date that baseline labeled `date_of_birth` was protected as `date` by the candidate; the seed guardrail correctly does not promote dates without birth/DOB context. | Reject for contextual biographies | Direct GLiNER outside DataDesigner is a useful performance diagnostic, but repeated stable-signature gates block promotion on this biography slice. Lower requests/tokens plus faster wall time are insufficient if label semantics are unstable. | +| Integrated `native_single_pass` benchmark smoke | One-row benchmark-harness smoke on biography/legal showed the speed boundary for collapsing detection into one direct provider call. Biography improved latency 10.3s → 1.7s, requests 4 → 1, and tokens 5,059 → 597, but found 4 entities versus 7 and lost three `person` signatures, so it rejected. Legal improved latency 19.2s → 1.1s, requests 5 → 1, and tokens 7,107 → 838 while preserving both signatures, so that single row was viable. | Mixed diagnostic | The one-call native extractor is worth keeping as a benchmark arm, but it is not safe for broad contextual use. Promotion needs repeated workload-specific signature gates; a legal-row win does not cancel the biography miss. | +| Integrated `native_single_pass` five-row gate | After adding a local deterministic rule guardrail, the larger biography/legal run still rejected both contextual workloads. Biography moved from 24.1s to 8.3s, requests from 21 to 5, and tokens from 26,759 to 3,078, but entities fell from 36 to 21 and it lost 16 `person` signatures. 
Legal moved from 35.7s to 6.1s, requests from 21 to 5, and tokens from 38,569 to 5,781, but entities fell from 14 to 12 and it lost three `person` signatures. | Reject for contextual workloads | Local rules can cheaply protect deterministic secret shapes, but they do not fix contextual recall. Collapsing detection to one direct call remains a useful lower-bound latency experiment, not a safe contextual replacement. | +| Integrated `native_single_pass_recall` five-row gate | The recall prompt improved raw recall, especially on legal text, but still rejected both workloads. Biography moved from 23.0s to 10.2s, requests from 21 to 5, and tokens from 26,730 to 4,072, but entities fell from 36 to 26 and it still lost 16 `person` signatures. Legal moved from 32.2s to 8.7s, requests from 21 to 5, and tokens from 38,085 to 6,885; entity count rose from 14 to 20, but two baseline `person` signatures were still uncovered. | Reject for contextual workloads | Prompt recall can improve counts without satisfying anonymization safety. One-call contextual extraction remains below the signature gate even when it is much faster and cheaper than default. | +| Integrated `native_single_pass_values*` value-only five-row gate | Two repetitions on five NVIDIA biography rows and five TAB legal rows confirmed the value-only one-call prompt shape is only a speed boundary. Compact values mode improved latency by 55.6% on biographies and 68.9% on legal, with 15-15.5 fewer requests and 31,770-60,491 fewer tokens, but rejected both workloads after losing 45 biography and 123 legal baseline-only signatures. Recall values mode still rejected: it improved latency by 31.9% on biographies and 38.7% on legal, but lost 40 biography and 96 legal baseline-only signatures. Output: `/tmp/anonymizer-native-values-paired-r5`. | Reject for contextual workloads | Returning values instead of offsets makes parsing cheaper but does not solve contextual recall. 
Keep this arm in the harness as a lower-bound diagnostic; do not promote one-call extraction on biographies or legal text without a different seed source or repeated signature parity. | +| Structured fast-lane router tightening | `rules_covered_or_default` now short-circuits only the structured-secret labels `api_key`, `email`, `http_cookie`, `password`, `pin`, `unique_id`, `url`, and `user_name`. Narrow prose rule labels such as `date_of_birth`, `organization_name`, `religious_belief`, and `street_address` fall back to default detection unless `rules_only` is explicitly selected. A shell-secret smoke still found 12 entities across 3 records with 0 model rows, 0 requests, and 0 tokens. | Review | This preserves the no-DataDesigner win without assuming all inputs are shell logs. Local prose rules remain useful as explicit experiments or guardrails, but they are not complete enough for automatic contextual anonymization. | +| Narrow prose-label augmentation skip probe | On one synthetic `organization_name` + `street_address` row, `rules_covered_or_default` correctly fell back to model-backed detection instead of using the zero-model fast lane. A repeat-3 comparison then found `rules_guardrail_no_augment` preserved the same two signatures with zero leaks while moving median latency 3.0s → 2.6s, requests 4 → 3, and tokens 3,069 → 2,133. | Candidate | Skipping augmentation can be viable for tightly scoped prose-label slices when detector+validator already recover the needed spans. This is not a broad prose default; promote only through repeated signature gates, especially on biographies and legal text where augmentation may carry recall. | +| Real biography/legal no-augmentation check | On two NVIDIA biography rows, pure `no_augment` rejected: latency regressed 24.1s → 28.8s, entities fell 48 → 46, and two `first_name` signatures were lost. 
`rules_guardrail_no_augment` improved biography latency/tokens (24.1s → 18.3s, 17,992 → 11,905 tokens) but still rejected after losing the same two `first_name` signatures and using rule-sourced spans. On two TAB legal rows at offset 2, `no_augment` preserved signatures and reduced tokens but regressed latency (27.2s → 38.6s) and increased failed-request rate; `rules_guardrail_no_augment` preserved signatures with modest latency/token gains but remained review-gated because it introduced rule-sourced spans. | Mixed: biography reject, legal review | The synthetic augmentation-skip win does not generalize to biography prose. Augmentation remains load-bearing for contextual name recall, and legal gains need repeated runs plus failed-request scrutiny before promotion. | +| `rules_covered_or_default` mixed benchmark harness run | A two-row synthetic shell-secret run initially exposed a rule hole: default found one sudo-stdin password that the rule-only path missed. After adding a narrow `echo "..." | sudo -S` password rule, the rerun preserved all 9 shell signatures with detection latency 21.4s → 0.004s, requests 8 → 0, and tokens 9,854 → 0. One-row biography and legal contextual configs included `person`, so they fell back to model-backed detection and matched default entity counts. | Review | This is the safest implementation shape for the no-DataDesigner idea: use local rules only where labels and observed signatures prove coverage, and treat every missed signature as a rule-quality bug or a reason to fall back. | +| `rules_covered_or_default` current mixed fallback run | Current-code rerun completed all 6 cases. Shell secrets preserved all 9 signatures with pipeline latency 23.1s → 0.005s, requests 8 → 0, and tokens 10,173 → 0. The biography and legal configs included `person`, so both candidate cases fell back to model-backed detection and matched default entity counts and signatures: biography 7/7, legal 2/2. 
| Review | The router is behaving as designed after the rule-only `tagged_text` contract fix: structured secret configs can short-circuit locally, while contextual non-shell configs stay on default detection. | +| `rules_covered_or_default` repeated shell-secret run | A three-repetition shell-only suite completed all 6 cases. The candidate preserved all 9 final signatures in every repetition with median detection latency 29.4s → 0.004s, requests 9 → 0, and tokens 10,112 → 0. Default detection was unstable on this tiny slice: one repetition missed one `api_key`, so stable signatures were 8 for default and 9 for the rules path. The comparison remained review-gated, not viable, because all candidate spans were rule-sourced. | Review | Repeated evidence strengthens the structured-secret fast path but also shows why promotion should use stable-signature comparisons rather than treating default as perfectly deterministic on every repetition. | +| `rules_covered_or_default` on non-shell structured secrets | A four-row JSON/env/HTTP-header/YAML-style suite initially rejected after exposing two deterministic-rule gaps: URLs swallowed trailing semicolon separators and `session_id=...` cookie values were not protected. After tightening URL boundaries and adding a narrow `session_id` assignment rule, the rerun preserved all 17 default signatures while moving detection latency 25.8s → 0.010s, requests 16 → 0, and tokens 19,167 → 0. A repeat-3 run then kept all candidate signatures stable: default produced 15, 15, and 16 entities with median 18,822 tokens, while the rules path produced 17 entities every time with zero model requests and zero tokens. | Review | The no-DataDesigner fast lane is not shell-specific, but it must remain rule-coverage and signature-gated. Treat every structured-secret miss as either a narrow rule bug with tests or a reason to fall back to default detection. 
| +| `local_structured_substitute` on non-shell structured secrets | A four-row JSON/env/HTTP-header/YAML-style substitute suite preserved the same 17 final entities with zero original-value leaks. In a repeat-3 run, DataDesigner-backed substitute had median pipeline latency 38.1s, 4 requests, and 13,967 tokens for replacement-map generation; individual DD-backed runs ranged from 30.7s to 62.4s. `local_structured_substitute` had median latency 0.005s, 0 requests, and 0 tokens while preserving the same 17 replacements. | Review | Replacement-map generation is now another defensible no-DataDesigner lane for structured labels. Keep it benchmark-only until repeated gates and a policy decision define which structured labels deserve public API support. | +| `local_structured_substitute` with model-backed detection fallback | A one-row audit-style structured-identifier suite requested `api_key`, `http_cookie`, `pin`, `unique_id`, and `user_name`, so `rules_covered_or_default` fell back to normal model-backed detection in both arms. Both arms found the same 5 final entities with zero original-value leaks. Local replacement removed the replacement-map workflow, moving pipeline latency 53.6s → 33.0s, requests 5 → 4, and tokens 11,547 → 7,694. The pairwise comparison marked the candidate viable. | Candidate | Local replacement-map generation can help even when detection still needs DataDesigner. This is a cleaner promotion path than rule-only detection because contextual detection provenance is preserved; keep rejecting contextual replacement labels such as `person`. | +| `local_structured_substitute` with default detection on varied audit/config/HTTP identifiers | A four-row repeat-3 suite isolated replacement-map generation by keeping default model-backed detection in both arms. 
After adding a local synthetic-original collision guard, the guarded rerun kept value protection clean: zero original leaks, zero missing replacement-map entries, and zero synthetic-original collisions. Local substitute moved median pipeline latency 18.8s -> 12.7s, requests 21 -> 17, and tokens 24,324 -> 17,015. A current fixed-trace replay held detection constant at 21 entities and measured replacement only: DataDesigner substitute took 6.15s while local structured substitute took 0.003s, with 21/21 replacements and zero leaks/collisions in both arms. Regenerating the older repeat comparison with split verdicts moved the strategy-screen group out of `needs_split_verdict_rerun`; adding the fixed-trace replay comparison moved it to `promising_needs_review`. All three rows have `value_protection=pass`; the replay row has `signature_parity=pass` and `candidate_verdict=candidate_viable`, while one full-pipeline pairwise row has `signature_parity=review` because two covered signatures used different labels (`api_key`, `unique_id`). The comparison now tags this drift as `replacement_only_detection_instability` because detection strategy did not change. | Promising needs review | This is the cleanest structured-label promotion path because detector provenance stays model-backed in full-pipeline runs and the replacement backend passes fixed-trace replay. It is not fully promoted because normal pairwise runs still need monitoring for provider reliability and detection-run label drift. | +| `local_structured_substitute` fixed-trace replay on biography structured labels | A five-row NVIDIA synthetic biography replay used model-backed detection for `date_of_birth`, `organization_name`, `religious_belief`, and `street_address`, then replayed both substitute backends three times on the same 56 detected entities. 
After making local replacement-map generation avoid per-record duplicate synthetic values, both arms produced 159 replacements across 15 replacement attempts with zero duplicate synthetics, zero missing replacement-map entries, zero original-value leaks, and zero synthetic-original collisions. DataDesigner substitute took 23.59s for replacement-map generation and local structured substitute took 0.006s. The replay comparison marks `value_protection=pass`, `signature_parity=pass`, `safety=pass`, and `candidate_verdict=candidate_viable`. Output: `/tmp/anonymizer-perf-goal/biography-supported-structured-replacement-replay-repeat3.json`; comparison: `/tmp/anonymizer-perf-goal/biography-supported-structured-replacement-replay-repeat3-comparison.csv`; screen: `/tmp/anonymizer-perf-goal/strategy-screen-local-substitute-with-biography-replay-groups.csv`. | Candidate | This broadens the replacement-only result beyond shell or config logs without claiming DD-free contextual detection. The speed and leak profile are strong, the duplicate-collapse issue is fixed for this slice, and repeated replacement-only evidence shows the local backend can preserve replacement-map coverage when detection is held fixed. The remaining gate is policy: decide which structured labels and text shapes are eligible for deterministic substitute generation in production-facing configuration. | +| Expanded `rules_covered_or_default` + `local_structured_substitute` on an audit-style structured identifier record | After adding narrow keyed coverage for `http_cookie`, `pin`, `unique_id`, and `user_name`, the candidate protected all baseline signatures, found one additional `unique_id`, had zero original-value leaks, and moved pipeline latency 9.2s → 0.005s, requests 5 → 0, and tokens 6,075 → 0. | Review | This extends the no-DataDesigner fast lane beyond shell logs into keyed audit/config/HTTP-style structured records. 
It remains review-gated because every final span is rule-sourced and this run used one row. | +| Expanded `rules_covered_or_default` + `local_structured_substitute` on varied audit/config/HTTP identifiers | A four-row repeat-3 suite preserved every baseline-only signature through containing or overlapping candidate spans, with zero original-value leaks. Median pipeline latency moved 21.1s → 0.006s, requests 21 → 0, and tokens 24,332 → 0. The comparison records 8 exact baseline-only signatures, 8 candidate-covered signatures, 2 span-boundary mismatches, and 0 uncovered signatures. | Review | This is the strongest no-DataDesigner result so far for non-shell structured records. It is still not a default: all final spans are rule-sourced, and two protected values had different span boundaries such as `token=` versus ``, so promotion needs a workload policy gate. | +| Row-aware `rules_covered_or_default` + local substitute smoke | A four-row JSON/env/HTTP-header/YAML-style suite initially rejected because quoted JSON `user`/`pin` keys were not rule-covered. After adding quoted-key coverage and changing the router to fall back per row on suspicious uncovered structured assignments, the structured candidate moved pipeline latency 9.7s -> 0.0s, requests 20 -> 0, and tokens 20,080 -> 0 while matching entity count 10 -> 10 with zero original-value leaks. One-row biography and legal controls included `person`, used default detection in both arms, and passed comparison gates. | Review | The no-DataDesigner path is now safer: eligible labels are necessary but not sufficient, and rows with uncovered structured fields go through normal detection. The structured candidate still stays review-gated because one `HF_TOKEN` value was protected under a different label/boundary than the default `http_cookie` span. 
| +| Row-aware `rules_covered_or_default` + local substitute repeat gate | A focused repeat-3 split-verdict suite reran the same four structured rows after the row-aware router change. All 6 cases completed. Default substitute had median pipeline latency 12.4s, 21 requests, and 20,071 tokens; the row-aware rules/local candidate had median latency 0.006s, 0 requests, and 0 tokens. Both arms found 10 entities in every repetition and had zero original-value leaks or synthetic-original collisions. The split-verdict comparison has `value_protection=pass` but remained `safety=review` and `signature_parity=review`: one stable baseline `http_cookie` signature was protected by the candidate under an `api_key` label with a span/boundary mismatch. Output: `/tmp/anonymizer-perf-goal/structured-fastlane-split-r3`. | Needs viable split verdict | This is a large structured fast-lane performance win, but not promotion-ready. The next decision is whether the covered `http_cookie` -> `api_key` mismatch is acceptable value protection for this workload or whether the deterministic rules need to match baseline label semantics more closely. | +| `bio-vmax10-w80` validator window tuning | Rejected on biography rows 6-10: latency, requests, and tokens regressed, and stable `field_of_study` and `state` signatures were lost. | Reject | Smaller validation windows need per-workload proof; prompt-size savings can be outweighed by more calls and lost context. | +| Text augmenter routing at `temperature: 0.3` | A one-row biography smoke test passed, but repeated five-row slices did not: rows 0-4 preserved signatures while latency regressed from 40.4s to 45.9s and requests from 21.0 to 21.5; rows 5-9 rejected after latency regressed from 41.0s to 52.1s and two stable `state` signatures became unstable. | Reject | JSON-validator/text-augmenter routing at the default text temperature is not a reliable prose speedup on these slices. 
| +| Text augmenter routing at `temperature: 0.7` | Passed the first biography slice, then failed on rows 6-10 by losing a stable `university` signature and regressing latency. | Reject | Do not promote the routing pattern from a single positive slice. | +| `rules_guardrail_no_augment` on legal prose | Improved latency/tokens on legal rows 2-3, but lost two stable `first_name` signatures. | Reject | Augmentation remains load-bearing for contextual names, even when aggregate entity counts look acceptable. | + +No broad replacement for the default prose/legal detection path has passed the +current repeated signature checks. The only strong performance result so far is +workload-scoped: deterministic rules for tightly bounded, rule-covered secret +scans. + +When DataDesigner message traces are enabled, interpret failed request counts +through `observed_non_bridge_*` metrics before drawing provider-reliability +conclusions. Across 13 local trace files, the local-vLLM +`SyncClientUnavailableError` rows were 104 near-zero-latency sync-to-async +bridge fallbacks with zero token usage; they are adapter accounting, not model +work. GLiNER `ProviderError` rows are different: the same trace set had 20 real +detector failures, which can invalidate otherwise faster detector-heavy +candidates. + +Do not expand deterministic rules into contextual names merely to recover the +failed candidates above. The rejected prose and legal runs lost labels such as +`first_name`, `field_of_study`, `state`, and `university`; these require context +and separate precision evidence. The rule layer should stay narrow unless a new +label has high-confidence syntax and false-positive tests. + +## Validator Chunk Tuning + +The detector validator can dominate replace-mode latency on records with many +candidate entities. 
Tune `Detect.validation_max_entities_per_call` and +`Detect.validation_excerpt_window_chars` together: + +- `validation_max_entities_per_call` controls how many candidate entities go + into each validator call. Lower values create more calls, but Anonymizer can + overlap those calls through the validator pool. +- `validation_excerpt_window_chars` controls how much text surrounds each + validation chunk. Lower values reduce prompt size, but can hide context the + validator needs for labels such as `date_of_birth`, `race_ethnicity`, or + legal roles. + +Run these sweeps per workload. A window that is safe for short biographies may +drop legal identifiers, and a legal-safe window may erase the speedup on short +records. + +Example config fragment: + +```yaml +configs: + - id: legal-vmax10-w160 + detect: + validation_max_entities_per_call: 10 + validation_excerpt_window_chars: 160 + entity_labels: [first_name, last_name, court_name, date, date_of_birth] + replace: + strategy: hash + digest_length: 12 +``` + +Use the aggregate analysis first: + +```bash +uv run python tools/measurement/analyze_benchmark_output.py \ + benchmark-runs/legal-window-sweep \ + --json +``` + +Then compare every faster candidate against a higher-context reference: + +```bash +uv run python tools/measurement/extract_signature_deltas.py \ + /tmp/reference/legal__default-window__r000.detection-artifacts.jsonl \ + /tmp/candidate/legal__vmax10-w160__r000.detection-artifacts.jsonl \ + --baseline-artifact-root /tmp/reference/artifacts \ + --candidate-artifact-root /tmp/candidate/artifacts \ + --baseline-config default-window \ + --candidate-config vmax10-w160 \ + --workload legal \ + --output /tmp/legal-vmax10-w160-deltas.csv \ + --format csv +``` + +Treat a candidate as unsafe until signature deltas are clean on repeated runs. 
+In one local vLLM check with two repetitions, a biography sample went from +24.6s with the default window to 17.8s with `vmax10/w80`, with all 50 stable +entity signatures preserved. A one-row legal sample went from 21.2s with the +default window to 13.2s with `vmax10/w160`, with all 28 stable signatures +preserved. Both candidates increased request and token counts, so the comparison +tool marks them for review instead of as automatic wins. + +The biography `vmax10/w80` result did not hold on the next five biography rows. +With `row_offset: 5`, median latency regressed from 31.8s to 33.6s, requests +from 20.0 to 43.0, and tokens from 44,367.0 to 68,407.5. The comparison also +lost stable `field_of_study` and `state` signatures, with an additional +unstable `university` loss, so the tool rejected the candidate. Recheck this +tuning on the target dataset because smaller windows can miss sensitive +attributes and because the extra parallel validator calls can overwhelm any +prompt-size savings. + +## Augmentation Ablation + +Use `experimental_detection_strategy: rules_guardrail_no_augment` to measure +what happens when the detector keeps GLiNER, validation, and deterministic rule +guardrails, but skips LLM augmentation. Treat this as an ablation, not as a +replacement for the default pipeline. + +In a local vLLM check with two repetitions, removing augmentation from the +two-row biography sample reduced work but consistently lost two stable +`first_name` signatures. The comparison tool rejected both the default-window +and `vmax10/w80` no-augmentation candidates. This indicates augmentation is +load-bearing for prose records where contextual names and quasi-identifiers +matter. + +The same ablation preserved all 28 stable signatures on a one-row legal sample. +With the default validation window, latency moved from 21.2s to 18.3s, requests +from 5 to 4, and tokens from 11,327.5 to 7,654. 
With `vmax10/w160`, latency +moved from 13.2s to 9.5s, requests from 8 to 7, and tokens from 16,604 to +12,881. Compared directly against the default-window baseline, the combined +legal candidate is faster but still needs review because validator chunking +increases requests and tokens. + +That legal no-augmentation result also failed to generalize to the next two +legal records. On `row_offset: 1` with two rows and two repetitions, comparing +`legal-noaugment-vmax10-w160` against the same-window full augmentation baseline +improved latency from 23.9s to 21.5s, requests from 28.0 to 26.0, and tokens +from 61,780.5 to 50,905.0, but the candidate lost two stable `first_name` +signatures and one unstable `date` signature. The comparison rejected it despite +the performance improvement. + +Use this ablation when `augmented_new_final_value_count` is near zero for the +target workload and repeated signature deltas are clean. Do not generalize a +single legal row to the rest of a legal dataset, and do not generalize legal +results to biography, support-ticket, shell-history, or mixed prose data without +rerunning the comparison on that workload. + +## Augmenter Routing and Temperature + +The detection validator and augmenter do different jobs. Keep them separable in +model configs when testing local endpoints: + +- validators benefit from deterministic JSON-oriented settings; +- augmenters may work better through a text alias, because DataDesigner + structured parsing can be fragile on local OpenAI-compatible endpoints; +- augmenter temperature changes can alter retry pressure and output shape, so + evaluate them with repeated signature comparison, not only entity counts. + +In one local vLLM biography run with two repetitions, keeping the validator on +`local-nemotron-json` while routing the augmenter to a text alias with +`temperature: 0.7` was the first prose candidate that passed the current safety +gate and improved performance. 
Median latency moved from 24.2s to 21.6s, +requests from 8 to 6, and tokens from 17,938.5 to 11,921. The comparison had no +baseline-only or unstable-lost signatures across 48 stable signatures, so the +tool marked it `candidate_viable`. + +The same routing/temperature candidate also held on a five-row biography slice +with two repetitions, though the gain was smaller. Median latency moved from +40.4s to 38.0s, requests from 21.0 to 20.5, and tokens from 43,367.5 to +43,043.0. It preserved all 114 stable baseline signatures; one candidate-only +`place_name` appeared in one repetition, but the comparison still marked the +candidate `candidate_viable`. + +This result did not generalize cleanly to the next five biography rows. On a +second slice using `row_offset: 5`, the same candidate was rejected: median +latency moved from 41.0s to 47.5s, requests from 21.0 to 21.5, and tokens were +effectively unchanged at 44,708.0 to 44,670.0. More importantly, the comparison +lost one stable `university` signature and had unstable losses for +`field_of_study` and `university`. Treat this routing as an experiment to +retest on each workload, not as a default candidate yet. +When the two temp-0.7 config IDs are grouped with `--config-aliases`, the +biography family result is `conflicting_evidence`: three comparison rows, two +viable rows, one reject row, best latency -10.4%, worst latency +16.0%, and +stable losses for `field_of_study` and `university`. + +On a two-row legal slice with two repetitions, the same augmenter routing did +not materially improve latency or requests: median latency moved from 27.3s to +27.5s, requests stayed at 8, and tokens moved from 24,460.0 to 24,296.5. It +preserved stable signatures, but the rule-guardrail legal strategy remains +review-gated and this routing should be treated as neutral for that sample.
+Also compare it against prompt-only changes such as +`prose_augment_focus`: in the same biography slice, prose-focused augmentation +preserved signatures and reduced requests/tokens, but wall time increased from +24.2s to 26.4s, so the tool kept it in review. + +Parser compatibility is a separate concern. A text-model suite without +`dd_parser_compat: raw_json` produced a failed biography case in local testing; +the raw-parser compatibility mode fixed that failure, but increased latency and +tokens on both biography and legal slices. Treat raw-parser compatibility as an +endpoint interoperability fix, not as a performance optimization. + +## Detector-Only Ablation + +Use `experimental_detection_strategy: detector_only` to measure the lower bound +of the detection phase when GLiNER output is trusted directly and only local +span finalization runs afterward. Use +`experimental_detection_strategy: rules_guardrail_detector_only` to measure the +same path with deterministic high-confidence rule spans unioned into the final +entity set. Both remove LLM validation and LLM augmentation from the detection +phase, so they are diagnostic ablations rather than deployable strategies. + +The comparison tool marks these candidates with +`candidate_skips_llm_validation`, which forces `safety_verdict: review` even +when entity signatures match on the sampled records. The rule-guardrail variant +also gets `candidate_uses_rule_entities` when rule spans survive. Promote either +path only if independent precision checks show false positives are acceptable +for the target workflow and repeated signature deltas remain clean. + +In a one-row cross-workload smoke check, detector-only improved latency and +token counts on biography, legal, and shell-secrets slices, but all three +candidates were rejected by signature comparison. 
Biography moved from 13.7s to +0.9s and lost two baseline `first_name` signatures; legal moved from 15.3s to +1.0s and lost one `nationality` signature while increasing final entity count +from 22 to 39; shell-secrets moved from 6.6s to 0.8s and still lost one +baseline `api_key` signature. This is a useful lower-bound measurement, but it +shows why validation/augmentation or deterministic rule coverage remain +load-bearing for anonymization. + +The `rules_guardrail_detector_only` variant did not fix prose/legal losses in +the same one-row check: biography still lost two `first_name` signatures and +legal still lost one `nationality` signature. It did preserve all shell-secret +baseline signatures while moving latency from 4.6s to 0.8s, requests from 4 to +1, and tokens from 3,969 to 85. Treat that as a narrow structured-secret +candidate. It remains review-gated because it skips LLM validation and relies +on deterministic rules. + +On the three-row shell-secrets slice with three successful candidate +repetitions, `rules_guardrail_detector_only` preserved all stable baseline +signatures while moving median latency from 7.2s to 3.2s, requests from 12 to 4, +and tokens from 11,019 to 198. The final entity set came from 9 detector spans +and 3 rule spans. It still had local GLiNER `ProviderError` health-check +failures and remains slower than `rules_only`, which used zero model calls and +zero tokens on the same fully rule-covered labels. + +## Deterministic Rules for Structured Secrets + +Use `experimental_detection_strategy: rules_only` only when the workload is a +bounded secret-scanning task and every requested label is covered by the +deterministic rules. Current rule coverage is intentionally narrow: +`api_key`, `date_of_birth`, `email`, `http_cookie`, `organization_name`, +`password`, `pin`, `religious_belief`, `street_address`, `unique_id`, `url`, +and `user_name`. 
The `http_cookie`, `pin`, `unique_id`, and `user_name` rules +cover keyed or command-style structured patterns only. They do not recognize +free-form names, narrative identifiers, or arbitrary prose mentions. + +The zero-model detector is implemented by +`EntityDetectionWorkflow.detect_with_high_confidence_rules()`. The benchmark +strategy delegates to that internal engine method, but no user-facing config +selects it outside the benchmark harness. + +Use `experimental_detection_strategy: rules_covered_or_default` for mixed +benchmark suites where some configs are structured-secret scans and others +include contextual labels such as `person`, `organization_name`, or +`street_address`. It runs the same zero-model path for structured fast-lane +cases, but does not attempt a DataDesigner-free replacement for contextual +prose or legal records. + +A mixed local-vLLM smoke run on June 8, 2026 used two synthetic shell-secret +rows plus one biography and one legal row. The first shell run found that +`rules_covered_or_default` missed a sudo stdin password that default +augmentation caught; after adding a narrow `echo "..." | sudo -S` rule, the +rerun preserved all 9 shell signatures with zero model requests and zero tokens. +The biography and legal configs requested `person`, so they correctly fell back +to model-backed detection and matched default entity counts. Keep this strategy +signature-gated: a missed default signature is a rule-quality bug or a fallback +signal, not acceptable drift. + +A follow-up three-repetition shell-only run kept all 9 candidate signatures +stable while default detection had 8 stable signatures because one `api_key` +was absent from one default repetition. The comparison still returned +`candidate_verdict=review` because the candidate had no detector-sourced final +spans. This is the intended behavior: repeated clean signatures can justify a +workload-scoped fast lane, but rule-only provenance should remain an explicit +review decision. 
+ +For substitute workloads, use +`experimental_replacement_strategy: local_structured_substitute` to bypass the +DataDesigner replacement-generator call. The local substitute generator writes a +normal replacement map and stamps `_replacement_map_source=local_structured` so +measurement estimates do not count a replacement-map LLM call. It only supports +structured labels. Pair it with `rules_covered_or_default` when all requested +labels are also rule-covered; otherwise detection can still use the default +model-backed path while replacement-map generation stays local. If a config +includes `person` or another contextual label, preflight fails instead of +silently producing poor local substitutes. + +On the current four-row non-shell structured-secret suite, +DataDesigner-backed substitute preserved 17 entities with zero original-value +leaks but had repeat-3 median latency 38.1s, 4 requests, and 13,967 tokens in +replacement-map generation. The local structured substitute arm preserved the +same 17 entities, had zero original-value leaks, and had repeat-3 median latency +0.005s with 0 requests and 0 tokens. The repeat output used for this result is +`/tmp/anonymizer-perf-goal/structured-secrets-local-substitute-repeat3`. + +The local substitute backend can also combine with model-backed detection. In +the first one-row audit-style structured-identifier smoke, `api_key`, +`http_cookie`, `pin`, `unique_id`, and `user_name` were not all rule-covered, so +detection fell back to the default model path in both arms. The local substitute +arm still removed the replacement-map DataDesigner workflow, moving pipeline +latency from 53.6s to 33.0s, requests from 5 to 4, and tokens from 11,547 to +7,694 while preserving the same 5 final entities and zero original-value leaks. +The output used for that result is +`/tmp/anonymizer-perf-goal/structured-identifiers-local-substitute`. 
+ +Use `replay_replacement_strategies.py` when you need to hold detection fixed and +isolate replacement-map generation: + +```bash +uv run python tools/measurement/replay_replacement_strategies.py \ + /tmp/anonymizer-perf-goal/structured_identifiers_varied.csv \ + --text-column text \ + --labels api_key,http_cookie,password,pin,unique_id,user_name \ + --nrows 5 \ + --replacement-repetitions 3 \ + --model-configs /stable-cache/anonymizer/local-vllm-json-models.yaml \ + --model-providers /stable-cache/anonymizer/local-vllm-providers.yaml \ + --dd-parser-compat raw_json \ + --comparison-output /tmp/anonymizer-perf-goal/structured-identifiers-replacement-replay-comparison.csv \ + --json +``` + +The current fixed-trace replay detected 21 entities once, then ran both +substitute backends on that same trace. DataDesigner substitute took 6.04s for +replacement-map generation; local structured substitute took 0.003s. Both arms +produced 21 replacements, zero missing replacement-map entries, zero +original-value leaks, and zero synthetic-original collisions. The JSON output +used for this result is +`/tmp/anonymizer-perf-goal/structured-identifiers-replacement-replay.json`. +A rerun after adding an LLM replacement-map collision guard produced the same +21/21 complete, leak-free, collision-free result. In that rerun, DataDesigner +substitute took 6.22s and local structured substitute took 0.003s; the updated +JSON output is +`/tmp/anonymizer-perf-goal/structured-identifiers-replacement-replay-after-llm-guard.json`. +When `--replacement-repetitions` is greater than one, detection still runs once +and only the substitute backends repeat. The summary rows aggregate replacement +latency, missing-map counts, leaks, collisions, duplicate synthetics, and source +counts across those repeated backend passes. 
When `--comparison-output` is set, +the replay tool also writes a one-row comparison CSV with +`value_protection_verdict`, `signature_parity_verdict`, `safety_verdict`, +`performance_verdict`, and `candidate_verdict`. This lets +`screen_strategy_comparisons.py` include fixed-trace replacement evidence +alongside normal pairwise benchmark comparisons. Missing local replacement-map +entries, original-value leaks, and synthetic-original collisions fail the replay +candidate even if the elapsed-time delta is large. +If the DD substitute baseline misses replacement-map entries or leaks original +values while the local backend covers them, the replay comparison emits +candidate-covers-baseline flags and the strategy screen recommends +`candidate_covers_baseline_defects` for all-review groups of that shape. Treat +that as a baseline-independent safety-rule prompt: inspect the candidate's +missing, leak, collision, duplicate-synthetic, and supported-label columns +rather than requiring exact parity with a known-flawed substitute baseline. + +After adding narrow keyed rules for `http_cookie`, `pin`, `unique_id`, and +`user_name`, the same audit-style label set can now short-circuit both +detection and local replacement for a structured record. In a one-row local +vLLM check, default detection plus DataDesigner substitute found 4 entities and +missed the `unique_id`; the rules/local arm found 5 entities, had zero +original-value leaks, and moved pipeline latency from 9.2s to 0.005s, requests +from 5 to 0, and tokens from 6,075 to 0. The pairwise comparison remains +`review`, not `candidate_viable`, because the candidate has rule-only +provenance and the evidence is a single row. The output used for this result is +`/tmp/anonymizer-perf-goal/structured-identifiers-expanded-rules`. 
+ +On a three-row shell-secrets slice with labels `[api_key, password, email, url]`, +`rules_only` preserved all 12 stable signatures across three repetitions while +moving median latency from 7.2s to 0.004s, requests from 12 to 0, and tokens +from 11,019 to 0 in the refreshed failure-aware comparison. The comparison tool +still marks the candidate for review because it has no contextual detector spans +and skips LLM validation. That is the right gate: a pure rule strategy is +acceptable only when missing contextual spans is part of the test contract. + +`rules_seed_no_augment` preserved the same 12 signatures and reduced median +tokens from 11,017 to 7,732, but median latency moved from 8.0s to 8.5s on the +same slice. In this run, seeding rules into the validator path reduced token +work but did not improve end-to-end latency. Prefer `rules_only` for tightly +scoped secret scans; prefer rule guardrails plus contextual detection for prose, +legal text, support tickets, and mixed records. + +Use `rules_filter_guardrail` as the mixed-workload version of that idea. It +keeps LLM augmentation, but rule-covered spans are not sent to the seed +validator. The rule spans are reinserted before augmentation so the augmenter +does not waste work rediscovering them. This is a candidate for datasets that +combine structured secrets with contextual prose; it still needs repeated +signature comparison because filtered detector spans no longer receive the +LLM validator's reclassification/drop pass. In a local shell-secrets smoke run, +the completed candidate repetition reduced seed validation candidates to zero +and preserved all stable signatures, but the repeated comparison rejected it +because a later candidate case hit a GLiNER health-check rate limit. + +## Metric Interpretation Use metrics as signals, not as a single score. Latency and throughput: -- `elapsed_sec`: wall time for a measured stage or workflow. 
-- `pipeline_elapsed_sec`: end-to-end Anonymizer wall time for a case. -- `records_per_pipeline_sec`: completed input records per pipeline second. -- `input_text_tokens_per_pipeline_sec`: input text tokens processed per - pipeline second. - -Model work: - -- `observed_total_requests`: measured model requests from DataDesigner or direct - model workflow records. -- `observed_total_tokens`: measured input plus output tokens. -- `observed_failed_requests`: provider-level failed requests. -- `observed_bridge_fallback_requests`: sync-client fallback requests recorded - from DataDesigner traces. -- `observed_non_bridge_failed_requests`: failed requests after subtracting - sync-client bridge fallbacks. Prefer this field when judging endpoint - reliability from trace-enabled runs. - -Detection artifacts: - -- `seed_entity_count`: detector or direct-seed candidate count before - validation. -- `seed_validation_candidate_count`: candidates sent to validation. -- `estimated_seed_validation_chunk_count`: estimated validator chunks from the - active validation chunk size. -- `augmented_entity_count`: augmenter suggestions. -- `augmented_new_final_value_count`: augmenter suggestions that add values not - already present in the seed/final set. -- `artifact_final_detector_entity_count` and +- `elapsed_sec`: wall time for a measured stage or DataDesigner workflow. + Staged DD-free detection cases report end-to-end case wall time here. +- `rows_per_sec`: completed output rows per second for the measured block. +- `tokens_per_sec`: observed total tokens per second when token usage exists. +- `text_length_tokens_bucket`: a coarse text-size bucket for comparing similar + inputs without storing text. +- `record_count` and `input_text_tokens_total`: case-level workload size + derived from record measurements. These are independent of provider-reported + token usage. 
+- `records_per_pipeline_sec` and `input_text_tokens_per_pipeline_sec`: dataset + throughput normalized by the measured Anonymizer pipeline stage. The matching + `*_per_ndd_sec` fields use summed DataDesigner workflow wall time instead. +- `input_text_tokens_per_endpoint_sec` and + `input_text_tokens_per_gpu_sec`: optional topology-normalized dataset + throughput. These are populated only when benchmark run tags provide portable + topology counts such as `endpoint_count` or `gpu_count`. + +LLM usage: + +- `observed_input_tokens`, `observed_output_tokens`, and + `observed_total_tokens`: provider-reported usage when available. Missing or + zero values mean the provider path did not expose usage, not necessarily that + no tokens were consumed. +- `observed_total_requests`, `observed_successful_requests`, and + `observed_failed_requests`: request counts when DataDesigner or a native + benchmark model workflow exposes them. +- `observed_failed_request_rate`: failed requests divided by total requests. + Case and group tables expose this as the end-to-end retry pressure for a + strategy; model usage tables expose it per workflow/model. Sort by this + together with total token count to find retry-heavy workflow/model pairs. +- `observed_bridge_fallback_requests`: DataDesigner sync-to-async bridge + fallbacks, derived from message traces when `--dd-trace` is enabled. Treat + these as adapter accounting, not provider/model failures. +- `model_elapsed_sec`: staged DD-free detection only; sum of direct model-call + durations for seed, validation, and augmentation. This stays `0.0` for fully + local rule-covered runs even when `elapsed_sec` records nonzero local work. +- `observed_non_bridge_total_requests`, + `observed_non_bridge_failed_requests`, and + `observed_non_bridge_failed_request_rate`: request metrics after subtracting + sync-to-async bridge fallbacks. Prefer these fields over raw failed-request + counts when diagnosing provider reliability from traced runs. 
+- `nominal_llm_call_count`: an internal estimate based on the Anonymizer + pipeline shape. Treat it as expected work, not observed provider traffic. +- `seed_validation_candidate_count`: number of detector candidates sent to the + seed validator, derived from detection artifacts without storing values. +- `estimated_seed_validation_chunk_count`: estimated validator chunk count after + applying `detect.validation_max_entities_per_call`. If this does not change + between benchmark configs, chunk-size experiments are not expected to reduce + successful validator calls. + +Entity and quality metrics: + +- `final_entity_count`: entities that survive detection and validation. +- `original_value_leak_count`: number of final entity original values that + still appear verbatim in the replaced or rewritten output text. This is a + conservative replace/rewrite safety signal and stores only counts, not raw + values. +- `original_value_leak_label_counts`: per-label counts for those surviving + original values. The analysis tables aggregate these as + `original_value_leak_record_count`, `sum_original_value_leak_count`, + `leaking_case_count`, and `median_original_value_leak_count`. +- `replacement_missing_final_entity_count`: number of final entity occurrences + whose original value has no entry in the replacement map. This is sanitized + replacement-map coverage, not raw leakage text. +- `replacement_missing_final_value_count`: number of unique final entity values + with no replacement-map entry. Compare it with + `original_value_leak_count` to distinguish omitted replacement-map entries + from replacement-application or metric issues. +- `replacement_missing_final_entity_label_counts`: per-label counts for missing + replacement-map coverage. +- `replacement_synthetic_original_collision_count`: number of final entity + occurrences whose original value was reused as a synthetic replacement value + elsewhere in the same record. 
This is a substitute safety signal; map + coverage can be complete while this is nonzero. +- `replacement_synthetic_original_collision_value_count`: number of unique + protected original values reused as synthetic replacement values. +- `replacement_synthetic_original_collision_label_counts`: per-label counts for + synthetic-original collisions. +- `artifact_final_detector_entity_count`, + `artifact_final_rule_entity_count`, and `artifact_final_augmenter_entity_count`: final entity source counts derived - from detection artifact sidecars. + from detection artifact sidecars. These are useful safety signals for + rule-backed benchmark strategies. - `artifact_final_entity_signature_count` and `artifact_final_entity_signature_hashes`: opaque final-span signatures derived - from detection artifacts. These do not include raw entity values. - -Safety and replacement: - -- `original_value_leak_count`: count of protected original values still present - in replaced output. -- `replacement_missing_final_entity_count`: final entity occurrences whose - original value has no replacement-map entry. -- `replacement_missing_final_value_count`: unique final entity values with no - replacement-map entry. -- `replacement_synthetic_original_collision_count`: final entity occurrences - whose original value was reused as a synthetic replacement value elsewhere in - the same record. + from detection artifacts. `artifact_final_entity_signature_labels` maps each + hash to a label, but still does not include raw entity values. Use these to + catch and triage safety regressions where total entity count is unchanged but + the candidate lost a baseline-protected span. - `baseline_only_candidate_covered_signature_count`, `baseline_only_candidate_overlapping_signature_count`, and `baseline_only_candidate_uncovered_signature_count`: comparison-only fields from `compare_strategy_pairs.py`. These split exact signature deltas into - covered, boundary-overlap, and uncovered losses. 
-- `candidate_verdict`: `candidate_viable`, `review`, or `reject`. - -Treat `candidate_viable` as a promotion candidate, not as an automatic default. -It means the sampled comparison passed the current gates and improved at least -one performance metric without regressing another. Re-run candidates on the -target workload family, with repetitions, before changing production defaults. + baseline spans protected by a containing candidate span, protected by a + high-overlap or small keyed-boundary candidate span, or not protected by any + candidate span metadata. Overlapping coverage sets `span_boundary_mismatch` + and keeps the candidate in review; uncovered signatures set + `entity_signature_loss` and fail the safety verdict. +- `baseline_only_candidate_label_mismatch_signature_count`: comparison-only + field for baseline signatures whose raw span is covered by the candidate, but + under a different label. This sets `covered_label_mismatch` and keeps the + candidate in review because the value is protected but label semantics may no + longer match replacement/audit expectations. +- `value_protection_verdict`: comparison-only pass/review/fail verdict focused + on whether candidate output still protects baseline values. Covered + label-mismatch spans can still pass this axis because the sensitive value is + protected, while uncovered signatures, candidate leaks, and candidate case + failures fail it. +- `signature_parity_verdict`: comparison-only pass/review/fail verdict focused + on exact baseline signature semantics. Covered label mismatches and boundary + mismatches review-gate this axis even when `value_protection_verdict` passes. + This split is useful for DataDesigner-free experiments: a candidate can be a + plausible protection backend while still requiring label-policy review before + it can replace a DataDesigner-backed baseline. +- `final_entity_label_counts`: per-label entity counts serialized as JSON in + exported tabular files. 
+- `ground_truth_*` and `entity_*`: exact value+label precision, recall, F1, + false positives, and false negatives when the input includes one of the + supported ground-truth entity columns. +- `entity_relaxed_*`: span-overlap precision, recall, and F1. The + label-compatible variants require both span overlap and equivalent labels, + while the non-label-compatible relaxed metrics only ask whether a + ground-truth span was protected by any detected span. +- `empty_detection_count`, `empty_detection_rate`, + `empty_detection_with_ground_truth_count`, and + `empty_detection_with_ground_truth_rate`: diagnostics for records where the + detector returned no final entities. The ground-truth-specific fields are the + important safety signal when a benchmark includes labels. +- `utility_score`, `leakage_mass`, `weighted_leakage_rate`, + `needs_repair`, and `needs_human_review`: rewrite-mode evaluation fields. + These are null for replace-mode runs. + +Error and reliability metrics: + +- `failed_record_count`: records dropped by a DataDesigner workflow. +- `status`: completion state for a stage or workflow. +- `case_failed`: true when a benchmark case has any error-status stage or + DataDesigner workflow measurement. +- `error_stage_count`, `error_ndd_workflow_count`, and + `error_model_workflow_count`: error-status measurement rows counted per case. +- `failed_case_count` and `failed_case_rate`: group-level failed-case count and + rate for a workload/config/strategy. +- `summary.json` case errors: runner-level failures, such as invalid inputs or + model endpoint failures. + +## Reading Results Safely + +Compare like with like. A shell-history workload, a support-ticket workload, +and a legal-document workload stress different parts of Anonymizer. Group by +`workload_id` before drawing conclusions about model routing, speculative +decoding, validation chunk size, or rewrite repair settings. 
+ +Record-level rows describe input shape and output quality, not per-record wall +time. Stage and workflow rows carry timing. To explain a slow run, first find +the slow stage, then inspect the records in that run for text length, entity +count, nominal call count, and rewrite repair signals. + +When token or request fields are missing, check `ndd_workflow.model_usage` and +`model_workflow.model_usage`. The measurement layer records deeper provider +usage only when the underlying executor returns it. diff --git a/tools/measurement/analyze_benchmark_output.py b/tools/measurement/analyze_benchmark_output.py index 056c2287..f4768de6 100644 --- a/tools/measurement/analyze_benchmark_output.py +++ b/tools/measurement/analyze_benchmark_output.py @@ -99,6 +99,9 @@ class CaseAnalysisRow(BaseModel): topology_shard_count: float | None = None input_text_tokens_per_endpoint_sec: float | None = None input_text_tokens_per_gpu_sec: float | None = None + route_total_row_count: float | None = None + route_rule_row_count: float | None = None + route_fallback_row_count: float | None = None final_entity_count: float | None = None empty_detection_count: int = 0 empty_detection_rate: float | None = None @@ -141,6 +144,7 @@ class CaseAnalysisRow(BaseModel): augmented_new_final_value_count: float | None = None artifact_final_entity_count: float | None = None artifact_final_detector_entity_count: float | None = None + artifact_final_rule_entity_count: float | None = None artifact_final_augmenter_entity_count: float | None = None artifact_final_entity_signature_count: float | None = None artifact_final_entity_signature_hashes: list[str] = Field(default_factory=list) @@ -190,6 +194,9 @@ class GroupAnalysisRow(BaseModel): median_topology_shard_count: float | None = None median_input_text_tokens_per_endpoint_sec: float | None = None median_input_text_tokens_per_gpu_sec: float | None = None + median_route_total_row_count: float | None = None + median_route_rule_row_count: float | None = None + 
median_route_fallback_row_count: float | None = None median_final_entity_count: float | None = None total_empty_detection_count: int = 0 empty_detection_rate: float | None = None @@ -231,6 +238,7 @@ class GroupAnalysisRow(BaseModel): median_augmented_new_final_value_count: float | None = None median_artifact_final_entity_count: float | None = None median_artifact_final_detector_entity_count: float | None = None + median_artifact_final_rule_entity_count: float | None = None median_artifact_final_augmenter_entity_count: float | None = None median_artifact_final_entity_signature_count: float | None = None @@ -503,6 +511,9 @@ def _build_case_row( measurement_rows, input_text_tokens_per_pipeline_sec=input_text_tokens_per_pipeline_sec, ), + route_total_row_count=_sum_or_none(model_rows, "route_total_row_count"), + route_rule_row_count=_sum_or_none(model_rows, "route_rule_row_count"), + route_fallback_row_count=_sum_or_none(model_rows, "route_fallback_row_count"), final_entity_count=final_entity_count, **_case_empty_detection_metrics(record_rows, record_count=record_count), **_case_ground_truth_metrics(record_rows, final_entity_count=final_entity_count), @@ -743,6 +754,7 @@ def _case_artifact_metrics( "augmented_new_final_value_count": _sum_or_none(artifact_rows, "augmented_new_final_value_count"), "artifact_final_entity_count": _sum_or_none(artifact_rows, "final_entity_count"), "artifact_final_detector_entity_count": _sum_or_none(artifact_rows, "final_source_counts.detector"), + "artifact_final_rule_entity_count": _sum_or_none(artifact_rows, "final_source_counts.rule"), "artifact_final_augmenter_entity_count": _sum_or_none(artifact_rows, "final_source_counts.augmenter"), "artifact_final_entity_signature_count": _signature_count(artifact_rows, signature_hashes=signature_hashes), "artifact_final_entity_signature_hashes": signature_hashes, @@ -1282,6 +1294,9 @@ def _build_group_row(keys: tuple[Any, ...], group: pd.DataFrame) -> GroupAnalysi 
median_topology_shard_count=_median_or_none(group, "topology_shard_count"), median_input_text_tokens_per_endpoint_sec=_median_or_none(group, "input_text_tokens_per_endpoint_sec"), median_input_text_tokens_per_gpu_sec=_median_or_none(group, "input_text_tokens_per_gpu_sec"), + median_route_total_row_count=_median_or_none(group, "route_total_row_count"), + median_route_rule_row_count=_median_or_none(group, "route_rule_row_count"), + median_route_fallback_row_count=_median_or_none(group, "route_fallback_row_count"), median_final_entity_count=_median_or_none(group, "final_entity_count"), total_empty_detection_count=total_empty_detection_count, empty_detection_rate=_safe_ratio(total_empty_detection_count, total_record_count), @@ -1344,6 +1359,7 @@ def _build_group_row(keys: tuple[Any, ...], group: pd.DataFrame) -> GroupAnalysi median_augmented_new_final_value_count=_median_or_none(group, "augmented_new_final_value_count"), median_artifact_final_entity_count=_median_or_none(group, "artifact_final_entity_count"), median_artifact_final_detector_entity_count=_median_or_none(group, "artifact_final_detector_entity_count"), + median_artifact_final_rule_entity_count=_median_or_none(group, "artifact_final_rule_entity_count"), median_artifact_final_augmenter_entity_count=_median_or_none(group, "artifact_final_augmenter_entity_count"), median_artifact_final_entity_signature_count=_median_or_none(group, "artifact_final_entity_signature_count"), ) diff --git a/tools/measurement/analyze_staged_detection_output.py b/tools/measurement/analyze_staged_detection_output.py index 737d439a..225cc045 100644 --- a/tools/measurement/analyze_staged_detection_output.py +++ b/tools/measurement/analyze_staged_detection_output.py @@ -40,6 +40,9 @@ class LogFormat(StrEnum): _log_format = LogFormat.plain +_FAST_LANE_MIN_CASES = 3 + + class StagedCaseAnalysisRow(BaseModel): source_path: str case_id: str @@ -51,6 +54,7 @@ class StagedCaseAnalysisRow(BaseModel): model_elapsed_sec: float | None = None 
model_phase_count: int = 0 model_request_count: int = 0 + rule_covered_label_set: bool = False prompt_tokens: int = 0 completion_tokens: int = 0 total_tokens: int = 0 @@ -78,6 +82,7 @@ class StagedGroupAnalysisRow(BaseModel): completed_case_count: int = 0 error_case_count: int = 0 failed_case_rate: float | None = None + rule_covered_case_count: int = 0 elapsed_sec_sum: float | None = None elapsed_sec_mean: float | None = None model_elapsed_sec_sum: float | None = None @@ -95,6 +100,8 @@ class StagedGroupAnalysisRow(BaseModel): direct_only_final_entity_signature_count_sum: int = 0 baseline_shared_signature_rate: float | None = None baseline_loss_signature_rate: float | None = None + fast_lane_verdict: str = "review" + flags: list[str] = Field(default_factory=list) class LabelDeltaAnalysisRow(BaseModel): @@ -206,6 +213,7 @@ def _build_case_row(record: dict[str, Any], *, source_path: Path) -> StagedCaseA model_elapsed_sec=_optional_float(record.get("model_elapsed_sec")), model_phase_count=_int_value(record.get("model_phase_count")), model_request_count=_int_value(record.get("model_request_count")), + rule_covered_label_set=bool(record.get("rule_covered_label_set")), **_usage_fields(_dict_value(record.get("total_usage"))), **_entity_count_fields(record), baseline_final_entity_signature_count=baseline_count, @@ -256,12 +264,22 @@ def _build_group_row(seed_source: str | None, rows: list[StagedCaseAnalysisRow]) shared_total = _sum_optional_int(rows, "shared_final_entity_signature_count") baseline_only_total = _sum_optional_int(rows, "baseline_only_final_entity_signature_count") model_request_count = sum(row.model_request_count for row in rows) + rule_covered_count = sum(1 for row in rows if row.rule_covered_label_set) + flags = _fast_lane_flags( + case_count=case_count, + error_count=error_count, + baseline_total=baseline_total, + baseline_only_total=baseline_only_total, + model_request_count=model_request_count, + rule_covered_count=rule_covered_count, + ) return 
StagedGroupAnalysisRow( seed_source=seed_source, case_count=case_count, completed_case_count=case_count - error_count, error_case_count=error_count, failed_case_rate=_rate(error_count, case_count), + rule_covered_case_count=rule_covered_count, elapsed_sec_sum=_sum_optional_float(rows, "elapsed_sec"), elapsed_sec_mean=_mean_optional_float(rows, "elapsed_sec"), model_elapsed_sec_sum=_sum_optional_float(rows, "model_elapsed_sec"), @@ -281,6 +299,8 @@ def _build_group_row(seed_source: str | None, rows: list[StagedCaseAnalysisRow]) ), baseline_shared_signature_rate=_rate(shared_total, baseline_total), baseline_loss_signature_rate=_rate(baseline_only_total, baseline_total), + fast_lane_verdict=_fast_lane_verdict(flags), + flags=flags, ) @@ -288,6 +308,39 @@ def _group_sort_key(item: tuple[str | None, list[StagedCaseAnalysisRow]]) -> str return item[0] or "" +def _fast_lane_flags( + *, + case_count: int, + error_count: int, + baseline_total: int, + baseline_only_total: int, + model_request_count: int, + rule_covered_count: int, +) -> list[str]: + flags: list[str] = [] + if case_count < _FAST_LANE_MIN_CASES: + flags.append("too_few_cases") + if error_count: + flags.append("case_errors") + if baseline_total == 0: + flags.append("missing_baseline_comparison") + if baseline_only_total: + flags.append("baseline_signature_loss") + if model_request_count: + flags.append("uses_model") + if rule_covered_count != case_count: + flags.append("not_fully_rule_covered") + return flags + + +def _fast_lane_verdict(flags: list[str]) -> str: + if "case_errors" in flags or "baseline_signature_loss" in flags: + return "reject" + if not flags: + return "fast_lane_candidate" + return "review" + + def build_label_delta_rows(cases: list[StagedCaseAnalysisRow]) -> list[LabelDeltaAnalysisRow]: counts: Counter[tuple[str | None, str, str]] = Counter() for case in cases: @@ -424,6 +477,7 @@ def _render_group_line(group: StagedGroupAnalysisRow, label_deltas: list[LabelDe lost = 
_top_labels(label_deltas, seed_source=group.seed_source, delta_type="baseline_only") return ( f"- {label}: cases={group.case_count}, errors={group.error_case_count}, " + f"verdict={group.fast_lane_verdict}, flags={_label_count_summary(group.flags)}, " f"elapsed_sum={_fmt_float(group.elapsed_sec_sum)}s, " f"model_elapsed_sum={_fmt_float(group.model_elapsed_sec_sum)}s, " f"requests={group.model_request_count_sum}, tokens={group.total_tokens_sum}, " @@ -433,6 +487,11 @@ def _render_group_line(group: StagedGroupAnalysisRow, label_deltas: list[LabelDe f"direct_only={group.direct_only_final_entity_signature_count_sum}, lost_labels={lost}" ) + +def _label_count_summary(items: list[str]) -> str: + return "[]" if not items else "[" + ", ".join(items) + "]" + + def _top_labels(label_deltas: list[LabelDeltaAnalysisRow], *, seed_source: str | None, delta_type: str) -> str: matches = [delta for delta in label_deltas if delta.seed_source == seed_source and delta.delta_type == delta_type] if not matches: diff --git a/tools/measurement/compare_strategy_pairs.py b/tools/measurement/compare_strategy_pairs.py index bdbad0d5..a4d9550d 100644 --- a/tools/measurement/compare_strategy_pairs.py +++ b/tools/measurement/compare_strategy_pairs.py @@ -5,12 +5,12 @@ Usage: uv run python tools/measurement/compare_strategy_pairs.py analysis/case_analysis.csv \ - --baseline-strategy default --candidate-strategy detector_native_validate_no_augment + --baseline-strategy no_augment --candidate-strategy rules_filter_guardrail_no_augment uv run python tools/measurement/compare_strategy_pairs.py analysis/case_analysis.parquet \ --baseline-config default --candidate-config no-augment --output comparisons.csv uv run python tools/measurement/compare_strategy_pairs.py baseline/case_analysis.csv \ --candidate-case-analysis candidate/case_analysis.csv \ - --baseline-strategy default --candidate-strategy native_single_pass + --baseline-strategy no_augment --candidate-strategy rules_guardrail_no_augment """ 
from __future__ import annotations @@ -165,6 +165,8 @@ class ComparisonRow(BaseModel): augmented_new_final_value_count_delta: float | None = None baseline_detector_entity_count: float | None = None candidate_detector_entity_count: float | None = None + baseline_rule_entity_count: float | None = None + candidate_rule_entity_count: float | None = None baseline_augmenter_entity_count: float | None = None candidate_augmenter_entity_count: float | None = None baseline_only_final_entity_signature_count: int | None = None @@ -442,6 +444,7 @@ def _single_string(rows: pd.DataFrame, column: str) -> str | None: "augmented_entity_count", "augmented_new_final_value_count", "artifact_final_detector_entity_count", + "artifact_final_rule_entity_count", "artifact_final_augmenter_entity_count", ] @@ -518,6 +521,8 @@ def _source_counts(baseline: dict[str, object], candidate: dict[str, object]) -> return { "baseline_detector_entity_count": _optional_float(baseline.get("artifact_final_detector_entity_count")), "candidate_detector_entity_count": _optional_float(candidate.get("artifact_final_detector_entity_count")), + "baseline_rule_entity_count": _optional_float(baseline.get("artifact_final_rule_entity_count")), + "candidate_rule_entity_count": _optional_float(candidate.get("artifact_final_rule_entity_count")), "baseline_augmenter_entity_count": _optional_float(baseline.get("artifact_final_augmenter_entity_count")), "candidate_augmenter_entity_count": _optional_float(candidate.get("artifact_final_augmenter_entity_count")), "baseline_original_value_leak_label_counts": _coerce_count_map( @@ -603,6 +608,8 @@ def _comparison_flags( _append_if_positive(flags, metrics, "observed_total_requests_delta", "request_increase") if _candidate_lacks_detector_entities(metrics): flags.append("no_candidate_detector_entities") + if _optional_float(metrics.get("candidate_rule_entity_count")): + flags.append("candidate_uses_rule_entities") if candidate_strategy in _SKIPS_LLM_VALIDATION_STRATEGIES: 
flags.append("candidate_skips_llm_validation") if _replacement_only_detection_instability( @@ -654,7 +661,7 @@ def _stable_signature_loss_metric(metrics: dict[str, object]) -> str: return "baseline_stable_candidate_unstable_final_entity_signature_count" -_SKIPS_LLM_VALIDATION_STRATEGIES = {"detector_only"} +_SKIPS_LLM_VALIDATION_STRATEGIES = {"detector_only", "rules_guardrail_detector_only", "rules_only"} def _has_metric_pair(metrics: dict[str, object], name: str) -> bool: @@ -732,6 +739,7 @@ def _safety_verdict(metrics: dict[str, object]) -> SafetyVerdict: return SafetyVerdict.fail if flags & { "no_candidate_detector_entities", + "candidate_uses_rule_entities", "candidate_skips_llm_validation", "failed_request_increase", "bridge_fallback_increase", @@ -804,6 +812,7 @@ def _candidate_lacks_detector_entities(metrics: dict[str, object]) -> bool: def _known_non_detector_candidate_count(metrics: dict[str, object]) -> float | None: known_counts = [ + _optional_float(metrics.get("candidate_rule_entity_count")), _optional_float(metrics.get("candidate_augmenter_entity_count")), ] if all(value is None for value in known_counts): diff --git a/tools/measurement/detection_strategies.py b/tools/measurement/detection_strategies.py index 8c45c6ab..a8955cd9 100644 --- a/tools/measurement/detection_strategies.py +++ b/tools/measurement/detection_strategies.py @@ -6,6 +6,7 @@ from __future__ import annotations import json +import re import time from collections import Counter from collections.abc import Callable, Iterator @@ -17,7 +18,7 @@ import pandas as pd from data_designer.config import custom_column_generator -from data_designer.config.column_configs import CustomColumnConfig, LLMTextColumnConfig +from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig, LLMTextColumnConfig from data_designer.config.models import ModelConfig from dd_parser_compat import _load_embedded_json from direct_detection_probe import DirectDetectionRequest, 
DirectGenerationRequest, PromptMode, build_direct_prompt @@ -44,14 +45,17 @@ from anonymizer.engine.constants import ( COL_AUGMENTED_ENTITIES, COL_DETECTED_ENTITIES, + COL_INITIAL_TAGGED_TEXT, COL_MERGED_ENTITIES, COL_RAW_DETECTED, COL_SEED_ENTITIES, COL_SEED_ENTITIES_JSON, COL_SEED_VALIDATION_CANDIDATES, + COL_TAG_NOTATION, COL_TAGGED_TEXT, COL_TEXT, COL_VALIDATED_ENTITIES, + COL_VALIDATED_SEED_ENTITIES, COL_VALIDATION_DECISIONS, _jinja, ) @@ -69,10 +73,14 @@ EntitySpan, build_tagged_text, expand_entity_occurrences, + get_tag_notation, + parse_raw_entities, resolve_overlaps, ) +from anonymizer.engine.detection.rules import detect_high_confidence_entities from anonymizer.engine.ndd.adapter import FailedRecord from anonymizer.engine.ndd.model_loader import resolve_model_alias, resolve_model_aliases +from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows from anonymizer.engine.schemas import AugmentedEntitiesSchema, EntitiesSchema, ValidationCandidatesSchema from anonymizer.measurement import record_model_workflow @@ -83,14 +91,37 @@ _DIRECT_MAX_TOKENS = 4096 _DIRECT_TIMEOUT_SEC = 180.0 _DIRECT_MAX_WORKERS = 4 +_STRUCTURED_ASSIGNMENT_RE = re.compile( + r"(?" 
+ r"api[_-]?key|aws[_-]?access[_-]?key[_-]?id|access[_-]?key[_-]?id|hf[_-]?token|" + r"token|auth[_-]?token|session[_-]?id|authorization|" + r"password|pass|secret|aws[_-]?secret[_-]?access[_-]?key|django[_-]?secret|database[_-]?url|" + r"pin|user(?:_?name)?|username|login|account|cookie|" + r"trace[-_]?id|request[-_]?id|req[-_]?id|order[-_]?id|tenant[-_]?id|unique[-_]?id|" + r"url|uri|endpoint|callback|email" + r")['\"]?\s*[:=]\s*" + r"(?:['\"](?P[^'\"\r\n]+)['\"]|(?P[^\s'\",;]+))", + flags=re.IGNORECASE, +) class ExperimentalDetectionStrategy(StrEnum): default = "default" prose_augment_focus = "prose_augment_focus" compact_validation = "compact_validation" + rules_guardrail_compact_validation = "rules_guardrail_compact_validation" + rules_guardrail = "rules_guardrail" + rules_filter_guardrail = "rules_filter_guardrail" no_augment = "no_augment" + rules_seed_no_augment = "rules_seed_no_augment" + rules_guardrail_no_augment = "rules_guardrail_no_augment" + rules_filter_guardrail_no_augment = "rules_filter_guardrail_no_augment" + rules_guardrail_detector_only = "rules_guardrail_detector_only" detector_only = "detector_only" + rules_only = "rules_only" + rules_covered_or_default = "rules_covered_or_default" + native_rules_router = "native_rules_router" native_candidate_validate_no_augment = "native_candidate_validate_no_augment" detector_native_validate_no_augment = "detector_native_validate_no_augment" detector_native_validate_native_augment = "detector_native_validate_native_augment" @@ -104,6 +135,7 @@ class ExperimentalDetectionStrategy(StrEnum): _DetectAndValidate = Callable[..., dw.EntityDetectionResult] _AugmentPrompt = Callable[..., str] +_MaterializeFinalEntities = Callable[..., dict] PROSE_AUGMENT_FOCUS_TEXT = """\ Contextual prose recall focus: - Re-scan untagged narrative prose for organization and institution names, named facilities, labs, research centers, street or place names, self-described beliefs, occupations, titles, family member names, and other 
quasi-identifiers that combine with already tagged entities. @@ -131,6 +163,14 @@ class NativeDetectionRuntime: max_workers: int = _DIRECT_MAX_WORKERS +@dataclass(frozen=True) +class _NoAugmentOptions: + include_rules: bool + final_rule_guardrail: bool = False + filter_rule_overlaps: bool = False + rule_labels: tuple[str, ...] = () + + @dataclass(frozen=True) class _NativeStagedTask: ordinal: int @@ -192,6 +232,7 @@ class _DetectorNativeValidationRowResult: def experimental_detection_strategy_context( strategy: ExperimentalDetectionStrategy, *, + rule_labels: list[str] | None = None, native_client: DirectDetectionClient | None = None, gliner_seed_client: GlinerSeedClient | None = None, native_runtime: NativeDetectionRuntime | None = None, @@ -203,12 +244,19 @@ def experimental_detection_strategy_context( original_method = dw.EntityDetectionWorkflow.detect_and_validate_entities original_augment_prompt = dw._get_augment_prompt + original_materialize_final_entities = dw._materialize_final_entities + if rule_labels: + dw._materialize_final_entities = _make_rule_label_materializer( # type: ignore[assignment] + original_materialize_final_entities, + rule_labels=rule_labels, + ) if strategy == ExperimentalDetectionStrategy.prose_augment_focus: dw._get_augment_prompt = _make_prose_augment_prompt(original_augment_prompt) # type: ignore[assignment] else: dw.EntityDetectionWorkflow.detect_and_validate_entities = _method_for_strategy( # type: ignore[method-assign] strategy, original=original_method, + rule_labels=rule_labels, native_client=native_client, gliner_seed_client=gliner_seed_client, native_runtime=native_runtime or NativeDetectionRuntime(), @@ -218,6 +266,20 @@ def experimental_detection_strategy_context( finally: dw.EntityDetectionWorkflow.detect_and_validate_entities = original_method # type: ignore[method-assign] dw._get_augment_prompt = original_augment_prompt # type: ignore[assignment] + dw._materialize_final_entities = original_materialize_final_entities # 
type: ignore[assignment] + + +def _make_rule_label_materializer( + original: _MaterializeFinalEntities, + *, + rule_labels: list[str], +) -> _MaterializeFinalEntities: + def materialize_final_entities(raw: object, *, allowed_labels: set[str] | None) -> dict: + if allowed_labels is None: + return original(raw, allowed_labels=allowed_labels) + return original(raw, allowed_labels={*allowed_labels, *rule_labels}) + + return materialize_final_entities def _make_prose_augment_prompt(original: _AugmentPrompt) -> _AugmentPrompt: @@ -232,6 +294,7 @@ def _method_for_strategy( strategy: ExperimentalDetectionStrategy, *, original: _DetectAndValidate | None = None, + rule_labels: list[str] | None = None, native_client: DirectDetectionClient | None = None, gliner_seed_client: GlinerSeedClient | None = None, native_runtime: NativeDetectionRuntime | None = None, @@ -241,10 +304,49 @@ def _method_for_strategy( if original is None: raise ValueError("compact_validation requires the original detection method") return _make_default_compact_validation_method(original) + if strategy == ExperimentalDetectionStrategy.rules_guardrail: + if original is None: + raise ValueError("rules_guardrail requires the original detection method") + return _make_default_with_rule_guardrail_method(original, rule_labels=rule_labels) + if strategy == ExperimentalDetectionStrategy.rules_filter_guardrail: + return _make_validated_augmented_rule_filter_guardrail_method(rule_labels=rule_labels) + if strategy == ExperimentalDetectionStrategy.rules_guardrail_compact_validation: + if original is None: + raise ValueError("rules_guardrail_compact_validation requires the original detection method") + return _make_default_with_rule_guardrail_method( + original, + rule_labels=rule_labels, + compact_validation=True, + ) if strategy == ExperimentalDetectionStrategy.no_augment: - return _make_validated_no_augment_method() + return _make_validated_no_augment_method(include_rules=False) + if strategy == 
ExperimentalDetectionStrategy.rules_seed_no_augment: + return _make_validated_no_augment_method(include_rules=True, rule_labels=rule_labels) + if strategy == ExperimentalDetectionStrategy.rules_guardrail_no_augment: + return _make_validated_no_augment_method( + include_rules=False, + final_rule_guardrail=True, + rule_labels=rule_labels, + ) + if strategy == ExperimentalDetectionStrategy.rules_filter_guardrail_no_augment: + return _make_validated_no_augment_method( + include_rules=False, + final_rule_guardrail=True, + filter_rule_overlaps=True, + rule_labels=rule_labels, + ) + if strategy == ExperimentalDetectionStrategy.rules_guardrail_detector_only: + return _make_detector_only_with_rule_guardrail_method(rule_labels=rule_labels) if strategy == ExperimentalDetectionStrategy.detector_only: return _detect_with_detector_only + if strategy == ExperimentalDetectionStrategy.rules_covered_or_default: + if original is None: + raise ValueError("rules_covered_or_default requires the original detection method") + return _make_rules_covered_or_default_method(original) + if strategy == ExperimentalDetectionStrategy.rules_only: + return _detect_with_rules_only + if strategy == ExperimentalDetectionStrategy.native_rules_router: + return _make_native_rules_router_method(native_client=native_client, native_runtime=runtime) if strategy == ExperimentalDetectionStrategy.native_candidate_validate_no_augment: return _make_native_candidate_validate_no_augment_method(native_client=native_client, native_runtime=runtime) if strategy == ExperimentalDetectionStrategy.detector_native_validate_no_augment: @@ -314,7 +416,236 @@ def detect_and_validate_entities( return detect_and_validate_entities -def _make_validated_no_augment_method() -> _DetectAndValidate: +def _make_default_with_rule_guardrail_method( + original: _DetectAndValidate, + *, + rule_labels: list[str] | None = None, + compact_validation: bool = False, +) -> _DetectAndValidate: + def detect_and_validate_entities( + self: 
dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int = dw._DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = dw._DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + entity_labels: list[str] | None = None, + data_summary: str | None = None, + preview_num_records: int | None = None, + ) -> dw.EntityDetectionResult: + result = original( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=not compact_validation, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + output = _apply_rule_guardrail( + result.dataframe.copy(), + labels=_rule_labels_for_detection(entity_labels, extra_rule_labels=rule_labels), + ) + return dw.EntityDetectionResult(dataframe=output, failed_records=result.failed_records) + + return detect_and_validate_entities + + +def _make_validated_augmented_rule_filter_guardrail_method( + *, + rule_labels: list[str] | None = None, +) -> _DetectAndValidate: + def detect_and_validate_entities( + self: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int = dw._DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = dw._DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + entity_labels: list[str] | None = None, + data_summary: str | None = None, + preview_num_records: int | None = None, + ) -> dw.EntityDetectionResult: + return _run_validated_augmented_rule_filter_guardrail_detection( 
+ self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + preview_num_records=preview_num_records, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + entity_labels=entity_labels, + data_summary=data_summary, + rule_labels=rule_labels, + ) + + return detect_and_validate_entities + + +def _run_validated_augmented_rule_filter_guardrail_detection( + workflow: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + preview_num_records: int | None, + validation_max_entities_per_call: int, + validation_excerpt_window_chars: int, + entity_labels: list[str] | None, + data_summary: str | None, + rule_labels: list[str] | None, +) -> dw.EntityDetectionResult: + labels = dw._resolve_detection_labels(entity_labels) + workflow_model_configs = workflow._inject_detector_params( + model_configs=model_configs, + selected_models=selected_models, + labels=labels, + gliner_detection_threshold=gliner_detection_threshold, + ) + detection_result = workflow._adapter.run_workflow( + dataframe, + model_configs=workflow_model_configs, + columns=_validated_augmented_rule_filter_guardrail_columns( + selected_models=selected_models, + labels=labels, + data_summary=data_summary, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + strict_labels=entity_labels is not None, + rule_labels=rule_labels, + ), + workflow_name="entity-detection-rules-filter-guardrail", + preview_num_records=preview_num_records, + ) + return dw.EntityDetectionResult( + dataframe=detection_result.dataframe.copy(), + failed_records=detection_result.failed_records, + ) + + +def _validated_augmented_rule_filter_guardrail_columns( + *, + 
selected_models: DetectionModelSelection, + labels: list[str], + data_summary: str | None, + validation_max_entities_per_call: int, + validation_excerpt_window_chars: int, + strict_labels: bool, + rule_labels: list[str] | None, +) -> list[LLMTextColumnConfig | LLMStructuredColumnConfig | CustomColumnConfig]: + validator_params = _validator_params( + selected_models=selected_models, + labels=labels, + data_summary=data_summary, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + ) + rule_detection_labels = _rule_labels_for_detection(labels, extra_rule_labels=rule_labels) + return [ + LLMTextColumnConfig( + name=COL_RAW_DETECTED, + prompt=_jinja(COL_TEXT), + model_alias=_detector_alias(selected_models), + ), + CustomColumnConfig( + name=COL_SEED_ENTITIES, + generator_function=_make_parse_detected_entities_filtering_rules(rule_detection_labels), + ), + CustomColumnConfig(name=COL_SEED_VALIDATION_CANDIDATES, generator_function=prepare_validation_inputs), + _validation_decisions_column(selected_models, validator_params), + CustomColumnConfig(name=COL_VALIDATED_ENTITIES, generator_function=enrich_validation_decisions), + CustomColumnConfig( + name=COL_SEED_ENTITIES_JSON, + generator_function=_make_apply_validation_to_seed_entities_with_additive_rule_guardrail( + rule_detection_labels + ), + ), + LLMStructuredColumnConfig( + name=COL_AUGMENTED_ENTITIES, + prompt=dw._get_augment_prompt(data_summary=data_summary, labels=labels, strict_labels=strict_labels), + model_alias=resolve_model_alias("entity_augmenter", selected_models), + output_format=AugmentedEntitiesSchema, + ), + CustomColumnConfig(name=COL_MERGED_ENTITIES, generator_function=merge_and_build_candidates), + CustomColumnConfig( + name=COL_DETECTED_ENTITIES, + generator_function=_make_apply_validation_and_finalize_with_additive_rule_guardrail(rule_detection_labels), + ), + ] + + +def _rule_labels_for_detection( + entity_labels: 
list[str] | None, + *, + extra_rule_labels: list[str] | tuple[str, ...] | None = None, +) -> list[str]: + labels = set(dw._resolve_detection_labels(entity_labels)) + labels.update(extra_rule_labels or []) + return sorted(labels) + + +def _apply_rule_guardrail(dataframe: pd.DataFrame, *, labels: list[str]) -> pd.DataFrame: + if COL_TEXT not in dataframe.columns or COL_DETECTED_ENTITIES not in dataframe.columns: + return dataframe + dataframe[COL_DETECTED_ENTITIES] = dataframe[COL_DETECTED_ENTITIES].astype("object") + if COL_TAGGED_TEXT in dataframe.columns: + dataframe[COL_TAGGED_TEXT] = dataframe[COL_TAGGED_TEXT].astype("object") + for index, row in dataframe.iterrows(): + guarded = _guarded_entities( + text=str(row.get(COL_TEXT, "")), raw_entities=row.get(COL_DETECTED_ENTITIES), labels=labels + ) + dataframe.at[index, COL_DETECTED_ENTITIES] = EntitiesSchema( + entities=[entity.as_dict() for entity in guarded] + ).model_dump(mode="json") + if COL_TAGGED_TEXT in dataframe.columns: + dataframe.at[index, COL_TAGGED_TEXT] = build_tagged_text(text=str(row.get(COL_TEXT, "")), entities=guarded) + return dataframe + + +def _guarded_entities(*, text: str, raw_entities: object, labels: list[str]) -> list[EntitySpan]: + final_spans = _entity_spans_from_payload(raw_entities) + rule_spans = detect_high_confidence_entities(text, labels=labels) + return _merge_rule_guardrail_spans(final_spans, rule_spans) + + +def _merge_rule_guardrail_spans(final_spans: list[EntitySpan], rule_spans: list[EntitySpan]) -> list[EntitySpan]: + filtered_final = [ + entity + for entity in final_spans + if not any( + rule.start_position == entity.start_position + and rule.end_position == entity.end_position + and rule.label != entity.label + for rule in rule_spans + ) + ] + return resolve_overlaps([*filtered_final, *rule_spans]) + + +def _make_validated_no_augment_method( + *, + include_rules: bool, + final_rule_guardrail: bool = False, + filter_rule_overlaps: bool = False, + rule_labels: list[str] | 
None = None, +) -> _DetectAndValidate: + options = _NoAugmentOptions( + include_rules=include_rules, + final_rule_guardrail=final_rule_guardrail, + filter_rule_overlaps=filter_rule_overlaps, + rule_labels=tuple(rule_labels or ()), + ) + def detect_and_validate_entities( self: dw.EntityDetectionWorkflow, dataframe: pd.DataFrame, @@ -339,6 +670,7 @@ def detect_and_validate_entities( validation_excerpt_window_chars=validation_excerpt_window_chars, entity_labels=entity_labels, data_summary=data_summary, + options=options, ) return detect_and_validate_entities @@ -356,6 +688,7 @@ def _run_validated_no_augment_detection( validation_excerpt_window_chars: int, entity_labels: list[str] | None, data_summary: str | None, + options: _NoAugmentOptions, ) -> dw.EntityDetectionResult: labels = dw._resolve_detection_labels(entity_labels) workflow_model_configs = workflow._inject_detector_params( @@ -373,8 +706,9 @@ def _run_validated_no_augment_detection( data_summary=data_summary, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + options=options, ), - workflow_name="entity-detection-no-augment", + workflow_name=_workflow_name_for_no_augment(options), preview_num_records=preview_num_records, ) return dw.EntityDetectionResult( @@ -390,6 +724,7 @@ def _validated_no_augment_columns( data_summary: str | None, validation_max_entities_per_call: int, validation_excerpt_window_chars: int, + options: _NoAugmentOptions, ) -> list[LLMTextColumnConfig | CustomColumnConfig]: validator_params = _validator_params( selected_models=selected_models, @@ -398,18 +733,28 @@ def _validated_no_augment_columns( validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, ) + parse_generator = _parse_generator( + labels=_rule_labels_for_detection(labels, extra_rule_labels=options.rule_labels), + include_rules=options.include_rules, + 
filter_rule_overlaps=options.filter_rule_overlaps, + ) return [ LLMTextColumnConfig( name=COL_RAW_DETECTED, prompt=_jinja(COL_TEXT), model_alias=_detector_alias(selected_models) ), - CustomColumnConfig(name=COL_SEED_ENTITIES, generator_function=parse_detected_entities), + CustomColumnConfig(name=COL_SEED_ENTITIES, generator_function=parse_generator), CustomColumnConfig(name=COL_SEED_VALIDATION_CANDIDATES, generator_function=prepare_validation_inputs), _validation_decisions_column(selected_models, validator_params), CustomColumnConfig(name=COL_VALIDATED_ENTITIES, generator_function=enrich_validation_decisions), CustomColumnConfig(name=COL_SEED_ENTITIES_JSON, generator_function=apply_validation_to_seed_entities), CustomColumnConfig(name=COL_AUGMENTED_ENTITIES, generator_function=_empty_augmentation), CustomColumnConfig(name=COL_MERGED_ENTITIES, generator_function=merge_and_build_candidates), - CustomColumnConfig(name=COL_DETECTED_ENTITIES, generator_function=apply_validation_and_finalize), + CustomColumnConfig( + name=COL_DETECTED_ENTITIES, + generator_function=_finalizer( + _rule_labels_for_detection(labels, extra_rule_labels=options.rule_labels), options + ), + ), ] @@ -427,6 +772,24 @@ def _validation_decisions_column( ) +def _finalizer(labels: list[str], options: _NoAugmentOptions) -> Callable[[dict[str, Any]], dict[str, Any]]: + if options.filter_rule_overlaps: + return _make_apply_validation_and_finalize_with_additive_rule_guardrail(labels) + if options.final_rule_guardrail: + return _make_apply_validation_and_finalize_with_rule_guardrail(labels) + return apply_validation_and_finalize + + +def _workflow_name_for_no_augment(options: _NoAugmentOptions) -> str: + if options.filter_rule_overlaps: + return "entity-detection-rules-filter-guardrail-no-augment" + if options.final_rule_guardrail: + return "entity-detection-rules-guardrail-no-augment" + if options.include_rules: + return "entity-detection-rules-no-augment" + return "entity-detection-no-augment" + + def 
_validator_params( *, selected_models: DetectionModelSelection, @@ -470,10 +833,41 @@ def _detect_with_detector_only( gliner_detection_threshold=gliner_detection_threshold, entity_labels=entity_labels, preview_num_records=preview_num_records, + rule_labels=None, workflow_name="entity-detection-detector-only", ) +def _make_detector_only_with_rule_guardrail_method(rule_labels: list[str] | None) -> _DetectAndValidate: + def detect_and_validate_entities( + self: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int = dw._DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = dw._DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, + entity_labels: list[str] | None = None, + data_summary: str | None = None, + preview_num_records: int | None = None, + ) -> dw.EntityDetectionResult: + return _run_detector_only_detection( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + entity_labels=entity_labels, + preview_num_records=preview_num_records, + rule_labels=_rule_labels_for_detection(entity_labels, extra_rule_labels=rule_labels), + workflow_name="entity-detection-rules-guardrail-detector-only", + ) + + return detect_and_validate_entities + + def _run_detector_only_detection( workflow: dw.EntityDetectionWorkflow, dataframe: pd.DataFrame, @@ -483,6 +877,7 @@ def _run_detector_only_detection( gliner_detection_threshold: float, entity_labels: list[str] | None, preview_num_records: int | None, + rule_labels: list[str] | None, workflow_name: str, ) -> dw.EntityDetectionResult: labels = dw._resolve_detection_labels(entity_labels) @@ -495,7 +890,7 @@ def _run_detector_only_detection( detection_result = workflow._adapter.run_workflow( dataframe, 
model_configs=workflow_model_configs, - columns=_detector_only_columns(selected_models), + columns=_detector_only_columns(selected_models, rule_labels=rule_labels), workflow_name=workflow_name, preview_num_records=preview_num_records, ) @@ -505,7 +900,11 @@ def _run_detector_only_detection( ) -def _detector_only_columns(selected_models: DetectionModelSelection) -> list[LLMTextColumnConfig | CustomColumnConfig]: +def _detector_only_columns( + selected_models: DetectionModelSelection, + *, + rule_labels: list[str] | None, +) -> list[LLMTextColumnConfig | CustomColumnConfig]: return [ LLMTextColumnConfig( name=COL_RAW_DETECTED, @@ -514,10 +913,16 @@ def _detector_only_columns(selected_models: DetectionModelSelection) -> list[LLM ), CustomColumnConfig(name=COL_SEED_ENTITIES, generator_function=parse_detected_entities), CustomColumnConfig(name=COL_SEED_ENTITIES_JSON, generator_function=_copy_seed_entities_json), - CustomColumnConfig(name=COL_DETECTED_ENTITIES, generator_function=_finalize_detector_only), + CustomColumnConfig(name=COL_DETECTED_ENTITIES, generator_function=_detector_only_finalizer(rule_labels)), ] +def _detector_only_finalizer(rule_labels: list[str] | None) -> Callable[[dict[str, Any]], dict[str, Any]]: + if rule_labels is None: + return _finalize_detector_only + return _make_finalize_detector_only_with_rule_guardrail(rule_labels) + + @custom_column_generator(required_columns=[COL_SEED_ENTITIES]) def _copy_seed_entities_json(row: dict[str, Any]) -> dict[str, Any]: row[COL_SEED_ENTITIES_JSON] = json.dumps( @@ -538,12 +943,191 @@ def _finalize_detector_only(row: dict[str, Any]) -> dict[str, Any]: return row +def _make_finalize_detector_only_with_rule_guardrail(labels: list[str]) -> Callable[[dict[str, Any]], dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_SEED_ENTITIES], + side_effect_columns=[COL_TAGGED_TEXT], + ) + def finalize_detector_only_with_rule_guardrail(row: dict[str, Any]) -> dict[str, Any]: + row = 
_finalize_detector_only(row) + text = str(row.get(COL_TEXT, "")) + final_spans = _entity_spans_from_payload(row.get(COL_DETECTED_ENTITIES, {})) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _merge_rule_guardrail_spans(final_spans, rule_spans) + row[COL_DETECTED_ENTITIES] = EntitiesSchema(entities=[span.as_dict() for span in guarded]).model_dump( + mode="json" + ) + row[COL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) + return row + + return finalize_detector_only_with_rule_guardrail + + @custom_column_generator(required_columns=[COL_TEXT]) def _empty_augmentation(row: dict[str, Any]) -> dict[str, Any]: row[COL_AUGMENTED_ENTITIES] = AugmentedEntitiesSchema().model_dump(mode="json") return row +def _parse_generator( + *, + labels: list[str], + include_rules: bool, + filter_rule_overlaps: bool, +) -> Callable[[dict[str, Any]], dict[str, Any]]: + if filter_rule_overlaps: + return _make_parse_detected_entities_filtering_rules(labels) + if include_rules: + return _make_parse_detected_entities_with_rules(labels) + return parse_detected_entities + + +def _make_parse_detected_entities_with_rules(labels: list[str]) -> Callable[[dict[str, Any]], dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_RAW_DETECTED], + side_effect_columns=[COL_TAG_NOTATION], + ) + def parse_detected_entities_with_rules(row: dict[str, Any]) -> dict[str, Any]: + text = str(row.get(COL_TEXT, "")) + detected = parse_raw_entities(raw_response=str(row.get(COL_RAW_DETECTED, "")), text=text) + rule_spans = detect_high_confidence_entities(text, labels=labels) + row[COL_SEED_ENTITIES] = EntitiesSchema( + entities=[entity.as_dict() for entity in resolve_overlaps([*detected, *rule_spans])] + ).model_dump(mode="json") + row[COL_TAG_NOTATION] = get_tag_notation(text=text) + return row + + return parse_detected_entities_with_rules + + +def _make_parse_detected_entities_filtering_rules(labels: list[str]) -> Callable[[dict[str, Any]], 
dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_RAW_DETECTED], + side_effect_columns=[COL_TAG_NOTATION], + ) + def parse_detected_entities_filtering_rules(row: dict[str, Any]) -> dict[str, Any]: + text = str(row.get(COL_TEXT, "")) + detected = parse_raw_entities(raw_response=str(row.get(COL_RAW_DETECTED, "")), text=text) + rule_spans = detect_high_confidence_entities(text, labels=labels) + filtered = [entity for entity in detected if not _is_rule_covered_detector_span(entity, rule_spans)] + row[COL_SEED_ENTITIES] = EntitiesSchema( + entities=[entity.as_dict() for entity in resolve_overlaps(filtered)] + ).model_dump(mode="json") + row[COL_TAG_NOTATION] = get_tag_notation(text=text) + return row + + return parse_detected_entities_filtering_rules + + +def _make_apply_validation_and_finalize_with_rule_guardrail( + labels: list[str], +) -> Callable[[dict[str, Any]], dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_MERGED_ENTITIES, COL_VALIDATED_ENTITIES], + side_effect_columns=[COL_TAGGED_TEXT], + ) + def apply_validation_and_finalize_with_rule_guardrail(row: dict[str, Any]) -> dict[str, Any]: + row = apply_validation_and_finalize(row) + text = str(row.get(COL_TEXT, "")) + final_spans = _entity_spans_from_payload(row.get(COL_DETECTED_ENTITIES, {})) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _merge_rule_guardrail_spans(final_spans, rule_spans) + row[COL_DETECTED_ENTITIES] = EntitiesSchema(entities=[entity.as_dict() for entity in guarded]).model_dump( + mode="json" + ) + row[COL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) + return row + + return apply_validation_and_finalize_with_rule_guardrail + + +def _make_apply_validation_and_finalize_with_additive_rule_guardrail( + labels: list[str], +) -> Callable[[dict[str, Any]], dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_MERGED_ENTITIES, COL_VALIDATED_ENTITIES], + 
side_effect_columns=[COL_TAGGED_TEXT], + ) + def apply_validation_and_finalize_with_additive_rule_guardrail(row: dict[str, Any]) -> dict[str, Any]: + row = apply_validation_and_finalize(row) + text = str(row.get(COL_TEXT, "")) + final_spans = _entity_spans_from_payload(row.get(COL_DETECTED_ENTITIES, {})) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _add_non_overlapping_rule_spans(final_spans, rule_spans) + row[COL_DETECTED_ENTITIES] = EntitiesSchema(entities=[entity.as_dict() for entity in guarded]).model_dump( + mode="json" + ) + row[COL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) + return row + + return apply_validation_and_finalize_with_additive_rule_guardrail + + +def _make_apply_validation_to_seed_entities_with_rule_guardrail( + labels: list[str], +) -> Callable[[dict[str, Any]], dict[str, Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_SEED_ENTITIES, COL_VALIDATED_ENTITIES], + side_effect_columns=[COL_INITIAL_TAGGED_TEXT, COL_SEED_ENTITIES_JSON, COL_VALIDATED_SEED_ENTITIES], + ) + def apply_validation_to_seed_entities_with_rule_guardrail(row: dict[str, Any]) -> dict[str, Any]: + row = apply_validation_to_seed_entities(row) + text = str(row.get(COL_TEXT, "")) + validated_seed = _entity_spans_from_payload(row.get(COL_VALIDATED_SEED_ENTITIES, {})) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _merge_rule_guardrail_spans(validated_seed, rule_spans) + seed_entities = [entity.as_dict() for entity in guarded] + row[COL_VALIDATED_SEED_ENTITIES] = EntitiesSchema(entities=seed_entities).model_dump(mode="json") + row[COL_SEED_ENTITIES_JSON] = json.dumps(seed_entities) + row[COL_INITIAL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) + return row + + return apply_validation_to_seed_entities_with_rule_guardrail + + +def _make_apply_validation_to_seed_entities_with_additive_rule_guardrail( + labels: list[str], +) -> Callable[[dict[str, Any]], dict[str, 
Any]]: + @custom_column_generator( + required_columns=[COL_TEXT, COL_SEED_ENTITIES, COL_VALIDATED_ENTITIES], + side_effect_columns=[COL_INITIAL_TAGGED_TEXT, COL_SEED_ENTITIES_JSON, COL_VALIDATED_SEED_ENTITIES], + ) + def apply_validation_to_seed_entities_with_additive_rule_guardrail(row: dict[str, Any]) -> dict[str, Any]: + row = apply_validation_to_seed_entities(row) + text = str(row.get(COL_TEXT, "")) + validated_seed = _entity_spans_from_payload(row.get(COL_VALIDATED_SEED_ENTITIES, {})) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _add_non_overlapping_rule_spans(validated_seed, rule_spans) + seed_entities = [entity.as_dict() for entity in guarded] + row[COL_VALIDATED_SEED_ENTITIES] = EntitiesSchema(entities=seed_entities).model_dump(mode="json") + row[COL_SEED_ENTITIES_JSON] = json.dumps(seed_entities) + row[COL_INITIAL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) + return row + + return apply_validation_to_seed_entities_with_additive_rule_guardrail + + +def _is_rule_covered_detector_span(entity: EntitySpan, spans: list[EntitySpan]) -> bool: + return any( + entity.label == span.label + and span.start_position <= entity.start_position + and span.end_position >= entity.end_position + for span in spans + ) + + +def _add_non_overlapping_rule_spans( + existing_spans: list[EntitySpan], + rule_spans: list[EntitySpan], +) -> list[EntitySpan]: + additions = [rule for rule in rule_spans if not any(_spans_overlap(rule, existing) for existing in existing_spans)] + return resolve_overlaps([*existing_spans, *additions]) + + +def _spans_overlap(left: EntitySpan, right: EntitySpan) -> bool: + return max(left.start_position, right.start_position) < min(left.end_position, right.end_position) + + def _entity_spans_from_payload(raw_payload: object) -> list[EntitySpan]: return [ EntitySpan( @@ -682,7 +1266,7 @@ def _execute_native_single_pass_row( failed_record_count=0, runtime=runtime, ) - return 
_native_single_pass_result_row(row, spans=spans), None + return _native_single_pass_result_row(row, spans=spans, labels=labels), None def _complete_native_single_pass( @@ -834,13 +1418,15 @@ def _native_single_pass_span(*, value: str, label: str, start: int, end: int) -> ) -def _native_single_pass_result_row(row: pd.Series, *, spans: list[EntitySpan]) -> dict[str, Any]: +def _native_single_pass_result_row(row: pd.Series, *, spans: list[EntitySpan], labels: list[str]) -> dict[str, Any]: text = str(row.get(COL_TEXT, "")) + rule_spans = detect_high_confidence_entities(text, labels=labels) + guarded = _add_non_overlapping_rule_spans(spans, rule_spans) output_row = row.to_dict() - output_row[COL_DETECTED_ENTITIES] = EntitiesSchema(entities=[span.as_dict() for span in spans]).model_dump( + output_row[COL_DETECTED_ENTITIES] = EntitiesSchema(entities=[span.as_dict() for span in guarded]).model_dump( mode="json" ) - output_row[COL_TAGGED_TEXT] = build_tagged_text(text=text, entities=spans) + output_row[COL_TAGGED_TEXT] = build_tagged_text(text=text, entities=guarded) return output_row @@ -936,6 +1522,21 @@ def _require_native_endpoint(runtime: NativeDetectionRuntime) -> None: ) +def _make_native_rules_router_method( + *, + native_client: DirectDetectionClient | None, + native_runtime: NativeDetectionRuntime, +) -> _DetectAndValidate: + return _make_native_staged_method( + native_client=native_client, + gliner_seed_client=None, + native_runtime=native_runtime, + seed_source=SeedSource.rules_router, + workflow_name="entity-detection-native-rules-router", + skip_augmentation=False, + ) + + def _make_native_candidate_validate_no_augment_method( *, native_client: DirectDetectionClient | None, @@ -945,7 +1546,7 @@ def _make_native_candidate_validate_no_augment_method( native_client=native_client, gliner_seed_client=None, native_runtime=native_runtime, - seed_source=SeedSource.direct_llm, + seed_source=SeedSource.rules_plus_direct_llm, 
workflow_name="entity-detection-native-candidate-validate-no-augment", skip_augmentation=True, ) @@ -1656,7 +2257,7 @@ def _native_staged_request( data_summary: str | None, ) -> StagedDetectionRequest: return StagedDetectionRequest( - case_id=f"native-staged-{ordinal}", + case_id=f"native-rules-router-{ordinal}", text=str(row.get(COL_TEXT, "")), labels=labels, row_index=_safe_row_index(index, fallback=ordinal), @@ -1693,3 +2294,280 @@ def _native_output_dataframe( output[COL_DETECTED_ENTITIES] = pd.Series(dtype="object") output[COL_TAGGED_TEXT] = pd.Series(dtype="object") return output + + +def _detect_with_rules_only( + self: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int = dw._DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = dw._DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, + entity_labels: list[str] | None = None, + data_summary: str | None = None, + preview_num_records: int | None = None, +) -> dw.EntityDetectionResult: + return self.detect_with_high_confidence_rules(dataframe, entity_labels=entity_labels) + + +def _make_rules_covered_or_default_method(original: _DetectAndValidate) -> _DetectAndValidate: + def detect_and_validate_entities( + self: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int = dw._DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, + validation_excerpt_window_chars: int = dw._DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, + entity_labels: list[str] | None = None, + data_summary: str | None = None, + preview_num_records: int | None = None, + ) -> dw.EntityDetectionResult: + labels = 
dw._resolve_detection_labels(entity_labels) + if _labels_are_rules_only(labels): + return _detect_rules_covered_rows_or_default( + original, + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + labels=labels, + ) + return original( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + + return detect_and_validate_entities + + +def _detect_rules_covered_rows_or_default( + original: _DetectAndValidate, + self: dw.EntityDetectionWorkflow, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: DetectionModelSelection, + gliner_detection_threshold: float, + validation_max_entities_per_call: int, + validation_excerpt_window_chars: int, + validation_single_chunk_full_text: bool, + entity_labels: list[str] | None, + data_summary: str | None, + preview_num_records: int | None, + labels: list[str], +) -> dw.EntityDetectionResult: + started = time.perf_counter() + if dataframe.empty: + result = _detect_with_rules_only( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + 
validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + _record_rules_covered_route( + started=started, + total_row_count=0, + rule_row_count=0, + fallback_row_count=0, + result=result, + ) + return result + + coverage_mask = dataframe[COL_TEXT].apply( + lambda text: _structured_assignments_are_rule_covered(str(text), labels=labels) + ) + if bool(coverage_mask.all()): + result = _detect_with_rules_only( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + _record_rules_covered_route( + started=started, + total_row_count=len(dataframe), + rule_row_count=len(dataframe), + fallback_row_count=0, + result=result, + ) + return result + if not bool(coverage_mask.any()): + result = original( + self, + dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + _record_rules_covered_route( + started=started, + total_row_count=len(dataframe), + rule_row_count=0, + fallback_row_count=len(dataframe), + result=result, + ) + return result + + rule_rows, default_rows = split_rows( + dataframe, + column=COL_TEXT, 
+ predicate=lambda text: _structured_assignments_are_rule_covered(str(text), labels=labels), + ) + + rule_result = _detect_with_rules_only( + self, + rule_rows, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + default_result = original( + self, + default_rows, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, + entity_labels=entity_labels, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + result = dw.EntityDetectionResult( + dataframe=merge_and_reorder(rule_result.dataframe, default_result.dataframe), + failed_records=[*rule_result.failed_records, *default_result.failed_records], + ) + _record_rules_covered_route( + started=started, + total_row_count=len(dataframe), + rule_row_count=len(rule_rows), + fallback_row_count=len(default_rows), + result=result, + ) + return result + + +def _record_rules_covered_route( + *, + started: float, + total_row_count: int, + rule_row_count: int, + fallback_row_count: int, + result: dw.EntityDetectionResult, +) -> None: + record_model_workflow( + workflow_name="entity-detection-rules-covered-router", + model_aliases=[], + input_row_count=total_row_count, + output_row_count=len(result.dataframe), + failed_record_count=len(result.failed_records), + elapsed_sec=time.perf_counter() - started, + status="completed" if not result.failed_records else 
"partial", + extra_fields={ + "route_total_row_count": total_row_count, + "route_rule_row_count": rule_row_count, + "route_fallback_row_count": fallback_row_count, + }, + ) + + +def _structured_assignments_are_rule_covered(text: str, *, labels: list[str]) -> bool: + allowed_labels = set(labels) + rule_spans = detect_high_confidence_entities(text, labels=labels) + covered_ranges = [(span.start_position, span.end_position) for span in rule_spans] + for match in _STRUCTURED_ASSIGNMENT_RE.finditer(text): + label = _structured_assignment_label(match.group("key")) + if label not in allowed_labels: + continue + start, end = _structured_assignment_value_span(match) + if not _range_overlaps_any(start, end, covered_ranges): + return False + return True + + +def _structured_assignment_value_span(match: re.Match[str]) -> tuple[int, int]: + if match.group("quoted") is not None: + return match.span("quoted") + return match.span("bare") + + +def _range_overlaps_any(start: int, end: int, ranges: list[tuple[int, int]]) -> bool: + return any(start < range_end and end > range_start for range_start, range_end in ranges) + + +def _structured_assignment_label(key: str) -> str: + normalized = key.lower().replace("-", "_") + if normalized in {"api_key", "aws_access_key_id", "access_key_id", "hf_token", "token", "auth_token", "session_id"}: + return "api_key" + if normalized == "authorization": + return "api_key" + if normalized in {"password", "pass", "secret", "aws_secret_access_key", "django_secret"}: + return "password" + if normalized == "database_url": + return "url" + if normalized == "pin": + return "pin" + if normalized in {"user", "username", "user_name", "login", "account"}: + return "user_name" + if normalized == "cookie": + return "http_cookie" + if normalized in {"trace_id", "request_id", "req_id", "order_id", "tenant_id", "unique_id"}: + return "unique_id" + if normalized in {"url", "uri", "endpoint", "callback"}: + return "url" + if normalized == "email": + return "email" + 
return "" + + +def _labels_are_rules_only(labels: list[str]) -> bool: + return dw.labels_are_supported_by_structured_rule_fast_lane(labels) diff --git a/tools/measurement/extract_signature_deltas.py b/tools/measurement/extract_signature_deltas.py index 7732c147..66a8e40c 100644 --- a/tools/measurement/extract_signature_deltas.py +++ b/tools/measurement/extract_signature_deltas.py @@ -26,6 +26,7 @@ from pydantic import BaseModel, Field, ValidationError from anonymizer.engine.constants import COL_DETECTED_ENTITIES, COL_TEXT +from anonymizer.engine.detection.rules import detect_high_confidence_entities from anonymizer.engine.schemas import EntitiesSchema, EntitySchema app = cyclopts.App(help=__doc__) @@ -51,6 +52,7 @@ class DeltaSide(StrEnum): class ContextResolution(StrEnum): parquet = "parquet" artifact_details = "artifact_details" + rule = "rule" metadata_only = "metadata_only" @@ -302,6 +304,9 @@ def _resolve_signature_context( parquet_context = _parquet_entity_context(artifact_row, signature, artifact_root, context_window) if parquet_context is not None: return parquet_context + rule_context = _rule_entity_context(artifact_row, signature, label, artifact_root, context_window) + if rule_context is not None: + return rule_context detail_context = _artifact_detail_context(artifact_row, signature, label, artifact_root, context_window) return detail_context or {"resolution": ContextResolution.metadata_only} @@ -322,6 +327,24 @@ def _parquet_entity_context( return None +def _rule_entity_context( + artifact_row: dict[str, object], + signature: str, + label: str | None, + artifact_root: Path, + context_window: int, +) -> dict[str, object] | None: + record = _artifact_record(artifact_row, artifact_root) + if record is None or label is None: + return None + text, row_index, _row = record + for span in detect_high_confidence_entities(text, labels=[label]): + entity = EntitySchema.model_validate(span.as_dict()) + if _entity_signature_hash(entity, row_index=row_index) == 
signature: + return _entity_context(entity, text, signature, context_window, ContextResolution.rule) + return None + + def _artifact_detail_context( artifact_row: dict[str, object], signature: str, diff --git a/tools/measurement/run_benchmarks.py b/tools/measurement/run_benchmarks.py index e6a871de..77b24493 100755 --- a/tools/measurement/run_benchmarks.py +++ b/tools/measurement/run_benchmarks.py @@ -54,10 +54,15 @@ from anonymizer.config.replace_strategies import Annotate, Hash, Redact, Substitute from anonymizer.config.rewrite import DEFAULT_PRESERVE_TEXT, DEFAULT_PROTECT_TEXT, PrivacyGoal, RiskTolerance from anonymizer.engine.constants import COL_DETECTED_ENTITIES, COL_FINAL_ENTITIES +from anonymizer.engine.detection.rules import ( + STRUCTURED_RULE_FAST_LANE_LABELS, + SUPPORTED_RULE_LABELS, + detect_high_confidence_entities, +) from anonymizer.engine.io.constants import SUPPORTED_IO_FORMATS from anonymizer.engine.ndd.model_loader import parse_model_configs, validate_model_alias_references from anonymizer.engine.replace.structured_substitute import SUPPORTED_STRUCTURED_SUBSTITUTE_LABELS -from anonymizer.engine.schemas import EntitiesSchema +from anonymizer.engine.schemas import EntitiesSchema, EntitySchema from anonymizer.interface.anonymizer import Anonymizer from anonymizer.measurement import MeasurementConfig, configured_measurement_session @@ -165,6 +170,7 @@ class ConfigSpec(BaseModel): emit_telemetry: bool = False experimental_detection_strategy: ExperimentalDetectionStrategy = ExperimentalDetectionStrategy.default experimental_replacement_strategy: ExperimentalReplacementStrategy = ExperimentalReplacementStrategy.default + experimental_rule_labels: list[str] | None = None native_runtime: NativeRuntimeSpec | None = None @model_validator(mode="after") @@ -272,6 +278,11 @@ class _CaseExecution: _TRACE_FINAL_ARTIFACT_STRATEGIES = { + ExperimentalDetectionStrategy.rules_guardrail, + ExperimentalDetectionStrategy.rules_covered_or_default, + 
ExperimentalDetectionStrategy.rules_guardrail_compact_validation, + ExperimentalDetectionStrategy.rules_filter_guardrail, + ExperimentalDetectionStrategy.native_rules_router, ExperimentalDetectionStrategy.native_candidate_validate_no_augment, ExperimentalDetectionStrategy.detector_native_validate_no_augment, ExperimentalDetectionStrategy.detector_native_validate_native_augment, @@ -282,7 +293,18 @@ class _CaseExecution: ExperimentalDetectionStrategy.native_single_pass_values, ExperimentalDetectionStrategy.native_single_pass_values_recall, } +_RULE_BACKED_STRATEGIES = { + ExperimentalDetectionStrategy.rules_guardrail, + ExperimentalDetectionStrategy.rules_guardrail_compact_validation, + ExperimentalDetectionStrategy.rules_filter_guardrail, + ExperimentalDetectionStrategy.rules_seed_no_augment, + ExperimentalDetectionStrategy.rules_guardrail_no_augment, + ExperimentalDetectionStrategy.rules_filter_guardrail_no_augment, + ExperimentalDetectionStrategy.rules_guardrail_detector_only, + ExperimentalDetectionStrategy.rules_only, +} _NATIVE_RUNTIME_STRATEGIES = { + ExperimentalDetectionStrategy.native_rules_router, ExperimentalDetectionStrategy.native_candidate_validate_no_augment, ExperimentalDetectionStrategy.detector_native_validate_no_augment, ExperimentalDetectionStrategy.detector_native_validate_native_augment, @@ -428,6 +450,10 @@ def _preflight_config_errors(spec: BenchmarkSpec, *, parsed_models: Any | None) except Exception as exc: errors.append(f"config '{config.id}' invalid: {exc}") continue + try: + _preflight_experimental_detection_strategy(config, anonymizer_config) + except Exception as exc: + errors.append(f"config '{config.id}' experimental_detection_strategy invalid: {exc}") try: _preflight_native_runtime(config, spec=spec) except Exception as exc: @@ -457,6 +483,39 @@ def _active_config_ids(spec: BenchmarkSpec) -> set[str]: return {entry.config for entry in spec.matrix} +def _preflight_experimental_detection_strategy(config: ConfigSpec, 
anonymizer_config: AnonymizerConfig) -> None: + _preflight_experimental_rule_labels(config) + if config.experimental_detection_strategy != ExperimentalDetectionStrategy.rules_only: + return + entity_labels = anonymizer_config.detect.entity_labels + supported = ", ".join(sorted(SUPPORTED_RULE_LABELS)) + if entity_labels is None: + raise ValueError( + f"`rules_only` requires explicit detect.entity_labels limited to deterministic rule labels: {supported}" + ) + unsupported = sorted(set(entity_labels) - SUPPORTED_RULE_LABELS) + if unsupported: + raise ValueError( + f"unsupported high-confidence rule labels: {', '.join(unsupported)}; supported labels: {supported}" + ) + + +def _preflight_experimental_rule_labels(config: ConfigSpec) -> None: + if not config.experimental_rule_labels: + return + supported = ", ".join(sorted(SUPPORTED_RULE_LABELS)) + if config.experimental_detection_strategy not in _RULE_BACKED_STRATEGIES: + raise ValueError( + "experimental_rule_labels requires a rule-backed strategy: " + + ", ".join(sorted(strategy.value for strategy in _RULE_BACKED_STRATEGIES)) + ) + unsupported = sorted(set(config.experimental_rule_labels) - SUPPORTED_RULE_LABELS) + if unsupported: + raise ValueError( + f"unsupported experimental_rule_labels: {', '.join(unsupported)}; supported labels: {supported}" + ) + + def _preflight_native_runtime(config: ConfigSpec, *, spec: BenchmarkSpec) -> None: strategy = config.experimental_detection_strategy if strategy not in _NATIVE_RUNTIME_STRATEGIES: @@ -896,7 +955,12 @@ def _case_detection_artifact_path( ) if detection_artifact_path is not None or paths.artifact_snapshot is None: return detection_artifact_path - return None + return export_rules_only_case_detection_artifacts( + config, + execution.input_data, + paths.artifact_output_path, + case=case, + ) def _trace_final_artifact_path_if_requested( @@ -915,6 +979,8 @@ def _trace_final_artifact_path_if_requested( detection_artifact_path or output_path, trace_dataframe, case=case, + 
replace_existing=config.experimental_detection_strategy + == ExperimentalDetectionStrategy.rules_covered_or_default, ) @@ -923,12 +989,16 @@ def patch_case_detection_artifacts_from_trace_dataframe( trace_dataframe: pd.DataFrame, *, case: BenchmarkCase | None = None, + replace_existing: bool = False, ) -> Path | None: final_rows = _final_entity_artifact_rows_from_trace_dataframe(trace_dataframe) if not final_rows: return None - rows = _read_detection_artifact_payloads(output_path) if output_path.exists() else [] - patched = _merge_final_entity_artifact_rows(rows, final_rows) + if replace_existing: + patched = final_rows + else: + rows = _read_detection_artifact_payloads(output_path) if output_path.exists() else [] + patched = _merge_final_entity_artifact_rows(rows, final_rows) if case is not None: patched = [_with_case_metadata(row, case=case) for row in patched] write_detection_artifact_payloads(patched, output_path) @@ -1078,7 +1148,7 @@ def _execute_case( ) with configured_measurement_session(measurement): with dd_parser_compat_context(dd_parser_compat): - detection_context_kwargs: dict[str, Any] = {} + detection_context_kwargs: dict[str, Any] = {"rule_labels": config.experimental_rule_labels} if config.experimental_detection_strategy in _NATIVE_RUNTIME_STRATEGIES: detection_context_kwargs["native_runtime"] = _native_detection_runtime(spec, config) with experimental_detection_strategy_context( @@ -1320,6 +1390,64 @@ def export_case_detection_artifact_analysis( return output_path +def export_rules_only_case_detection_artifacts( + config: ConfigSpec, + input_data: AnonymizerInput, + output_path: Path, + *, + case: BenchmarkCase, +) -> Path | None: + if not _is_local_input_source(input_data.source): + return None + labels = build_anonymizer_config(config).detect.entity_labels + if not _uses_rules_only_artifact_export(config, labels): + return None + source = Path(input_data.source) + dataframe = _read_local_input_dataframe(source, 
suffix=infer_input_source_suffix(str(source))) + rows = [ + _with_case_metadata( + _rules_only_artifact_row( + text=record[input_data.text_column], + labels=labels, + row_index=int(row_index), + ), + case=case, + ) + for row_index, record in dataframe.iterrows() + ] + if not rows: + return None + write_detection_artifact_payloads(rows, output_path) + return output_path + + +def _uses_rules_only_artifact_export(config: ConfigSpec, labels: list[str] | None) -> bool: + if labels is None: + return False + if config.experimental_detection_strategy == ExperimentalDetectionStrategy.rules_only: + return True + if config.experimental_detection_strategy != ExperimentalDetectionStrategy.rules_covered_or_default: + return False + return set(labels).issubset(STRUCTURED_RULE_FAST_LANE_LABELS) + + +def _rules_only_artifact_row(*, text: object, labels: list[str], row_index: int) -> dict[str, Any]: + entities = [ + EntitySchema.model_validate(span.as_dict()) + for span in detect_high_confidence_entities(str(text), labels=labels) + ] + return build_detection_artifact_row_from_entities( + workflow_name="entity-detection-rules-only", + batch_file="synthetic-rules-only", + row_index=row_index, + seed_entities=entities, + seed_validation_candidate_count=len(entities), + merged_validation_candidate_count=len(entities), + augmented_entities=[], + final_entities=entities, + ).model_dump() + + def _with_case_metadata(row: dict[str, Any], *, case: BenchmarkCase) -> dict[str, Any]: return { "suite_id": case.suite_id, @@ -1362,6 +1490,7 @@ def _run_tags(case: BenchmarkCase, spec: BenchmarkSpec) -> dict[str, Any]: "case_id": case.case_id, "experimental_detection_strategy": config.experimental_detection_strategy.value, "experimental_replacement_strategy": config.experimental_replacement_strategy.value, + "experimental_rule_labels": config.experimental_rule_labels, "dd_parser_compat": spec.dd_parser_compat.value, } if config.experimental_detection_strategy in _NATIVE_RUNTIME_STRATEGIES: diff 
--git a/tools/measurement/screen_strategy_comparisons.py b/tools/measurement/screen_strategy_comparisons.py index 0cd9e141..59dc280e 100644 --- a/tools/measurement/screen_strategy_comparisons.py +++ b/tools/measurement/screen_strategy_comparisons.py @@ -662,6 +662,8 @@ def group_recommendation(group: ScreenGroup) -> str: return "reliability_review" if is_label_policy_review_group(group): return "label_policy_review" + if is_fast_lane_review_group(group): + return "fast_lane_review" if group.performance_verdict_counts.get("improved", 0) == group.review_count: return "review_only" if group.performance_verdict_counts.get("improved", 0) or group.performance_verdict_counts.get("mixed", 0): @@ -755,6 +757,36 @@ def is_label_policy_review_group(group: ScreenGroup) -> bool: return bool(group.label_mismatch_label_counts or group.flag_counts.get("covered_label_mismatch")) +_FAST_LANE_REVIEW_STRATEGIES = {"rules_only", "rules_covered_or_default"} +_FAST_LANE_REVIEW_FLAGS = { + "candidate_skips_llm_validation", + "candidate_uses_rule_entities", + "entity_count_loss", + "no_candidate_detector_entities", + "span_boundary_mismatch", +} + + +def is_fast_lane_review_group(group: ScreenGroup) -> bool: + if group.candidate_strategy not in _FAST_LANE_REVIEW_STRATEGIES: + return False + if group.review_count != group.row_count: + return False + if group.performance_verdict_counts.get("improved", 0) != group.review_count: + return False + leak_count = group.sum_candidate_original_value_leak_count + if leak_count is None or leak_count != 0: + return False + if ( + group.baseline_only_label_counts + or group.stable_lost_label_counts + or group.candidate_original_value_leak_label_counts + ): + return False + flags = set(group.flag_counts) + return bool(flags) and flags.issubset(_FAST_LANE_REVIEW_FLAGS) + + def group_performance_summary(group: ScreenGroup) -> str: if group.performance_verdict_counts: return label_summary(group.performance_verdict_counts) diff --git 
a/tools/measurement/staged_detection_probe.py b/tools/measurement/staged_detection_probe.py index ddb1a5d0..6da611da 100644 --- a/tools/measurement/staged_detection_probe.py +++ b/tools/measurement/staged_detection_probe.py @@ -78,13 +78,20 @@ TagNotation, apply_augmented_entities, build_tagged_text, + build_validation_candidates, get_tag_notation, parse_raw_entities, + resolve_overlaps, +) +from anonymizer.engine.detection.rules import ( + STRUCTURED_RULE_FAST_LANE_LABELS, + detect_high_confidence_entities, ) from anonymizer.engine.schemas import ( EntitiesSchema, EntitySchema, RawValidationDecisionsSchema, + ValidatedDecisionSchema, ValidatedDecisionsSchema, ValidationCandidatesSchema, ) @@ -107,6 +114,10 @@ class SeedSource(StrEnum): direct_llm = "direct_llm" gliner = "gliner" + rules = "rules" + rules_trusted = "rules_trusted" + rules_plus_direct_llm = "rules_plus_direct_llm" + rules_router = "rules_router" class ValidationPromptMode(StrEnum): @@ -140,6 +151,7 @@ class StagedExecutionConfig(BaseModel): max_tokens: int = Field(default=4096, gt=0) timeout_sec: float = Field(default=180.0, gt=0) skip_augmentation: bool = False + skip_augmentation_when_rule_covered: bool = False validation_prompt_mode: ValidationPromptMode = ValidationPromptMode.full_text validation_max_entities_per_call: int = Field(default=10, gt=0) validation_excerpt_window_chars: int = Field(default=160, gt=0) @@ -240,6 +252,7 @@ class StagedDetectionCase(BaseModel): total_usage: dict[str, int] = Field(default_factory=dict) model_phase_count: int = 0 model_request_count: int = 0 + rule_covered_label_set: bool = False seed_suggestion_count: int = 0 seed_entity_count: int = 0 validation_candidate_count: int = 0 @@ -302,6 +315,7 @@ def run_staged_detection_case( max_tokens: int = 4096, timeout_sec: float = 180.0, skip_augmentation: bool = False, + skip_augmentation_when_rule_covered: bool = False, validation_prompt_mode: ValidationPromptMode = ValidationPromptMode.full_text, 
validation_max_entities_per_call: int = 10, validation_excerpt_window_chars: int = 160, @@ -320,6 +334,7 @@ def run_staged_detection_case( max_tokens=max_tokens, timeout_sec=timeout_sec, skip_augmentation=skip_augmentation, + skip_augmentation_when_rule_covered=skip_augmentation_when_rule_covered, validation_prompt_mode=validation_prompt_mode, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, @@ -341,6 +356,7 @@ def execute_staged_detection_case( max_tokens: int = 4096, timeout_sec: float = 180.0, skip_augmentation: bool = False, + skip_augmentation_when_rule_covered: bool = False, validation_prompt_mode: ValidationPromptMode = ValidationPromptMode.full_text, validation_max_entities_per_call: int = 10, validation_excerpt_window_chars: int = 160, @@ -413,8 +429,14 @@ def _run_seed_phase( seed_client: GlinerSeedClient | None, config: StagedExecutionConfig, ) -> tuple[dict[str, Any], int, DirectCompletion]: + if _uses_rule_short_circuit(request, config): + return _run_rules_seed_phase(request) if config.seed_source == SeedSource.gliner: return _run_gliner_seed_phase(request, seed_client or HttpxGlinerSeedClient(), config) + if config.seed_source in {SeedSource.rules, SeedSource.rules_trusted}: + return _run_rules_seed_phase(request) + if config.seed_source in {SeedSource.rules_plus_direct_llm, SeedSource.rules_router}: + return _run_rules_plus_direct_llm_seed_phase(request, client, config) return _run_direct_llm_seed_phase(request, client, config) @@ -453,6 +475,25 @@ def _run_direct_llm_seed_phase( return row, len(seed_suggestions), completion +def _run_rules_plus_direct_llm_seed_phase( + request: StagedDetectionRequest, + client: DirectDetectionClient, + config: StagedExecutionConfig, +) -> tuple[dict[str, Any], int, DirectCompletion]: + completion = _complete(client, prompt=_seed_prompt(request), config=config) + direct_spans, seed_suggestions = _direct_seed_spans(request, 
completion.content) + rule_spans = detect_high_confidence_entities(request.text, labels=request.labels) + row = _seed_row_from_spans(request, resolve_overlaps([*rule_spans, *direct_spans])) + _limit_validation_candidates_to_sources(row, sources={"direct_seed"}) + return row, len(seed_suggestions) + len(rule_spans), completion + + +def _run_rules_seed_phase(request: StagedDetectionRequest) -> tuple[dict[str, Any], int, DirectCompletion]: + seed_spans = detect_high_confidence_entities(request.text, labels=request.labels) + completion = DirectCompletion(content="", elapsed_sec=0.0, usage={}) + return _seed_row_from_spans(request, seed_spans), len(seed_spans), completion + + def _complete( client: DirectDetectionClient, *, @@ -536,12 +577,23 @@ def _seed_entity_id(label: str, span: EntitySpan) -> str: return f"{label}_{span.start_position}_{span.end_position}" +def _limit_validation_candidates_to_sources(row: dict[str, Any], *, sources: set[str]) -> None: + text = str(row.get(COL_TEXT, "")) + seed_spans = [span for span in _seed_entity_spans(row) if span.source in sources] + row[COL_SEED_VALIDATION_CANDIDATES] = ValidationCandidatesSchema( + candidates=build_validation_candidates(text=text, entities=seed_spans) + ).model_dump(mode="json") + + def _run_validation_phase( row: dict[str, Any], request: StagedDetectionRequest, client: DirectDetectionClient, config: StagedExecutionConfig, ) -> DirectCompletion: + if config.seed_source == SeedSource.rules_trusted or _uses_rule_short_circuit(request, config): + _trust_seed_entities(row) + return DirectCompletion(content="", elapsed_sec=0.0, usage={}) candidates = ValidationCandidatesSchema.from_raw(row.get(COL_SEED_VALIDATION_CANDIDATES, {})) if not candidates.candidates: row[COL_VALIDATION_DECISIONS] = {"decisions": []} @@ -671,6 +723,23 @@ def _sum_completion_usage(completions: list[DirectCompletion]) -> dict[str, int] return dict(sorted(totals.items())) +def _trust_seed_entities(row: dict[str, Any]) -> None: + candidates = 
ValidationCandidatesSchema.from_raw(row.get(COL_SEED_VALIDATION_CANDIDATES, {})) + row[COL_VALIDATED_ENTITIES] = ValidatedDecisionsSchema( + decisions=[ + ValidatedDecisionSchema( + id=candidate.id, + decision="keep", + value=candidate.value, + label=candidate.label, + reason="trusted deterministic rule", + ) + for candidate in candidates.candidates + ] + ).model_dump(mode="json") + apply_validation_to_seed_entities(row) + + def _validation_prompt( request: StagedDetectionRequest, candidates: ValidationCandidatesSchema, @@ -730,7 +799,19 @@ def _run_augmentation_phase( def _should_skip_augmentation(request: StagedDetectionRequest, config: StagedExecutionConfig) -> bool: - return config.skip_augmentation + if config.skip_augmentation: + return True + if _uses_rule_short_circuit(request, config): + return True + if not config.skip_augmentation_when_rule_covered: + return False + if config.seed_source not in {SeedSource.rules, SeedSource.rules_trusted, SeedSource.rules_plus_direct_llm}: + return False + return set(request.labels).issubset(STRUCTURED_RULE_FAST_LANE_LABELS) + + +def _uses_rule_short_circuit(request: StagedDetectionRequest, config: StagedExecutionConfig) -> bool: + return config.seed_source == SeedSource.rules_router and _is_rule_covered_label_set(request) def _augmentation_prompt(request: StagedDetectionRequest, row: dict[str, Any]) -> str: @@ -817,6 +898,7 @@ def _completed_case( total_usage=_sum_usage(phase_usage), model_phase_count=_model_phase_count(phase_model_work), model_request_count=_model_request_count(phase_model_requests), + rule_covered_label_set=_is_rule_covered_label_set(request), seed_suggestion_count=seed_suggestion_count, seed_entity_count=artifact.seed_entity_count, validation_candidate_count=artifact.seed_validation_candidate_count, @@ -841,12 +923,28 @@ def _phase_model_work( def _uses_seed_model(request: StagedDetectionRequest, config: StagedExecutionConfig) -> bool: - return config.seed_source in {SeedSource.direct_llm, 
SeedSource.gliner} + if _uses_rule_short_circuit(request, config): + return False + return config.seed_source in { + SeedSource.direct_llm, + SeedSource.gliner, + SeedSource.rules_plus_direct_llm, + SeedSource.rules_router, + } def _uses_validation_model( request: StagedDetectionRequest, artifact: DetectionArtifactRow, config: StagedExecutionConfig ) -> bool: + if _uses_rule_short_circuit(request, config): + return False + if ( + config.seed_source in {SeedSource.rules_trusted, SeedSource.rules_router} + and artifact.seed_validation_candidate_count == 0 + ): + return False + if config.seed_source == SeedSource.rules_trusted: + return False return artifact.seed_validation_candidate_count > 0 @@ -861,12 +959,16 @@ def _phase_skip_reasons( def _seed_skip_reason(request: StagedDetectionRequest, config: StagedExecutionConfig) -> str | None: + if config.seed_source in {SeedSource.rules, SeedSource.rules_trusted} or _uses_rule_short_circuit(request, config): + return "deterministic_rules" return None def _validation_skip_reason( request: StagedDetectionRequest, artifact: DetectionArtifactRow, config: StagedExecutionConfig ) -> str | None: + if config.seed_source == SeedSource.rules_trusted or _uses_rule_short_circuit(request, config): + return "trusted_rules" if artifact.seed_validation_candidate_count == 0: return "no_seed_candidates" return None @@ -875,6 +977,8 @@ def _validation_skip_reason( def _augmentation_skip_reason(request: StagedDetectionRequest, config: StagedExecutionConfig) -> str | None: if config.skip_augmentation: return "disabled" + if _should_skip_augmentation(request, config): + return "rule_covered_labels" return None @@ -914,6 +1018,10 @@ def _model_request_count(phase_model_requests: PhaseModelRequests) -> int: return phase_model_requests.seed + phase_model_requests.validation + phase_model_requests.augmentation +def _is_rule_covered_label_set(request: StagedDetectionRequest) -> bool: + return 
set(request.labels).issubset(STRUCTURED_RULE_FAST_LANE_LABELS) + + def _extract_entity_suggestions(content: str) -> list[dict[str, str]]: payload = _load_embedded_json(content) raw_entities = payload.get("entities", []) if isinstance(payload, dict) else [] @@ -1221,6 +1329,7 @@ def run_probe( gliner_api_key_env: str = "NVIDIA_API_KEY", gliner_threshold: float = 0.3, skip_augmentation: bool = False, + skip_augmentation_when_rule_covered: bool = False, validation_prompt_mode: ValidationPromptMode = ValidationPromptMode.full_text, validation_max_entities_per_call: int = 10, validation_excerpt_window_chars: int = 160, @@ -1279,6 +1388,7 @@ def _run_probe_cases( gliner_api_key_env=config.gliner_api_key_env, gliner_threshold=config.gliner_threshold, skip_augmentation=config.skip_augmentation, + skip_augmentation_when_rule_covered=config.skip_augmentation_when_rule_covered, validation_prompt_mode=config.validation_prompt_mode, validation_max_entities_per_call=config.validation_max_entities_per_call, validation_excerpt_window_chars=config.validation_excerpt_window_chars, @@ -1357,6 +1467,9 @@ def main( gliner_api_key_env: Annotated[str, cyclopts.Parameter("--gliner-api-key-env")] = "NVIDIA_API_KEY", gliner_threshold: Annotated[float, cyclopts.Parameter("--gliner-threshold")] = 0.3, skip_augmentation: Annotated[bool, cyclopts.Parameter("--skip-augmentation")] = False, + skip_augmentation_when_rule_covered: Annotated[ + bool, cyclopts.Parameter("--skip-augmentation-when-rule-covered") + ] = False, validation_prompt_mode: Annotated[ ValidationPromptMode, cyclopts.Parameter("--validation-prompt-mode") ] = ValidationPromptMode.full_text,