From dd8deb0bfea2428eefe95278d28bd21bf77d934c Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 1 Jun 2026 13:07:08 +0100 Subject: [PATCH 1/7] feat(detection): tolerate small-model drift on detection schemas Loosen the LLM-facing detection wire schemas so small models (gemma4-e2b/e4b, nemotron-3-nano:4b, qwen3.5:4b) that drift from the strict contract no longer drop records during validation/augmentation/latent tagging: - RawValidationDecisionSchema / ValidationDecisionSchema: coerce free-form decision prose to keep/reclass/drop, preserve None as "no answer", drop value/label from the wire (re-filled downstream from trusted candidates). - LatentEntitySchema: loosen category/confidence/evidence/rationale with before-validators; truncate verbose rationale instead of failing. - LatentEntitiesSchema + detection_workflow: pad empty latent cells with a sentinel so PyArrow can write the parquet column. - AugmentedEntitySchema: drop value/label min_length; a single empty entry no longer fails the whole augmentation batch (empties are skipped downstream). Adds test_small_model_drift.py (detection drift classes) and updates test_chunked_validation for the slimmed wire shape. 
--- .../engine/detection/detection_workflow.py | 36 ++- src/anonymizer/engine/schemas/detection.py | 268 ++++++++++++++++-- tests/engine/test_chunked_validation.py | 15 +- tests/engine/test_small_model_drift.py | 121 ++++++++ 4 files changed, 416 insertions(+), 24 deletions(-) create mode 100644 tests/engine/test_small_model_drift.py diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index 87eb644b..e7227e5a 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -58,6 +58,7 @@ EntitiesByValueSchema, EntitiesSchema, LatentEntitiesSchema, + LatentEntitySchema, ) logger = logging.getLogger("anonymizer.detection") @@ -242,7 +243,10 @@ def identify_latent_entities( workflow_name="latent-entity-detection", preview_num_records=preview_num_records, ) - return EntityDetectionResult(dataframe=latent_result.dataframe, failed_records=latent_result.failed_records) + return EntityDetectionResult( + dataframe=_pad_empty_latent_column(latent_result.dataframe), + failed_records=latent_result.failed_records, + ) def run( self, @@ -679,3 +683,33 @@ def _format_privacy_goal(privacy_goal: PrivacyGoal | None) -> str: if privacy_goal is None: return "Not provided" return privacy_goal.to_prompt_string() + + +def _pad_empty_latent_column(df: pd.DataFrame) -> pd.DataFrame: + """Inject a sentinel into any empty ``_latent_entities`` cell. + + Downstream workflows write the DataFrame to parquet via DataDesigner, + which uses pyarrow. pyarrow raises ``Cannot write struct type with no + child field`` when every cell has ``latent_entities: []`` — it can't + infer the nested struct schema from only empty lists. + ``LatentEntitiesSchema._ensure_parquet_writable`` covers this when + pydantic validation runs, but DD does not always route through + ``model_validate`` (e.g. partial-failure fallback), so we pad again + at the DataFrame level. 
+ """ + if COL_LATENT_ENTITIES not in df.columns: + return df + sentinel = [LatentEntitySchema().model_dump()] + + def _fix(cell): + if isinstance(cell, dict): + if not cell.get("latent_entities"): + return {**cell, "latent_entities": sentinel} + return cell + if isinstance(cell, list) and not cell: + return sentinel + return cell + + df = df.copy() + df[COL_LATENT_ENTITIES] = df[COL_LATENT_ENTITIES].map(_fix) + return df diff --git a/src/anonymizer/engine/schemas/detection.py b/src/anonymizer/engine/schemas/detection.py index 6c6c3dea..b76ed521 100644 --- a/src/anonymizer/engine/schemas/detection.py +++ b/src/anonymizer/engine/schemas/detection.py @@ -5,7 +5,7 @@ from enum import Enum -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator, model_validator from anonymizer.engine.schemas.shared import _parse_raw_wrapper @@ -60,6 +60,68 @@ class RawValidationDecisionSchema(BaseModel): proposed_label: str = Field(default="") reason: str | None = None + @field_validator("decision", mode="before") + @classmethod + def _normalize_decision_preserve_none(cls, v: object) -> object: + """Tolerate small-model decision-field drift while preserving None. + + Renamed from ``_normalize_decision`` to make the None-preserving + semantics visible at every call site without requiring readers + to compare docstrings against ``ValidationDecisionSchema``'s + same-name (now removed) variant. + + Small models (gemma4-e4b on legal_court bench) emit free-form + prose in the decision slot — observed: ``"No specific entity + type for the date placeholder."`` and ``"No specific field + matched for this token."``. Without coercion pydantic rejects + the whole chunk's records. + + Distinct from ValidationDecisionSchema._normalize_decision in + ONE important way: this schema's chunked-validation merger + (``merge_chunk_decisions``) treats ``decision=None`` as "no + answer, skip" so a later chunk can supply a real verdict for + the same id. 
We must therefore preserve None-ness — only + normalize *strings*. + + Strategy: + * None / non-string / blank string -> ``None`` (preserves the + "no answer" semantics the merger relies on). + * Exact match ``keep``/``reclass``/``drop`` (case-insensitive) -> as-is. + * Substring match (``"Keep."``, ``"DROP!"``, ``"reclass entity"``) + -> the matched choice. Most-specific first so ``"reclass"`` + wins over ``"keep"`` when both substrings appear. + * Free-form prose with no recognizable choice -> ``"keep"`` + (conservative: preserve detection over silently dropping it). + """ + if v is None: + return None + if not isinstance(v, str) or not v.strip(): + return None + cleaned = v.strip().lower() + if cleaned in {"keep", "reclass", "drop"}: + return cleaned + for choice in ("reclass", "drop", "keep"): + if choice in cleaned: + return choice + return "keep" + + @field_validator("proposed_label", mode="before") + @classmethod + def _coerce_proposed_label(cls, v: object) -> object: + """Mirror ValidationDecisionSchema._coerce_proposed_label. + + Small models (gemma4-e4b on the chunked-validation path) emit + ``proposed_label: null`` when the decision is "keep" — pydantic + otherwise rejects None for the str-typed field, dropping the + whole record. The validator chunk schema needs the same loose + coercion the wire ValidationDecisionSchema already has. + """ + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + class RawValidationDecisionsSchema(BaseModel): decisions: list[RawValidationDecisionSchema] = Field(default_factory=list) @@ -100,26 +162,87 @@ def from_raw(cls, raw: object) -> ValidationSkeletonSchema: class ValidationDecisionSchema(BaseModel): - """Per-entity validation decision from the LLM validator.""" + """Loose wire-contract for per-entity validation decisions from the LLM. 
+ + The strict internal shape has only the three fields the server actually + consumes: `id`, `decision`, `proposed_label` (+ optional `reason`). The + previous schema also carried `value` and `label`, but those are + overridden from the trusted candidate_lookup in + enrich_validation_decisions — they were pure drift surface. + + Wire-layer looseness addresses classes M/N/O from the bench: + * `decision` is typed `str` (not ValidationChoice) so DD's + jsonschema pre-check cannot reject enum drift; before-validator + normalizes to a valid enum. Default "keep" means the field is + NOT in `required`, so omission does not drop the record. + * `proposed_label` is `str | None` so the emitted JSON Schema is + `anyOf: [string, null]` — explicit `null` emissions no longer + fail `type: "string"` at the pre-check. + * `value`/`label` removed entirely — any int/null drift on those + fields is now impossible because they're not in the schema. + """ id: str - value: str = Field(default="", description="Entity value (echoed from skeleton)") - label: str = Field(default="", description="Entity label (echoed from skeleton)") - decision: ValidationChoice - proposed_label: str = Field( + decision: str = Field( + default="keep", + description='one of: "keep" | "reclass" | "drop"', + ) + proposed_label: str | None = Field( default="", description="Correct label when decision is 'reclass', otherwise empty", ) reason: str | None = None + @field_validator("proposed_label", mode="before") + @classmethod + def _coerce_proposed_label(cls, v: object) -> object: + """Coerce None / non-string to empty string so the strict downstream + shape is always a string (RawValidationDecisionSchema expects str).""" + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + + @field_validator("decision", mode="before") + @classmethod + def _normalize_decision(cls, v: object) -> str: + """Coerce drift into a valid ValidationChoice value. 
+ + None / non-string / unknown strings default to 'keep' — the + conservative choice that preserves detection. A substring match + catches small-model variants like 'Keep.' or 'DROP!'. + + Note: a related normalizer + ``RawValidationDecisionSchema._normalize_decision_preserve_none`` + exists on the chunked-validation path. It deliberately preserves + None — see that docstring for why. The two diverge because this + schema's field is ``decision: str`` (default "keep"), while the + chunked path uses ``decision: ValidationChoice | None`` to signal + "no answer" to ``merge_chunk_decisions``. The chunked variant is + explicitly named so the divergence is visible at every call site. + """ + if v is None or not isinstance(v, str) or not v.strip(): + return "keep" + cleaned = v.strip().lower() + if cleaned in {"keep", "reclass", "drop"}: + return cleaned + for choice in ("reclass", "drop", "keep"): # check most-specific first + if choice in cleaned: + return choice + return "keep" + class ValidationDecisionsSchema(BaseModel): decisions: list[ValidationDecisionSchema] = Field(default_factory=list) class AugmentedEntitySchema(BaseModel): - value: str = Field(min_length=1) - label: str = Field(min_length=1) + # No min_length: a single empty value/label must not fail-validate the whole + # augmentation batch on a small model. apply_augmented_entities() skips + # blank value/label downstream, so empties are dropped without losing the row. + value: str = "" + label: str = "" reason: str | None = None @@ -137,34 +260,141 @@ class LatentConfidence(str, Enum): class LatentEntitySchema(BaseModel): - category: LatentCategory + """Single latent (inferred) entity from the latent_detector role. + + Loose wire contract across every field so small-model drift does not + drop records at DataDesigner's jsonschema pre-check: + * category / confidence: typed str (not LatentCategory / LatentConfidence); + a before-validator normalizes case and substring-matches drift. 
+ * label / value: permissive default "", coerce None/int. + * evidence: any list shape accepted at wire; before-validator + clamps to [0, 2] non-empty strings. + * rationale: wire has no min_length; before-validator truncates if + longer than 150 chars (ending in "..."); no lower bound is enforced. + """ + + category: str = Field( + default="", + description=("one of: latent_identifier | latent_sensitive_attribute (see LatentCategory enum)"), + ) label: str = Field( - min_length=1, + default="", description=( "General category/class of the inference in snake_case " - "(e.g., employer, specific_institution, home_location, medication, health_condition)" + "(e.g., employer, specific_institution, home_location)" ), ) value: str = Field( - min_length=1, - description="Concise inferred value (generalize if not pinned down strongly by evidence)", + default="", + description="Concise inferred value", + ) + confidence: str = Field( + default="medium", + description="one of: high | medium (see LatentConfidence enum)", ) - confidence: LatentConfidence evidence: list[str] = Field( - min_length=1, - max_length=2, - description="One or two short quotes from the text that support this inference", + default_factory=list, + description="Up to 2 short quotes supporting this inference", ) rationale: str = Field( - min_length=20, - max_length=150, + default="", description="One sentence explaining the inference without adding new facts", ) + @field_validator("category", mode="before") + @classmethod + def _normalize_category(cls, v: object) -> str: + if v is None or not isinstance(v, str) or not v.strip(): + return "latent_identifier" + cleaned = v.strip().lower().replace(" ", "_").replace("-", "_") + allowed = {c.value for c in LatentCategory} + if cleaned in allowed: + return cleaned + if "sensitive" in cleaned: + return LatentCategory.latent_sensitive_attribute.value + return LatentCategory.latent_identifier.value + + @field_validator("confidence", mode="before") + @classmethod + def
_normalize_confidence(cls, v: object) -> str: + if v is None: + return "medium" + # Numeric -> band + if isinstance(v, (int, float)): + f = float(v) + if f >= 0.66: + return "high" + return "medium" + if not isinstance(v, str) or not v.strip(): + return "medium" + cleaned = v.strip().lower() + if cleaned in {"high", "medium"}: + return cleaned + if cleaned in {"h", "hi"}: + return "high" + return "medium" + + @field_validator("label", "value", mode="before") + @classmethod + def _coerce_to_str(cls, v: object) -> str: + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + + @field_validator("evidence", mode="before") + @classmethod + def _clamp_evidence(cls, v: object) -> list[str]: + """Accept any list shape; keep at most 2 non-empty string quotes.""" + if not isinstance(v, list): + return [] + out: list[str] = [] + for item in v: + if item is None: + continue + if not isinstance(item, str): + item = str(item) + item = item.strip() + if item: + out.append(item) + if len(out) >= 2: + break + return out + + @field_validator("rationale", mode="before") + @classmethod + def _cap_rationale(cls, v: object) -> str: + """Truncate verbose rationales (Nemotron-observed 260 chars > 150 max). + Server reads rationale for context only; no lower bound enforced. + """ + if v is None: + return "" + if isinstance(v, (int, float, bool)): + v = str(v) + if not isinstance(v, str): + return "" + s = v.strip() + if len(s) > 150: + return s[:147].rstrip() + "..." + return s + class LatentEntitiesSchema(BaseModel): latent_entities: list[LatentEntitySchema] = Field(default_factory=list) + @model_validator(mode="after") + def _ensure_parquet_writable(self) -> "LatentEntitiesSchema": + """DataDesigner writes this column to parquet; an empty list of + structs makes pyarrow fail with 'Cannot write struct type with no + child field' because it can't infer the nested schema. Inject a + single all-defaults sentinel when empty — downstream code (e.g. 
+ reconstruct_full_disposition) already filters entries with empty + label/value so the sentinel is invisible semantically.""" + if not self.latent_entities: + self.latent_entities = [LatentEntitySchema()] + return self + class EntityByValueSchema(BaseModel): value: str = Field(default="") diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index f9b402a3..7099f227 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -285,8 +285,14 @@ def test_filters_unknown_ids_and_deduplicates(self) -> None: by_id = {d["id"]: d for d in merged["decisions"]} assert by_id["a"]["decision"] == "keep" assert by_id["b"]["decision"] == "drop" - assert by_id["a"]["value"] == "Alice" # enriched from candidate - assert by_id["a"]["label"] == "first_name" + # ``value`` and ``label`` were dropped from the wire-layer + # ``ValidationDecisionSchema`` to remove a small-model drift surface; + # ``ValidationDecisionsSchema.model_validate`` in + # ``merge_chunk_decisions`` strips them. They are re-filled from + # ``candidate_lookup`` by ``enrich_validation_decisions`` further + # downstream — covered by the end-to-end test below. + assert "value" not in by_id["a"] + assert "label" not in by_id["a"] def test_drops_decisions_without_verdict(self) -> None: """A decision with ``decision=None`` is equivalent to 'no answer' and must not leak through. @@ -311,11 +317,12 @@ def test_later_real_verdict_wins_over_earlier_null_duplicate(self) -> None: chunk_one = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": None}]}) chunk_two = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": "keep"}]}) merged = merge_chunk_decisions([chunk_one, chunk_two], candidates) + # value/label are stripped at merge time (post-#130 wire-loose + # ValidationDecisionSchema drops them) and re-filled later by + # enrich_validation_decisions from candidate_lookup. 
assert merged["decisions"] == [ { "id": "a", - "value": "Alice", - "label": "first_name", "decision": "keep", "proposed_label": "", "reason": None, diff --git a/tests/engine/test_small_model_drift.py b/tests/engine/test_small_model_drift.py new file mode 100644 index 00000000..e4e95bde --- /dev/null +++ b/tests/engine/test_small_model_drift.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests covering small-model output drift on detection schemas. + +These regressions cover the drift modes observed during small-model +benchmarks (gemma4-e2b, gemma4-e4b, nemotron-3-nano:4b, qwen3.5:4b on legal +court / medical visit / employee notes datasets). Each test pins one drift +class so a future schema change that re-tightens the wire contract surfaces +here rather than silently dropping records on small-model runs. +""" + +from __future__ import annotations + +from anonymizer.engine.schemas.detection import ( + LatentEntitySchema, + RawValidationDecisionSchema, + ValidationDecisionSchema, +) + +# --------------------------------------------------------------------------- +# RawValidationDecisionSchema — chunked-validation drift +# --------------------------------------------------------------------------- + + +class TestRawValidationDecisionDrift: + def test_freeform_prose_decision_coerces_to_keep(self) -> None: + """gemma4-e4b on legal_court emits prose like "No specific entity + type for the date placeholder." in the decision slot. 
Conservative + coercion to ``keep`` so the detection survives downstream.""" + result = RawValidationDecisionSchema.model_validate( + {"id": "x", "decision": "No specific entity type for the date placeholder."} + ) + assert result.decision is not None + assert result.decision.value == "keep" + + def test_explicit_drop_substring_wins(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "DROP."}) + assert result.decision.value == "drop" + + def test_reclass_substring_wins_over_keep(self) -> None: + """``"reclass entity (was previously kept)"`` — the more-specific + choice should win even when both substrings are present.""" + result = RawValidationDecisionSchema.model_validate( + {"id": "x", "decision": "reclass entity (was previously kept)"} + ) + assert result.decision.value == "reclass" + + def test_none_decision_preserved(self) -> None: + """``decision=None`` is "no answer" in the chunked merger; must NOT + be coerced into a verdict or downstream merge logic breaks.""" + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": None}) + assert result.decision is None + + def test_blank_string_decision_treated_as_none(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": " "}) + assert result.decision is None + + def test_proposed_label_none_coerces_to_empty(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": None}) + assert result.proposed_label == "" + + def test_int_proposed_label_coerces_to_str(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": 42}) + assert result.proposed_label == "42" + + +# --------------------------------------------------------------------------- +# ValidationDecisionSchema — wire schema drops value/label +# --------------------------------------------------------------------------- + + +class 
TestValidationDecisionWireShape: + def test_value_label_stripped_from_wire(self) -> None: + """The wire-loose schema dropped value and label as drift surface; + downstream ``enrich_validation_decisions`` re-fills them from the + trusted ``candidate_lookup``.""" + result = ValidationDecisionSchema.model_validate( + {"id": "x", "decision": "keep", "value": "Alice", "label": "first_name"} + ) + dumped = result.model_dump() + assert "value" not in dumped + assert "label" not in dumped + + def test_decision_freeform_coerces_to_keep(self) -> None: + # ValidationDecisionSchema.decision is ``str`` (not the ValidationChoice + # enum) — the wire-loose contract — so the result is a plain string. + result = ValidationDecisionSchema.model_validate({"id": "x", "decision": "free-form prose"}) + assert result.decision == "keep" + + def test_proposed_label_none_coerces_to_empty(self) -> None: + result = ValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": None}) + assert result.proposed_label == "" + + +# --------------------------------------------------------------------------- +# LatentEntitySchema — defaults + rationale clamp +# --------------------------------------------------------------------------- + + +class TestLatentEntityDrift: + def test_overlong_rationale_truncates(self) -> None: + """Some models emit 200+ char rationales; clamp to 147 + ``"..."`` + to fit the 150-char schema cap rather than dropping the row.""" + long = "A" * 250 + result = LatentEntitySchema.model_validate({"label": "occupation", "value": "doctor", "rationale": long}) + assert len(result.rationale) <= 150 + assert result.rationale.endswith("...") + + def test_empty_required_fields_default_to_empty_string(self) -> None: + """Pre-loosening these were required ``min_length=1`` and would drop + the row; loose wire allows empty so the parquet-pad sentinel + path can build a placeholder row when needed.""" + result = LatentEntitySchema() + assert result.label == "" + 
assert result.value == "" + + def test_invalid_confidence_coerces_to_medium(self) -> None: + result = LatentEntitySchema.model_validate( + {"label": "x", "value": "y", "confidence": "very-high", "rationale": "ok"} + ) + assert result.confidence == "medium" From c01f35fdb330a25b5a6058f9c2d131eff04b2d92 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 1 Jun 2026 13:15:50 +0100 Subject: [PATCH 2/7] feat(replace): drop min_length on replacement wire schema EntityReplacementSchema is the LLM output_format for Substitute mode, so a single drifted entry (small models occasionally emit an empty synthetic, or a blank original/label) used to fail-validate the whole replacement map and drop the record. Drop min_length on original/label/synthetic: - empty original/label cannot match a requested entity and are filtered out by _filter_replacement_map_to_input_entities (valid siblings are preserved); - an empty synthetic keys on (original, label), survives the filter, and is applied as a deletion at rewrite time -- privacy-safe (no PII leak), even if utility-poor. Adds regression tests for both drift modes. --- src/anonymizer/engine/schemas/replace.py | 13 ++++++-- tests/engine/test_llm_replace_workflow.py | 39 +++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/engine/schemas/replace.py b/src/anonymizer/engine/schemas/replace.py index 86a890df..56490641 100644 --- a/src/anonymizer/engine/schemas/replace.py +++ b/src/anonymizer/engine/schemas/replace.py @@ -7,9 +7,16 @@ class EntityReplacementSchema(BaseModel): - original: str = Field(min_length=1, description="The original entity value") - label: str = Field(min_length=1, description="The entity label/type") - synthetic: str = Field(min_length=1, description="The synthetic replacement value") + # No min_length: this is the LLM output schema for Substitute mode, so a + # single drifted entry (e.g. 
a small model emitting an empty synthetic) + # must not fail-validate the whole replacement map and drop the record. + # Empty original/label are filtered downstream (they cannot match a + # requested entity in _filter_replacement_map_to_input_entities); an empty + # synthetic results in the entity being removed at apply time, which is + # privacy-safe (no PII leak) even if utility-poor. + original: str = Field(default="", description="The original entity value") + label: str = Field(default="", description="The entity label/type") + synthetic: str = Field(default="", description="The synthetic replacement value") class EntityReplacementMapSchema(BaseModel): diff --git a/tests/engine/test_llm_replace_workflow.py b/tests/engine/test_llm_replace_workflow.py index f7abbc44..a794f62c 100644 --- a/tests/engine/test_llm_replace_workflow.py +++ b/tests/engine/test_llm_replace_workflow.py @@ -356,3 +356,42 @@ def test_filter_replacement_map_empty_warning_does_not_leak_pii( assert "first_name" in caplog.text _assert_no_pii_in_logs(caplog, extra_secrets=("Acme Corp", "NovaCorp")) assert result == {"replacements": []} + + +# --------------------------------------------------------------------------- +# Small-model drift: a single bad entry must not fail the whole map / drop the +# record. EntityReplacementSchema dropped min_length on original/label/synthetic. +# --------------------------------------------------------------------------- + + +def test_empty_synthetic_does_not_fail_validation_and_survives_filter() -> None: + """A blank synthetic used to fail ``min_length=1`` and drop the whole map. 
+ Now the entry validates; it keys on (original, label) so it survives the + requested-entity filter and is applied as a deletion (privacy-safe).""" + parsed_entities = EntitiesByValueSchema.model_validate( + {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]} + ) + raw_map = {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": ""}]} + + result = _filter_replacement_map_to_input_entities(raw_map=raw_map, parsed_entities=parsed_entities) + + assert result == {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": ""}]} + + +def test_empty_original_or_label_entry_is_filtered_without_dropping_record() -> None: + """Empty original/label cannot match a requested entity, so the drifted + entry is dropped while the valid sibling entry is preserved (the whole map + no longer fails validation on the empty fields).""" + parsed_entities = EntitiesByValueSchema.model_validate( + {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]} + ) + raw_map = { + "replacements": [ + {"original": "", "label": "", "synthetic": "junk"}, + {"original": "Alice", "label": "first_name", "synthetic": "Maya"}, + ] + } + + result = _filter_replacement_map_to_input_entities(raw_map=raw_map, parsed_entities=parsed_entities) + + assert result == {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": "Maya"}]} From 926bf8c8588ff17f1d1375be7d476840553efa77 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 1 Jun 2026 13:22:07 +0100 Subject: [PATCH 3/7] fix(detection): drop dead latent_sensitive_attribute reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _normalize_category referenced LatentCategory.latent_sensitive_attribute, which is not a member of the enum (it only defines latent_identifier; sensitive attributes were folded into quasi_identifier on the rewrite side). 
Any latent category string containing "sensitive" — exactly the kind of drift small models emit — would raise AttributeError at runtime. Non-canonical category drift now normalizes to latent_identifier, and the field description no longer advertises the nonexistent value. Also clarifies _pad_empty_latent_column's shape-preservation intent (the downstream reader tolerates both dict and bare-list cells, so no mixing occurs). Adds regression tests for the sensitive/unknown category drift paths. --- .../engine/detection/detection_workflow.py | 5 +++++ src/anonymizer/engine/schemas/detection.py | 7 ++++--- tests/engine/test_small_model_drift.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index e7227e5a..bc64fad6 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -702,6 +702,11 @@ def _pad_empty_latent_column(df: pd.DataFrame) -> pd.DataFrame: sentinel = [LatentEntitySchema().model_dump()] def _fix(cell): + # Preserve each cell's existing shape: the column is uniformly the + # struct/dict shape ({"latent_entities": [...]}) from LatentEntitiesSchema + # in the normal DD path (dict branch), but tolerate a bare-list cell from + # alternate paths. The downstream reader (_coerce_entity_list) accepts + # either shape, so we never mix dict and list within one column. 
if isinstance(cell, dict): if not cell.get("latent_entities"): return {**cell, "latent_entities": sentinel} diff --git a/src/anonymizer/engine/schemas/detection.py b/src/anonymizer/engine/schemas/detection.py index b76ed521..8cba4c1b 100644 --- a/src/anonymizer/engine/schemas/detection.py +++ b/src/anonymizer/engine/schemas/detection.py @@ -275,7 +275,7 @@ class LatentEntitySchema(BaseModel): category: str = Field( default="", - description=("one of: latent_identifier | latent_sensitive_attribute (see LatentCategory enum)"), + description="Must be: latent_identifier", ) label: str = Field( default="", @@ -310,8 +310,9 @@ def _normalize_category(cls, v: object) -> str: allowed = {c.value for c in LatentCategory} if cleaned in allowed: return cleaned - if "sensitive" in cleaned: - return LatentCategory.latent_sensitive_attribute.value + # LatentCategory has a single member (latent_identifier); sensitive + # attributes were folded into quasi_identifier on the rewrite side, so + # any non-canonical drift normalizes to the lone latent category. 
return LatentCategory.latent_identifier.value @field_validator("confidence", mode="before") diff --git a/tests/engine/test_small_model_drift.py b/tests/engine/test_small_model_drift.py index e4e95bde..c59ef66b 100644 --- a/tests/engine/test_small_model_drift.py +++ b/tests/engine/test_small_model_drift.py @@ -119,3 +119,16 @@ def test_invalid_confidence_coerces_to_medium(self) -> None: {"label": "x", "value": "y", "confidence": "very-high", "rationale": "ok"} ) assert result.confidence == "medium" + + def test_sensitive_category_drift_normalizes_to_latent_identifier(self) -> None: + """A category string containing "sensitive" must normalize to the lone + LatentCategory member rather than raising AttributeError (sensitive + attributes were folded into quasi_identifier on the rewrite side).""" + result = LatentEntitySchema.model_validate( + {"label": "x", "value": "y", "category": "latent_sensitive_attribute"} + ) + assert result.category == "latent_identifier" + + def test_unknown_category_drift_normalizes_to_latent_identifier(self) -> None: + result = LatentEntitySchema.model_validate({"label": "x", "value": "y", "category": "some-novel-bucket"}) + assert result.category == "latent_identifier" From b287340aa506f0a793e9f8c18bd8d2c273dfd4ce Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 1 Jun 2026 13:26:40 +0100 Subject: [PATCH 4/7] fix(detection): pad None/NaN latent cells for parquet-writability _pad_empty_latent_column previously passed None/NaN cells through unchanged. A row absent from a partial-failure fallback merge (reintroduced as NaN by a pandas reindex) could leave the latent column without an inferable struct schema. None/NaN now normalize to the canonical sentinel struct, consistent with the existing empty-cell handling. Adds TestPadEmptyLatentColumn covering empty-struct, populated, bare-list, None/NaN, and missing-column cases. 
--- .../engine/detection/detection_workflow.py | 5 +++ tests/engine/test_detection_workflow.py | 35 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index bc64fad6..a4da9482 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -707,6 +707,11 @@ def _fix(cell): # in the normal DD path (dict branch), but tolerate a bare-list cell from # alternate paths. The downstream reader (_coerce_entity_list) accepts # either shape, so we never mix dict and list within one column. + # None / NaN (e.g. a row absent from a partial-failure fallback merge, + # reintroduced by a pandas reindex) normalizes to the canonical struct + # so the column stays parquet-writable. + if cell is None or (not isinstance(cell, (dict, list)) and pd.isna(cell)): + return {"latent_entities": sentinel} if isinstance(cell, dict): if not cell.get("latent_entities"): return {**cell, "latent_entities": sentinel} diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 3f45f1b1..3fc5cd79 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -32,6 +32,7 @@ _get_augment_prompt, _get_latent_prompt, _get_validation_prompt, + _pad_empty_latent_column, _resolve_detection_labels, ) from anonymizer.engine.ndd.adapter import FailedRecord, WorkflowRunResult @@ -618,3 +619,37 @@ def test_pool_size_one_does_not_emit_warning( if r.name == "anonymizer.detection" and "pool of" in r.getMessage() and "aliases" in r.getMessage() ] assert pool_warnings == [] + + +class TestPadEmptyLatentColumn: + """``_pad_empty_latent_column`` keeps the latent column parquet-writable by + replacing empty/missing cells with a sentinel struct (PyArrow cannot infer a + struct schema from a column of only empty lists).""" + + def 
test_empty_struct_cell_gets_sentinel(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [{"latent_entities": []}]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0]["latent_entities"], "empty struct should be padded" + + def test_populated_struct_cell_untouched(self) -> None: + populated = {"latent_entities": [{"label": "employer", "value": "Acme"}]} + df = pd.DataFrame({COL_LATENT_ENTITIES: [populated]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0] == populated + + def test_empty_bare_list_cell_gets_sentinel(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [[]]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0], "empty bare list should be padded" + + def test_none_and_nan_cells_normalize_to_sentinel_struct(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [None, float("nan")]}) + out = _pad_empty_latent_column(df) + for i in range(2): + cell = out[COL_LATENT_ENTITIES].iloc[i] + assert isinstance(cell, dict) and cell["latent_entities"], f"row {i} should be a padded struct" + + def test_missing_column_is_a_noop(self) -> None: + df = pd.DataFrame({"other": [1, 2]}) + out = _pad_empty_latent_column(df) + assert list(out.columns) == ["other"] From 81ecba0739344019707236de727aea83e58289a5 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Mon, 1 Jun 2026 14:35:33 +0100 Subject: [PATCH 5/7] docs(detection): clarify proposed_label str|None wire annotation intent Greptile P2: the str | None annotation on ValidationDecisionSchema.proposed_label is intentional for jsonschema null-tolerance; _coerce_proposed_label always normalizes None to "" so the runtime value is a str. Document inline so readers do not add unnecessary None guards. 
--- src/anonymizer/engine/schemas/detection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/anonymizer/engine/schemas/detection.py b/src/anonymizer/engine/schemas/detection.py index 8cba4c1b..422530ce 100644 --- a/src/anonymizer/engine/schemas/detection.py +++ b/src/anonymizer/engine/schemas/detection.py @@ -187,6 +187,10 @@ class ValidationDecisionSchema(BaseModel): default="keep", description='one of: "keep" | "reclass" | "drop"', ) + # Annotated str | None (not str) so DataDesigner's jsonschema pre-check + # accepts a literal null from a drifted model; _coerce_proposed_label below + # always normalizes None -> "" so the runtime value is in practice always a + # str. Downstream code does not need to guard against None here. proposed_label: str | None = Field( default="", description="Correct label when decision is 'reclass', otherwise empty", From f6c2135e33a920404e3a790d021d1b8410597ee1 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 2 Jun 2026 01:13:32 +0100 Subject: [PATCH 6/7] fix(replace): render label hints as "such as" so small models emit real values Small models echoed the "(e.g. X, Y, Z)" wrapper verbatim as the synthetic value (e.g. an age replaced with the literal string "(e.g. 52, 38, 45, 31)"), and "one of: ..." made them copy a canned example value instead. Render the per-label hints as "such as {examples}" and instruct the model to generate a NEW value of that kind without reusing the examples or copying the reference text. Validated live on gemma-3n-e2b: type_fidelity and attribute_fidelity both clean (4/4), no record drops. 
--- src/anonymizer/engine/replace/llm_replace_workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anonymizer/engine/replace/llm_replace_workflow.py b/src/anonymizer/engine/replace/llm_replace_workflow.py index ccd5cb1d..966a12ed 100644 --- a/src/anonymizer/engine/replace/llm_replace_workflow.py +++ b/src/anonymizer/engine/replace/llm_replace_workflow.py @@ -229,7 +229,7 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str | - "{{ entity.value }}" ({{ entity.labels_str }}) {%- endfor %} -Examples: {{ <> }} +Per-label type references (generate a NEW realistic value of that kind; do not reuse these example values, and never copy this reference text literally): {{ <> }} Rules: 1. Related entities must stay consistent: @@ -275,5 +275,5 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str | _EXAMPLE_LOOKUP: dict[str, str] = { - label: f"(e.g. {', '.join(examples)})" for label, examples in ENTITY_LABEL_EXAMPLES.items() + label: f"such as {', '.join(examples)}" for label, examples in ENTITY_LABEL_EXAMPLES.items() } From c15896cdb69b2ffd17a317792fa4ee2591c0a9eb Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Tue, 2 Jun 2026 01:13:40 +0100 Subject: [PATCH 7/7] fix(detection): wrap bare-string latent evidence instead of dropping it _clamp_evidence returned [] for a bare-string evidence value, silently dropping the quote when a small model emits a single evidence string instead of a one-element list. Treat a bare string as a single-item list, consistent with the drift-tolerance strategy. Addresses Greptile review note on #174. Adds test_bare_string_evidence_wrapped_not_dropped. 
--- src/anonymizer/engine/schemas/detection.py | 8 +++++++- tests/engine/test_small_model_drift.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/anonymizer/engine/schemas/detection.py b/src/anonymizer/engine/schemas/detection.py index 422530ce..0b98ebc0 100644 --- a/src/anonymizer/engine/schemas/detection.py +++ b/src/anonymizer/engine/schemas/detection.py @@ -351,7 +351,13 @@ def _coerce_to_str(cls, v: object) -> str: @field_validator("evidence", mode="before") @classmethod def _clamp_evidence(cls, v: object) -> list[str]: - """Accept any list shape; keep at most 2 non-empty string quotes.""" + """Accept any list shape; keep at most 2 non-empty string quotes. + + A bare string (small models sometimes emit a single quote instead of a + one-element list) is treated as a single-item list rather than dropped. + """ + if isinstance(v, str): + v = [v] if not isinstance(v, list): return [] out: list[str] = [] diff --git a/tests/engine/test_small_model_drift.py b/tests/engine/test_small_model_drift.py index c59ef66b..c2f5b90e 100644 --- a/tests/engine/test_small_model_drift.py +++ b/tests/engine/test_small_model_drift.py @@ -132,3 +132,9 @@ def test_sensitive_category_drift_normalizes_to_latent_identifier(self) -> None: def test_unknown_category_drift_normalizes_to_latent_identifier(self) -> None: result = LatentEntitySchema.model_validate({"label": "x", "value": "y", "category": "some-novel-bucket"}) assert result.category == "latent_identifier" + + def test_bare_string_evidence_wrapped_not_dropped(self) -> None: + """Small models sometimes emit a single evidence quote as a bare string + instead of a one-element list; it should be kept, not silently dropped.""" + result = LatentEntitySchema.model_validate({"label": "x", "value": "y", "evidence": "lives near the clinic"}) + assert result.evidence == ["lives near the clinic"]