diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index 87eb644b..a4da9482 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -58,6 +58,7 @@ EntitiesByValueSchema, EntitiesSchema, LatentEntitiesSchema, + LatentEntitySchema, ) logger = logging.getLogger("anonymizer.detection") @@ -242,7 +243,10 @@ def identify_latent_entities( workflow_name="latent-entity-detection", preview_num_records=preview_num_records, ) - return EntityDetectionResult(dataframe=latent_result.dataframe, failed_records=latent_result.failed_records) + return EntityDetectionResult( + dataframe=_pad_empty_latent_column(latent_result.dataframe), + failed_records=latent_result.failed_records, + ) def run( self, @@ -679,3 +683,43 @@ def _format_privacy_goal(privacy_goal: PrivacyGoal | None) -> str: if privacy_goal is None: return "Not provided" return privacy_goal.to_prompt_string() + + +def _pad_empty_latent_column(df: pd.DataFrame) -> pd.DataFrame: + """Inject a sentinel into any empty ``_latent_entities`` cell. + + Downstream workflows write the DataFrame to parquet via DataDesigner, + which uses pyarrow. pyarrow raises ``Cannot write struct type with no + child field`` when every cell has ``latent_entities: []`` — it can't + infer the nested struct schema from only empty lists. + ``LatentEntitiesSchema._ensure_parquet_writable`` covers this when + pydantic validation runs, but DD does not always route through + ``model_validate`` (e.g. partial-failure fallback), so we pad again + at the DataFrame level. 
+ """ + if COL_LATENT_ENTITIES not in df.columns: + return df + sentinel = [LatentEntitySchema().model_dump()] + + def _fix(cell): + # Preserve each cell's existing shape: the column is uniformly the + # struct/dict shape ({"latent_entities": [...]}) from LatentEntitiesSchema + # in the normal DD path (dict branch), but tolerate a bare-list cell from + # alternate paths. The downstream reader (_coerce_entity_list) accepts + # either shape, so we never mix dict and list within one column. + # None / NaN (e.g. a row absent from a partial-failure fallback merge, + # reintroduced by a pandas reindex) normalizes to the canonical struct + # so the column stays parquet-writable. + if cell is None or (not isinstance(cell, (dict, list)) and pd.isna(cell)): + return {"latent_entities": sentinel} + if isinstance(cell, dict): + if not cell.get("latent_entities"): + return {**cell, "latent_entities": sentinel} + return cell + if isinstance(cell, list) and not cell: + return sentinel + return cell + + df = df.copy() + df[COL_LATENT_ENTITIES] = df[COL_LATENT_ENTITIES].map(_fix) + return df diff --git a/src/anonymizer/engine/replace/llm_replace_workflow.py b/src/anonymizer/engine/replace/llm_replace_workflow.py index ccd5cb1d..966a12ed 100644 --- a/src/anonymizer/engine/replace/llm_replace_workflow.py +++ b/src/anonymizer/engine/replace/llm_replace_workflow.py @@ -229,7 +229,7 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str | - "{{ entity.value }}" ({{ entity.labels_str }}) {%- endfor %} -Examples: {{ <> }} +Per-label type references (generate a NEW realistic value of that kind; do not reuse these example values, and never copy this reference text literally): {{ <> }} Rules: 1. Related entities must stay consistent: @@ -275,5 +275,5 @@ def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str | _EXAMPLE_LOOKUP: dict[str, str] = { - label: f"(e.g. 
{', '.join(examples)})" for label, examples in ENTITY_LABEL_EXAMPLES.items() + label: f"such as {', '.join(examples)}" for label, examples in ENTITY_LABEL_EXAMPLES.items() } diff --git a/src/anonymizer/engine/schemas/detection.py b/src/anonymizer/engine/schemas/detection.py index 6c6c3dea..0b98ebc0 100644 --- a/src/anonymizer/engine/schemas/detection.py +++ b/src/anonymizer/engine/schemas/detection.py @@ -5,7 +5,7 @@ from enum import Enum -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator, model_validator from anonymizer.engine.schemas.shared import _parse_raw_wrapper @@ -60,6 +60,68 @@ class RawValidationDecisionSchema(BaseModel): proposed_label: str = Field(default="") reason: str | None = None + @field_validator("decision", mode="before") + @classmethod + def _normalize_decision_preserve_none(cls, v: object) -> object: + """Tolerate small-model decision-field drift while preserving None. + + Renamed from ``_normalize_decision`` to make the None-preserving + semantics visible at every call site without requiring readers + to compare docstrings against ``ValidationDecisionSchema``'s + ``_normalize_decision`` (the name clash is now removed). + + Small models (gemma4-e4b on legal_court bench) emit free-form + prose in the decision slot — observed: ``"No specific entity + type for the date placeholder."`` and ``"No specific field + matched for this token."``. Without coercion pydantic rejects + the whole chunk's records. + + Distinct from ValidationDecisionSchema._normalize_decision in + ONE important way: this schema's chunked-validation merger + (``merge_chunk_decisions``) treats ``decision=None`` as "no + answer, skip" so a later chunk can supply a real verdict for + the same id. We must therefore preserve None-ness — only + normalize *strings*. + + Strategy: + * None / non-string / blank string -> ``None`` (preserves the + "no answer" semantics the merger relies on). + * Exact match ``keep``/``reclass``/``drop`` (case-insensitive) -> the lowercased choice. 
+ * Substring match (``"Keep."``, ``"DROP!"``, ``"reclass entity"``) + -> the matched choice. Most-specific first so ``"reclass"`` + wins over ``"keep"`` when both substrings appear. + * Free-form prose with no recognizable choice -> ``"keep"`` + (conservative: preserve detection over silently dropping it). + """ + if v is None: + return None + if not isinstance(v, str) or not v.strip(): + return None + cleaned = v.strip().lower() + if cleaned in {"keep", "reclass", "drop"}: + return cleaned + for choice in ("reclass", "drop", "keep"): + if choice in cleaned: + return choice + return "keep" + + @field_validator("proposed_label", mode="before") + @classmethod + def _coerce_proposed_label(cls, v: object) -> object: + """Mirror ValidationDecisionSchema._coerce_proposed_label. + + Small models (gemma4-e4b on the chunked-validation path) emit + ``proposed_label: null`` when the decision is "keep" — pydantic + otherwise rejects None for the str-typed field, dropping the + whole record. The validator chunk schema needs the same loose + coercion the wire ValidationDecisionSchema already has. + """ + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + class RawValidationDecisionsSchema(BaseModel): decisions: list[RawValidationDecisionSchema] = Field(default_factory=list) @@ -100,26 +162,91 @@ def from_raw(cls, raw: object) -> ValidationSkeletonSchema: class ValidationDecisionSchema(BaseModel): - """Per-entity validation decision from the LLM validator.""" + """Loose wire-contract for per-entity validation decisions from the LLM. + + The strict internal shape has only the three fields the server actually + consumes: `id`, `decision`, `proposed_label` (+ optional `reason`). The + previous schema also carried `value` and `label`, but those are + overridden from the trusted candidate_lookup in + enrich_validation_decisions — they were pure drift surface. 
+ + Wire-layer looseness addresses classes M/N/O from the bench: + * `decision` is typed `str` (not ValidationChoice) so DD’s + jsonschema pre-check cannot reject enum drift; before-validator + normalizes to a valid enum. Default "keep" means the field is + NOT in `required`, so omission does not drop the record. + * `proposed_label` is `str | None` so the emitted JSON Schema is + `anyOf: [string, null]` — explicit `null` emissions no longer + fail `type: "string"` at the pre-check. + * `value`/`label` removed entirely — any int/null drift on those + fields is now impossible because they’re not in the schema. + """ id: str - value: str = Field(default="", description="Entity value (echoed from skeleton)") - label: str = Field(default="", description="Entity label (echoed from skeleton)") - decision: ValidationChoice - proposed_label: str = Field( + decision: str = Field( + default="keep", + description='one of: "keep" | "reclass" | "drop"', + ) + # Annotated str | None (not str) so DataDesigner's jsonschema pre-check + # accepts a literal null from a drifted model; _coerce_proposed_label below + # always normalizes None -> "" so the runtime value is in practice always a + # str. Downstream code does not need to guard against None here. + proposed_label: str | None = Field( default="", description="Correct label when decision is 'reclass', otherwise empty", ) reason: str | None = None + @field_validator("proposed_label", mode="before") + @classmethod + def _coerce_proposed_label(cls, v: object) -> object: + """Coerce None / non-string to empty string so the strict downstream + shape is always a string (RawValidationDecisionSchema expects str).""" + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + + @field_validator("decision", mode="before") + @classmethod + def _normalize_decision(cls, v: object) -> str: + """Coerce drift into a valid ValidationChoice value. 
+ + None / non-string / unknown strings default to 'keep' — the + conservative choice that preserves detection. A substring match + catches small-model variants like 'Keep.' or 'DROP!'. + + Note: a related normalizer + ``RawValidationDecisionSchema._normalize_decision_preserve_none`` + exists on the chunked-validation path. It deliberately preserves + None — see that docstring for why. The two diverge because this + schema's field is ``decision: str`` (default "keep"), while the + chunked path uses ``decision: ValidationChoice | None`` to signal + "no answer" to ``merge_chunk_decisions``. The chunked variant is + explicitly named so the divergence is visible at every call site. + """ + if v is None or not isinstance(v, str) or not v.strip(): + return "keep" + cleaned = v.strip().lower() + if cleaned in {"keep", "reclass", "drop"}: + return cleaned + for choice in ("reclass", "drop", "keep"): # check most-specific first + if choice in cleaned: + return choice + return "keep" + class ValidationDecisionsSchema(BaseModel): decisions: list[ValidationDecisionSchema] = Field(default_factory=list) class AugmentedEntitySchema(BaseModel): - value: str = Field(min_length=1) - label: str = Field(min_length=1) + # No min_length: a single empty value/label must not fail-validate the whole + # augmentation batch on a small model. apply_augmented_entities() skips + # blank value/label downstream, so empties are dropped without losing the row. + value: str = "" + label: str = "" reason: str | None = None @@ -137,34 +264,148 @@ class LatentConfidence(str, Enum): class LatentEntitySchema(BaseModel): - category: LatentCategory + """Single latent (inferred) entity from the latent_detector role. + + Loose wire contract across every field so small-model drift does not + drop records at DataDesigner’s jsonschema pre-check: + * category / confidence: typed str (not LatentCategory / LatentConfidence); + a before-validator normalizes case and substring-matches drift. 
+ * label / value: permissive default "", coerce None/int. + * evidence: any list shape accepted at wire; before-validator + clamps to [0, 2] non-empty strings. + * rationale: wire has no min_length; before-validator truncates if + too long; no lower bound is enforced. + """ + + category: str = Field( + default="", + description="Must be: latent_identifier", + ) label: str = Field( - min_length=1, + default="", description=( "General category/class of the inference in snake_case " - "(e.g., employer, specific_institution, home_location, medication, health_condition)" + "(e.g., employer, specific_institution, home_location)" ), ) value: str = Field( - min_length=1, - description="Concise inferred value (generalize if not pinned down strongly by evidence)", + default="", + description="Concise inferred value", + ) + confidence: str = Field( + default="medium", + description="one of: high | medium (see LatentConfidence enum)", ) - confidence: LatentConfidence evidence: list[str] = Field( - min_length=1, - max_length=2, - description="One or two short quotes from the text that support this inference", + default_factory=list, + description="Up to 2 short quotes supporting this inference", ) rationale: str = Field( - min_length=20, - max_length=150, + default="", description="One sentence explaining the inference without adding new facts", ) + @field_validator("category", mode="before") + @classmethod + def _normalize_category(cls, v: object) -> str: + if v is None or not isinstance(v, str) or not v.strip(): + return "latent_identifier" + cleaned = v.strip().lower().replace(" ", "_").replace("-", "_") + allowed = {c.value for c in LatentCategory} + if cleaned in allowed: + return cleaned + # LatentCategory has a single member (latent_identifier); sensitive + # attributes were folded into quasi_identifier on the rewrite side, so + # any non-canonical drift normalizes to the lone latent category. 
+ return LatentCategory.latent_identifier.value + + @field_validator("confidence", mode="before") + @classmethod + def _normalize_confidence(cls, v: object) -> str: + if v is None: + return "medium" + # Numeric -> band + if isinstance(v, (int, float)): + f = float(v) + if f >= 0.66: + return "high" + return "medium" + if not isinstance(v, str) or not v.strip(): + return "medium" + cleaned = v.strip().lower() + if cleaned in {"high", "medium"}: + return cleaned + if cleaned in {"h", "hi"}: + return "high" + return "medium" + + @field_validator("label", "value", mode="before") + @classmethod + def _coerce_to_str(cls, v: object) -> str: + if v is None: + return "" + if isinstance(v, (int, float, bool)): + return str(v) + return v + + @field_validator("evidence", mode="before") + @classmethod + def _clamp_evidence(cls, v: object) -> list[str]: + """Accept any list shape; keep at most 2 non-empty string quotes. + + A bare string (small models sometimes emit a single quote instead of a + one-element list) is treated as a single-item list rather than dropped. + """ + if isinstance(v, str): + v = [v] + if not isinstance(v, list): + return [] + out: list[str] = [] + for item in v: + if item is None: + continue + if not isinstance(item, str): + item = str(item) + item = item.strip() + if item: + out.append(item) + if len(out) >= 2: + break + return out + + @field_validator("rationale", mode="before") + @classmethod + def _cap_rationale(cls, v: object) -> str: + """Truncate verbose rationales (Nemotron-observed 260 chars > 150 max). + Server reads rationale for context only; no lower bound enforced. + """ + if v is None: + return "" + if isinstance(v, (int, float, bool)): + v = str(v) + if not isinstance(v, str): + return "" + s = v.strip() + if len(s) > 150: + return s[:147].rstrip() + "..." 
+ return s + class LatentEntitiesSchema(BaseModel): latent_entities: list[LatentEntitySchema] = Field(default_factory=list) + @model_validator(mode="after") + def _ensure_parquet_writable(self) -> "LatentEntitiesSchema": + """DataDesigner writes this column to parquet; an empty list of + structs makes pyarrow fail with 'Cannot write struct type with no + child field' because it can't infer the nested schema. Inject a + single all-defaults sentinel when empty — downstream code (e.g. + reconstruct_full_disposition) already filters entries with empty + label/value so the sentinel is invisible semantically.""" + if not self.latent_entities: + self.latent_entities = [LatentEntitySchema()] + return self + class EntityByValueSchema(BaseModel): value: str = Field(default="") diff --git a/src/anonymizer/engine/schemas/replace.py b/src/anonymizer/engine/schemas/replace.py index 86a890df..56490641 100644 --- a/src/anonymizer/engine/schemas/replace.py +++ b/src/anonymizer/engine/schemas/replace.py @@ -7,9 +7,16 @@ class EntityReplacementSchema(BaseModel): - original: str = Field(min_length=1, description="The original entity value") - label: str = Field(min_length=1, description="The entity label/type") - synthetic: str = Field(min_length=1, description="The synthetic replacement value") + # No min_length: this is the LLM output schema for Substitute mode, so a + # single drifted entry (e.g. a small model emitting an empty synthetic) + # must not fail-validate the whole replacement map and drop the record. + # Empty original/label are filtered downstream (they cannot match a + # requested entity in _filter_replacement_map_to_input_entities); an empty + # synthetic results in the entity being removed at apply time, which is + # privacy-safe (no PII leak) even if utility-poor. 
+ original: str = Field(default="", description="The original entity value") + label: str = Field(default="", description="The entity label/type") + synthetic: str = Field(default="", description="The synthetic replacement value") class EntityReplacementMapSchema(BaseModel): diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index f9b402a3..7099f227 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py @@ -285,8 +285,14 @@ def test_filters_unknown_ids_and_deduplicates(self) -> None: by_id = {d["id"]: d for d in merged["decisions"]} assert by_id["a"]["decision"] == "keep" assert by_id["b"]["decision"] == "drop" - assert by_id["a"]["value"] == "Alice" # enriched from candidate - assert by_id["a"]["label"] == "first_name" + # ``value`` and ``label`` were dropped from the wire-layer + # ``ValidationDecisionSchema`` to remove a small-model drift surface; + # ``ValidationDecisionsSchema.model_validate`` in + # ``merge_chunk_decisions`` strips them. They are re-filled from + # ``candidate_lookup`` by ``enrich_validation_decisions`` further + # downstream — covered by the end-to-end test below. + assert "value" not in by_id["a"] + assert "label" not in by_id["a"] def test_drops_decisions_without_verdict(self) -> None: """A decision with ``decision=None`` is equivalent to 'no answer' and must not leak through. @@ -311,11 +317,12 @@ def test_later_real_verdict_wins_over_earlier_null_duplicate(self) -> None: chunk_one = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": None}]}) chunk_two = RawValidationDecisionsSchema.model_validate({"decisions": [{"id": "a", "decision": "keep"}]}) merged = merge_chunk_decisions([chunk_one, chunk_two], candidates) + # value/label are stripped at merge time (post-#130 wire-loose + # ValidationDecisionSchema drops them) and re-filled later by + # enrich_validation_decisions from candidate_lookup. 
assert merged["decisions"] == [ { "id": "a", - "value": "Alice", - "label": "first_name", "decision": "keep", "proposed_label": "", "reason": None, diff --git a/tests/engine/test_detection_workflow.py b/tests/engine/test_detection_workflow.py index 3f45f1b1..3fc5cd79 100644 --- a/tests/engine/test_detection_workflow.py +++ b/tests/engine/test_detection_workflow.py @@ -32,6 +32,7 @@ _get_augment_prompt, _get_latent_prompt, _get_validation_prompt, + _pad_empty_latent_column, _resolve_detection_labels, ) from anonymizer.engine.ndd.adapter import FailedRecord, WorkflowRunResult @@ -618,3 +619,37 @@ def test_pool_size_one_does_not_emit_warning( if r.name == "anonymizer.detection" and "pool of" in r.getMessage() and "aliases" in r.getMessage() ] assert pool_warnings == [] + + +class TestPadEmptyLatentColumn: + """``_pad_empty_latent_column`` keeps the latent column parquet-writable by + replacing empty/missing cells with a sentinel struct (PyArrow cannot infer a + struct schema from a column of only empty lists).""" + + def test_empty_struct_cell_gets_sentinel(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [{"latent_entities": []}]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0]["latent_entities"], "empty struct should be padded" + + def test_populated_struct_cell_untouched(self) -> None: + populated = {"latent_entities": [{"label": "employer", "value": "Acme"}]} + df = pd.DataFrame({COL_LATENT_ENTITIES: [populated]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0] == populated + + def test_empty_bare_list_cell_gets_sentinel(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [[]]}) + out = _pad_empty_latent_column(df) + assert out[COL_LATENT_ENTITIES].iloc[0], "empty bare list should be padded" + + def test_none_and_nan_cells_normalize_to_sentinel_struct(self) -> None: + df = pd.DataFrame({COL_LATENT_ENTITIES: [None, float("nan")]}) + out = _pad_empty_latent_column(df) + for i in range(2): + 
cell = out[COL_LATENT_ENTITIES].iloc[i] + assert isinstance(cell, dict) and cell["latent_entities"], f"row {i} should be a padded struct" + + def test_missing_column_is_a_noop(self) -> None: + df = pd.DataFrame({"other": [1, 2]}) + out = _pad_empty_latent_column(df) + assert list(out.columns) == ["other"] diff --git a/tests/engine/test_llm_replace_workflow.py b/tests/engine/test_llm_replace_workflow.py index f7abbc44..a794f62c 100644 --- a/tests/engine/test_llm_replace_workflow.py +++ b/tests/engine/test_llm_replace_workflow.py @@ -356,3 +356,42 @@ def test_filter_replacement_map_empty_warning_does_not_leak_pii( assert "first_name" in caplog.text _assert_no_pii_in_logs(caplog, extra_secrets=("Acme Corp", "NovaCorp")) assert result == {"replacements": []} + + +# --------------------------------------------------------------------------- +# Small-model drift: a single bad entry must not fail the whole map / drop the +# record. EntityReplacementSchema dropped min_length on original/label/synthetic. +# --------------------------------------------------------------------------- + + +def test_empty_synthetic_does_not_fail_validation_and_survives_filter() -> None: + """A blank synthetic used to fail ``min_length=1`` and drop the whole map. 
+ Now the entry validates; it keys on (original, label) so it survives the + requested-entity filter and is applied as a deletion (privacy-safe).""" + parsed_entities = EntitiesByValueSchema.model_validate( + {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]} + ) + raw_map = {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": ""}]} + + result = _filter_replacement_map_to_input_entities(raw_map=raw_map, parsed_entities=parsed_entities) + + assert result == {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": ""}]} + + +def test_empty_original_or_label_entry_is_filtered_without_dropping_record() -> None: + """Empty original/label cannot match a requested entity, so the drifted + entry is dropped while the valid sibling entry is preserved (the whole map + no longer fails validation on the empty fields).""" + parsed_entities = EntitiesByValueSchema.model_validate( + {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]} + ) + raw_map = { + "replacements": [ + {"original": "", "label": "", "synthetic": "junk"}, + {"original": "Alice", "label": "first_name", "synthetic": "Maya"}, + ] + } + + result = _filter_replacement_map_to_input_entities(raw_map=raw_map, parsed_entities=parsed_entities) + + assert result == {"replacements": [{"original": "Alice", "label": "first_name", "synthetic": "Maya"}]} diff --git a/tests/engine/test_small_model_drift.py b/tests/engine/test_small_model_drift.py new file mode 100644 index 00000000..c2f5b90e --- /dev/null +++ b/tests/engine/test_small_model_drift.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests covering small-model output drift on detection schemas. 
+ +These regressions cover the drift modes observed during small-model +benchmarks (gemma4-e2b, gemma4-e4b, nemotron-3-nano:4b, qwen3.5:4b on legal +court / medical visit / employee notes datasets). Each test pins one drift +class so a future schema change that re-tightens the wire contract surfaces +here rather than silently dropping records on small-model runs. +""" + +from __future__ import annotations + +from anonymizer.engine.schemas.detection import ( + LatentEntitySchema, + RawValidationDecisionSchema, + ValidationDecisionSchema, +) + +# --------------------------------------------------------------------------- +# RawValidationDecisionSchema — chunked-validation drift +# --------------------------------------------------------------------------- + + +class TestRawValidationDecisionDrift: + def test_freeform_prose_decision_coerces_to_keep(self) -> None: + """gemma4-e4b on legal_court emits prose like "No specific entity + type for the date placeholder." in the decision slot. Conservative + coercion to ``keep`` so the detection survives downstream.""" + result = RawValidationDecisionSchema.model_validate( + {"id": "x", "decision": "No specific entity type for the date placeholder."} + ) + assert result.decision is not None + assert result.decision.value == "keep" + + def test_explicit_drop_substring_wins(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "DROP."}) + assert result.decision.value == "drop" + + def test_reclass_substring_wins_over_keep(self) -> None: + """``"reclass entity (do not keep as-is)"`` — the more-specific + choice should win even when both substrings are present.""" + result = RawValidationDecisionSchema.model_validate( + {"id": "x", "decision": "reclass entity (do not keep as-is)"} + ) + assert result.decision.value == "reclass" + + def test_none_decision_preserved(self) -> None: + """``decision=None`` is "no answer" in the chunked merger; must NOT + be coerced into a verdict or downstream 
merge logic breaks.""" + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": None}) + assert result.decision is None + + def test_blank_string_decision_treated_as_none(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": " "}) + assert result.decision is None + + def test_proposed_label_none_coerces_to_empty(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": None}) + assert result.proposed_label == "" + + def test_int_proposed_label_coerces_to_str(self) -> None: + result = RawValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": 42}) + assert result.proposed_label == "42" + + +# --------------------------------------------------------------------------- +# ValidationDecisionSchema — wire schema drops value/label +# --------------------------------------------------------------------------- + + +class TestValidationDecisionWireShape: + def test_value_label_stripped_from_wire(self) -> None: + """The wire-loose schema dropped value and label as drift surface; + downstream ``enrich_validation_decisions`` re-fills them from the + trusted ``candidate_lookup``.""" + result = ValidationDecisionSchema.model_validate( + {"id": "x", "decision": "keep", "value": "Alice", "label": "first_name"} + ) + dumped = result.model_dump() + assert "value" not in dumped + assert "label" not in dumped + + def test_decision_freeform_coerces_to_keep(self) -> None: + # ValidationDecisionSchema.decision is ``str`` (not the ValidationChoice + # enum) — the wire-loose contract — so the result is a plain string. 
+ result = ValidationDecisionSchema.model_validate({"id": "x", "decision": "free-form prose"}) + assert result.decision == "keep" + + def test_proposed_label_none_coerces_to_empty(self) -> None: + result = ValidationDecisionSchema.model_validate({"id": "x", "decision": "keep", "proposed_label": None}) + assert result.proposed_label == "" + + +# --------------------------------------------------------------------------- +# LatentEntitySchema — defaults + rationale clamp +# --------------------------------------------------------------------------- + + +class TestLatentEntityDrift: + def test_overlong_rationale_truncates(self) -> None: + """Some models emit 200+ char rationales; clamp to 147 + ``"..."`` + to fit the 150-char schema cap rather than dropping the row.""" + long = "A" * 250 + result = LatentEntitySchema.model_validate({"label": "occupation", "value": "doctor", "rationale": long}) + assert len(result.rationale) <= 150 + assert result.rationale.endswith("...") + + def test_empty_required_fields_default_to_empty_string(self) -> None: + """Pre-loosening these were required ``min_length=1`` and would drop + the row; loose wire allows empty so the parquet-pad sentinel + path can build a placeholder row when needed.""" + result = LatentEntitySchema() + assert result.label == "" + assert result.value == "" + + def test_invalid_confidence_coerces_to_medium(self) -> None: + result = LatentEntitySchema.model_validate( + {"label": "x", "value": "y", "confidence": "very-high", "rationale": "ok"} + ) + assert result.confidence == "medium" + + def test_sensitive_category_drift_normalizes_to_latent_identifier(self) -> None: + """A category string containing "sensitive" must normalize to the lone + LatentCategory member rather than raising AttributeError (sensitive + attributes were folded into quasi_identifier on the rewrite side).""" + result = LatentEntitySchema.model_validate( + {"label": "x", "value": "y", "category": "latent_sensitive_attribute"} + ) + assert 
result.category == "latent_identifier" + + def test_unknown_category_drift_normalizes_to_latent_identifier(self) -> None: + result = LatentEntitySchema.model_validate({"label": "x", "value": "y", "category": "some-novel-bucket"}) + assert result.category == "latent_identifier" + + def test_bare_string_evidence_wrapped_not_dropped(self) -> None: + """Small models sometimes emit a single evidence quote as a bare string + instead of a one-element list; it should be kept, not silently dropped.""" + result = LatentEntitySchema.model_validate({"label": "x", "value": "y", "evidence": "lives near the clinic"}) + assert result.evidence == ["lives near the clinic"]