Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/anonymizer/engine/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
COL_DOMAIN_SUPPLEMENT = "_domain_supplement"
COL_DOMAIN_SUPPLEMENT_PRIVACY = "_domain_supplement_privacy"
COL_SENSITIVITY_DISPOSITION = "_sensitivity_disposition"
COL_SIMPLE_DISPOSITION = "_simple_disposition" # internal hand-off: loose LLM wire output
COL_SENSITIVITY_DISPOSITION_BLOCK = "_sensitivity_disposition_block"
COL_REWRITE_DISPOSITION_BLOCK = "_rewrite_disposition_block"
COL_REPLACEMENT_MAP_FOR_PROMPT = "_replacement_map_for_prompt"
Expand Down
409 changes: 409 additions & 0 deletions src/anonymizer/engine/rewrite/disposition_derivation.py

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion src/anonymizer/engine/rewrite/qa_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,18 @@
)

# Resolve the schema field that carries the domain so the Jinja key stays in
# sync with the field name.
#   1. Prefer the field annotated with the ``Domain`` enum (strict-mode
#      contract).
#   2. Otherwise fall back to a field literally named ``domain`` so wire-loose
#      typing of ``domain`` (``str`` instead of the ``Domain`` enum) still
#      resolves to the same field. The Domain enum hint is preserved in the
#      field description and the ``_normalize_domain`` before-validator.
_DOMAIN_KEY = next(
    (name for name, info in DomainClassificationSchema.model_fields.items() if info.annotation is Domain),
    None,
)
if _DOMAIN_KEY is None and "domain" in DomainClassificationSchema.model_fields:
    _DOMAIN_KEY = "domain"
if _DOMAIN_KEY is None:
    # Reached only when BOTH lookups fail. A single raise is kept, and its
    # message names the name-based contract: an annotation-only message would
    # be misleading now that a plain ``domain`` field is also accepted.
    raise RuntimeError("DomainClassificationSchema must define a 'domain' field")

# ---------------------------------------------------------------------------
# Stage 1 pre-step: format disposition → disposition block
Expand Down
217 changes: 212 additions & 5 deletions src/anonymizer/engine/rewrite/sensitivity_disposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@

from __future__ import annotations

from data_designer.config.column_configs import LLMStructuredColumnConfig
import logging
from typing import Any

from data_designer.config import custom_column_generator
from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig
from data_designer.config.column_types import ColumnConfigT
from pydantic import ValidationError

from anonymizer.config.models import RewriteModelSelection
from anonymizer.config.rewrite import PrivacyGoal
Expand All @@ -14,13 +19,27 @@
COL_ENTITIES_BY_VALUE,
COL_LATENT_ENTITIES,
COL_SENSITIVITY_DISPOSITION,
COL_SIMPLE_DISPOSITION,
COL_TAG_NOTATION,
COL_TAGGED_TEXT,
_jinja,
)
from anonymizer.engine.ndd.model_loader import resolve_model_alias
from anonymizer.engine.prompt_utils import substitute_placeholders
from anonymizer.engine.schemas import SensitivityDispositionSchema, StrictSensitivityDispositionSchema
from anonymizer.engine.rewrite.disposition_derivation import (
_flatten_context,
derive_combined_risk_level,
reconstruct_full_disposition,
template_protection_reason,
)
from anonymizer.engine.schemas import (
EntityDispositionSchema,
SensitivityDispositionSchema,
SimpleDispositionResult,
)
from anonymizer.engine.schemas.rewrite import _ENTITY_LABEL_TO_CATEGORY

logger = logging.getLogger(__name__)


def _get_sensitivity_disposition_prompt(
Expand Down Expand Up @@ -257,6 +276,165 @@ def _get_sensitivity_disposition_prompt(
# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
# Pessimistic fallback when reconstruction yields nothing
# ---------------------------------------------------------------------------


def _pessimistic_fallback_disposition(
    entities_by_value: object,
    latent_entities: object,
) -> SensitivityDispositionSchema:
    """Derive a conservative disposition from the entity context alone.

    This is the last-resort path used when the LLM-driven reconstruction
    produced nothing usable. It never consults model output; every decision
    is a fixed rule applied to the flattened entity context.

    Rules applied per context slot (slots whose label or value is blank
    after stripping are skipped):
      * ``source == "latent"``      -> category ``latent_identifier``.
      * any other source            -> category looked up from the entity
        label in ``_ENTITY_LABEL_TO_CATEGORY``; unmapped labels become
        ``quasi_identifier``.
      * ``direct_identifier``       -> method ``replace``, sensitivity ``high``.
      * every other category        -> method ``generalize``, sensitivity
        ``medium``.

    Each item's ``id`` is the slot's 1-based position in the flattened
    context, so skipped slots leave gaps in the id sequence.

    If no slot survives the filter, an error is logged and a single no-op
    item (``leave_as_is`` / ``low``, blank label and value) is emitted so the
    returned schema is never built from an empty list.

    Args:
        entities_by_value: Raw value of the entities-by-value column, passed
            through unchanged to ``_flatten_context``.
        latent_entities: Raw value of the latent-entities column, passed
            through unchanged to ``_flatten_context``.

    Returns:
        A ``SensitivityDispositionSchema`` holding at least one item.
    """
    dispositions: list[EntityDispositionSchema] = []
    context_slots = _flatten_context(entities_by_value, latent_entities)

    for position, entry in enumerate(context_slots, start=1):
        entity_label = (entry.get("entity_label") or "").strip()
        entity_value = (entry.get("entity_value") or "").strip()
        origin = entry.get("source") or "tagged"
        if not (entity_label and entity_value):
            # Nothing meaningful to protect in this slot.
            continue

        category = (
            "latent_identifier"
            if origin == "latent"
            else _ENTITY_LABEL_TO_CATEGORY.get(entity_label, "quasi_identifier")
        )
        if category == "direct_identifier":
            method, sensitivity = "replace", "high"
        else:
            method, sensitivity = "generalize", "medium"

        dispositions.append(
            EntityDispositionSchema(
                id=position,
                source=origin,
                category=category,
                sensitivity=sensitivity,
                entity_label=entity_label,
                entity_value=entity_value,
                protection_method_suggestion=method,
                combined_risk_level=derive_combined_risk_level(category, method, sensitivity),
                protection_reason=template_protection_reason(category, method, sensitivity),
            )
        )

    if dispositions:
        return SensitivityDispositionSchema(sensitivity_disposition=dispositions)

    # No usable entity in the context at all. Upstream is expected to have
    # filtered such rows out already, so this is reported as an error, but
    # the contract of this path is to still return a non-empty disposition
    # rather than let the row fail on an empty list.
    logger.error(
        "pessimistic fallback: empty entity context at the disposition step "
        "(orchestrator should have short-circuited entity-free rows); emitting "
        "a single no-op disposition so the row is not dropped"
    )
    noop_category, noop_method, noop_sensitivity = "quasi_identifier", "leave_as_is", "low"
    placeholder = EntityDispositionSchema(
        id=1,
        source="tagged",
        category=noop_category,
        sensitivity=noop_sensitivity,
        entity_label="",
        entity_value="",
        protection_method_suggestion=noop_method,
        combined_risk_level=derive_combined_risk_level(noop_category, noop_method, noop_sensitivity),
        protection_reason=template_protection_reason(noop_category, noop_method, noop_sensitivity),
    )
    return SensitivityDispositionSchema(sensitivity_disposition=[placeholder])
Comment on lines +337 to +364

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Duplicate return statements — the `if not items:` branch and the fall-through `return` are identical. The comment says "Still better to raise here", but neither branch explicitly raises; the raise comes implicitly from `SensitivityDispositionSchema` validation. Since both paths return the same expression, the `if` branch is redundant. Either remove the `if not items:` block (letting validation raise naturally), or replace it with an explicit `raise` to make the intent clear and avoid the misleading comment.

Suggested change
if not items:
# Genuinely no entities at all in context — the orchestrator should
# have short-circuited before this step. Still better to raise here
# than to silently emit garbage; SensitivityDispositionSchema's
# min_length=1 invariant will surface the bug.
return SensitivityDispositionSchema(sensitivity_disposition=items)
return SensitivityDispositionSchema(sensitivity_disposition=items)
# Genuinely no entities at all in context — the orchestrator should
# have short-circuited before this step. SensitivityDispositionSchema's
# min_length=1 invariant will surface the bug via ValidationError.
return SensitivityDispositionSchema(sensitivity_disposition=items)



# ---------------------------------------------------------------------------
# Reconstruction column
# ---------------------------------------------------------------------------


@custom_column_generator(required_columns=[COL_SIMPLE_DISPOSITION, COL_ENTITIES_BY_VALUE, COL_LATENT_ENTITIES])
def _reconstruct_full_disposition_column(row: dict[str, Any]) -> dict[str, Any]:
"""Rebuild the strict disposition for one row from the loose LLM output.

Reads the loose payload from ``COL_SIMPLE_DISPOSITION`` together with the
entity context columns (``COL_ENTITIES_BY_VALUE``, ``COL_LATENT_ENTITIES``)
and writes the reconstructed result to ``COL_SENSITIVITY_DISPOSITION``, so
downstream consumers keep reading the same column name.

Accepted payload forms for ``COL_SIMPLE_DISPOSITION``:
* a ``SimpleDispositionResult`` instance (used as-is);
* a string, which is decoded as JSON (undecodable text is treated as an
empty payload);
* anything else, which is handed to ``SimpleDispositionResult.model_validate``.

Fallback: whenever the payload fails validation, contains no disposition
items, or ``reconstruct_full_disposition`` raises ``ValidationError``, the
disposition is built by ``_pessimistic_fallback_disposition`` from the
entity context instead. This addresses the record-drop concern Lipika and
Andre raised on PR #130 — per the original author's note, an empty
disposition would fail the downstream ``parse_sensitivity_disposition``
``min_length=1`` check.

Args:
row: The row mapping supplied by the column generator framework.

Returns:
The same ``row`` mapping with ``COL_SENSITIVITY_DISPOSITION`` set.
"""
# A missing column and any falsy value (None, "", {}) are both normalised
# to an empty dict here.
simple_raw = row.get(COL_SIMPLE_DISPOSITION, {}) or {}
if isinstance(simple_raw, SimpleDispositionResult):
# Already a validated model instance; no parsing needed.
simple = simple_raw
else:
if isinstance(simple_raw, str):
# Function-local import: json is only needed on this string path.
import json as _json

# Best-effort decode: any failure degrades to an empty payload rather
# than raising, so the row continues to the fallback path below.
try:
simple_raw = _json.loads(simple_raw)
except Exception:
simple_raw = {}
# Validate against the loose wire schema. On failure, substitute a
# default-constructed result; presumably its ``sensitivity_disposition``
# is empty so the emptiness check below routes to the fallback — TODO
# confirm against the SimpleDispositionResult definition.
try:
simple = SimpleDispositionResult.model_validate(simple_raw)
except ValidationError as exc:
# The exception text is truncated to 200 characters to bound log size.
logger.warning(
"reconstruct: SimpleDispositionResult failed to validate (%s); "
"falling back to pessimistic disposition from entity context",
str(exc)[:200],
)
simple = SimpleDispositionResult()

# Context columns are passed through untouched; ``None`` is forwarded if a
# column is absent.
entities_by_value = row.get(COL_ENTITIES_BY_VALUE)
latent_entities = row.get(COL_LATENT_ENTITIES)

# NOTE(review): neither call to ``_pessimistic_fallback_disposition`` below
# is wrapped in a try/except, so any exception it raises propagates out of
# this generator.
if not simple.sensitivity_disposition:
logger.warning(
"reconstruct: empty SimpleDispositionResult for row; "
"falling back to pessimistic disposition from entity context"
)
full = _pessimistic_fallback_disposition(entities_by_value, latent_entities)
else:
# Only ``ValidationError`` from the reconstructor triggers the fallback;
# other exception types are not caught here.
try:
full = reconstruct_full_disposition(simple, entities_by_value, latent_entities)
except ValidationError as exc:
logger.warning(
"reconstruct: ValidationError after orphan-skipping (likely all items out of context range); "
"falling back to pessimistic disposition. detail=%s",
str(exc)[:200],
)
full = _pessimistic_fallback_disposition(entities_by_value, latent_entities)
Comment on lines +412 to +427

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Uncaught ValidationError from _pessimistic_fallback_disposition can still drop the row

Both call-sites of _pessimistic_fallback_disposition inside this function are unguarded. When context is empty (all slots have empty labels/values — or there are no context rows at all), _pessimistic_fallback_disposition calls SensitivityDispositionSchema(sensitivity_disposition=[]), which raises a ValidationError from the min_length=1 constraint. Because neither call-site is wrapped in its own try/except, that exception propagates out of _reconstruct_full_disposition_column uncaught, and the row still drops — contradicting the PR's stated guarantee of preventing whole-record drops.

The try/except on lines 398–406 only covers `reconstruct_full_disposition`; both `_pessimistic_fallback_disposition` calls (lines 396 and 406) are outside its protection. Wrapping them as well (or catching at the column level) would close the gap.


row[COL_SENSITIVITY_DISPOSITION] = full.model_dump()
return row


# ---------------------------------------------------------------------------
# Workflow
# ---------------------------------------------------------------------------


class SensitivityDispositionWorkflow:
def columns(
self,
Expand All @@ -266,17 +444,46 @@ def columns(
data_summary: str | None = None,
strict_entity_protection: bool = False,
) -> list[ColumnConfigT]:
"""Two-step pipeline for small-model robustness:

1. LLM column emits the loose ``SimpleDispositionResult`` to a
hidden ``COL_SIMPLE_DISPOSITION`` column. The wire schema has
no enum/required/minLength constraints, so DataDesigner's
jsonschema pre-validate gate accepts drifted small-model
output that strict ``SensitivityDispositionSchema`` would
reject. ``drop=True`` keeps this internal hand-off out of the
user-facing preview DataFrame.
2. Pure-python reconstruction column rebuilds the strict
``SensitivityDispositionSchema`` from the loose wire output
plus the entity-context columns. No LLM call; deterministic;
handles id pairing, category/method drift normalization,
``combined_risk_level`` derivation, and pessimistic fallback
when the LLM produces nothing usable.

``strict_entity_protection`` continues to flow into the prompt's
``<strict_entity_protection>`` block — the contract is enforced
at prompt time. The output_format selection between
``SensitivityDispositionSchema`` and
``StrictSensitivityDispositionSchema`` is no longer needed
because we always emit ``SimpleDispositionResult`` on the wire
and reconstruct into the canonical (non-strict) schema, which
downstream consumers already accept.
"""
disposition_alias = resolve_model_alias("disposition_analyzer", selected_models)
output_schema = StrictSensitivityDispositionSchema if strict_entity_protection else SensitivityDispositionSchema
return [
LLMStructuredColumnConfig(
name=COL_SENSITIVITY_DISPOSITION,
name=COL_SIMPLE_DISPOSITION,
prompt=_get_sensitivity_disposition_prompt(
privacy_goal,
data_summary,
strict_entity_protection=strict_entity_protection,
),
model_alias=disposition_alias,
output_format=output_schema,
output_format=SimpleDispositionResult,
drop=True,
),
CustomColumnConfig(
name=COL_SENSITIVITY_DISPOSITION,
generator_function=_reconstruct_full_disposition_column,
),
]
4 changes: 4 additions & 0 deletions src/anonymizer/engine/schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
RewriteOutputSchema,
SensitivityDispositionSchema,
SensitivityLevel,
SimpleDispositionItem,
SimpleDispositionResult,
StrictCombinedRiskLevel,
StrictEntityDispositionSchema,
StrictProtectionMethod,
Expand Down Expand Up @@ -109,6 +111,8 @@
"RewriteOutputSchema",
"SensitivityDispositionSchema",
"SensitivityLevel",
"SimpleDispositionItem",
"SimpleDispositionResult",
"StrictCombinedRiskLevel",
"StrictEntityDispositionSchema",
"StrictProtectionMethod",
Expand Down
Loading
Loading