diff --git a/src/anonymizer/engine/rewrite/domain_classification.py b/src/anonymizer/engine/rewrite/domain_classification.py
index f943c41d..fd976d34 100644
--- a/src/anonymizer/engine/rewrite/domain_classification.py
+++ b/src/anonymizer/engine/rewrite/domain_classification.py
@@ -63,7 +63,7 @@ def __post_init__(self) -> None:
DomainMetadata(
domain=Domain.BIOGRAPHY_PROFILE,
classification_description="Personal profiles, CVs/resumes, biographical narratives, employee bios",
- quality_supplement="Focus on: core life roles and occupations; long-term activities and commitments; career trajectory and development (including training, education, major transitions, and advancement into current roles); distinctive skills or ways of doing things in those roles (e.g., creative methods, sourcing philosophy, technical or artistic approach); central motivations and formative influences rooted in early experience; and key, ongoing relationships or family structures that shape the individual's life or work.\n\nYou MUST capture high-level educational background and professional trajectory when present, expressed in abstract terms (e.g., advanced study, early-stage training, work at major observatory, move into leadership), even if specific institutions or dates must be generalized.\n\nAlso capture signature outputs or recurring creations that represent the individual's identity or history (e.g., a recurring research theme, a major discovery focus, a signature dish), especially when tied to motivation or heritage.\n\nDrop: street-level or hyper-local locations, exact ages, precise institutions, and identifying anecdotes that do not materially affect development, output, values, or long-term identity.",
+ quality_supplement="Focus on: durable life roles and occupations; long-term activities, responsibilities, and commitments; broad career trajectory and development (including training, education, major transitions, and movement into current roles); sustained areas of work, practice, or contribution; and broad family or support structures only when materially relevant to long-term motivations, responsibilities, or life trajectory.\n\nCapture high-level educational and professional progression when present, in abstract or category-level terms (e.g., advanced technical training, work in a major research environment, transition into leadership), even when specific institutions, dates, employers, or locations may need to be generalized or omitted.\n\nAlso capture enduring motivations, values, or formative influences that explain long-term goals or career direction, expressed at a high level rather than through specific autobiographical detail.\n\nDo not preserve information primarily because it makes the person distinctive, memorable, or recognizable. Avoid narrow specializations, signature styles, uniquely identifying methods, or highly specific combinations of education, career history, geography, specialization, family structure, or timeline that primarily increase recognizability rather than preserving core utility.\n\nDrop: street-level or hyper-local locations; exact ages or birth dates; precise institutions or employers when unnecessary; names of relatives or close contacts; and identifying anecdotes.",
),
DomainMetadata(
domain=Domain.INSURANCE,
diff --git a/src/anonymizer/engine/rewrite/evaluate.py b/src/anonymizer/engine/rewrite/evaluate.py
index ee7e74ed..827c2956 100644
--- a/src/anonymizer/engine/rewrite/evaluate.py
+++ b/src/anonymizer/engine/rewrite/evaluate.py
@@ -75,11 +75,32 @@ def _render_quality_reanswer_prompt(row: dict[str, Any]) -> str:
prompt = """You are taking a reading comprehension exam. You will answer each question about the text.
+The text is an anonymized rewrite of an original document. This means:
+- identifying details may have been removed,
+- dates or locations may be generalized,
+- specific entities may be replaced with broader categories,
+- and information may be paraphrased or expressed more abstractly.
+
+Your task is to recover the BEST SEMANTIC ANSWER supported by the rewritten text,
+not merely to extract exact wording.
+
+PREFER GENERALIZED BUT SUPPORTED ANSWERS OVER "UNKNOWN". Use "unknown" ONLY if the
+rewritten text does not contain enough information to reasonably determine the answer,
+even through paraphrase, abstraction, or semantic inference.
+
-- If the text does not state the answer, use "unknown"
-- Keep answers concise and factual
-- Do not invent details
-- You MUST provide an answer for EVERY item in the template below
+- Keep answers concise and factual.
+- Do not invent details that are not supported by the text.
+- The answer may appear:
+ - in generalized form,
+ - at a broader level of abstraction,
+ - paraphrased,
+ - distributed across multiple sentences,
+ - or with identifying details removed.
+- Use semantic understanding, not exact wording overlap.
+- If the rewritten text clearly preserves the meaning needed to answer the question,
+ answer using that preserved meaning.
+- You MUST provide an answer for EVERY item in the template below.
@@ -87,7 +108,8 @@ def _render_quality_reanswer_prompt(row: dict[str, Any]) -> str:
-Fill in the "answer" field for each item. Do not add or remove items.
+Fill in the "answer" field for each item.
+Do not add or remove items.
<>
@@ -186,28 +208,52 @@ def _render_quality_compare_prompt(row: dict[str, Any]) -> str:
for item in qa.items
]
- prompt = """A student took a reading comprehension exam.
+ prompt = """A student took a reading comprehension exam. The text the student read was an anonymized
+rewrite of the original — so abstraction shifts are the EXPECTED and INTENDED result, not errors.
Compare the student's answer to the reference answer and grade each on a 0.0-1.0 scale:
-* 1.0 = Perfect match (semantically equivalent)
-* 0.7-0.9 = Mostly correct (minor details missing/different)
-* 0.4-0.6 = Partially correct (some key info present)
+* 1.0 = Core meaning preserved (including abstraction that preserves functional meaning)
+* 0.7-0.9 = Mostly correct (minor details missing, or meaning preserved at a broader level)
+* 0.4-0.6 = Partially correct (some key info present but meaningful content lost)
* 0.1-0.3 = Minimal match (tangentially related)
* 0.0 = No match (wrong, "unknown", or contradictory)
-* No credit for invented or contradictory information
-* Semantically equivalent answers get full credit
-* "unknown" answers get 0.0 score
-* A generalized or abstracted answer that preserves core semantic meaning is NOT wrong —
- score it on whether the meaning survives, not on whether specific details are retained.
- Example: "a state university" matching "a public university in New Jersey" scores ≥ 0.7
- if the institutional type is what matters.
-* For "important" units: if the answer is correct at a higher level of abstraction,
- score ≥ 0.7 even if specific details are omitted.
-* For "critical" units: hold to a stricter standard — core meaning must be present.
+* No credit for invented or contradictory information.
+* Semantically equivalent answers get full credit.
+* "unknown" answers get 0.0 score.
+* Do NOT penalize removal of identifying specificity unless that specificity is
+ central to the meaning of the question.
+
+ABSTRACTION TOLERANCE — this is the most important rule:
+The student's text was anonymized, so answers may legitimately use broader,
+safer, or less identifying language than the reference answer.
+
+Score based on whether the CORE FUNCTIONAL MEANING survives,
+not on whether the same level of specificity is preserved.
+
+A broader or more abstract answer may still deserve high credit if it preserves:
+- the same role or relationship,
+- the same event or outcome,
+- the same procedural or causal meaning,
+- the same type of activity or allegation,
+- or the same substantive point relevant to the question.
+
+Do NOT penalize answers merely because they are:
+- less specific,
+- less localized,
+- less temporally precise,
+- or less uniquely identifying.
+
+Reduce score only when the abstraction removes information that is central
+to answering the question itself.
+
+* For "critical" units: core functional meaning must survive, but abstraction that
+ preserves the functional meaning still scores ≥ 0.7.
+* For "important" units: abstraction that preserves the functional meaning scores ≥ 0.7;
+ broader abstraction that preserves the gist scores ≥ 0.5.
diff --git a/src/anonymizer/engine/rewrite/parsers.py b/src/anonymizer/engine/rewrite/parsers.py
index 05f88b02..71fe0b09 100644
--- a/src/anonymizer/engine/rewrite/parsers.py
+++ b/src/anonymizer/engine/rewrite/parsers.py
@@ -111,10 +111,40 @@ def parse_privacy_qa(raw: Any) -> PrivacyQAPairsSchema:
raise TypeError(f"Expected PrivacyQAPairsSchema or dict, got {type(raw).__name__}")
+def _correct_disposition_consistency(raw: dict) -> dict:
+    """Auto-correct LLM consistency violations before strict schema validation.
+
+    Handles combined_risk_level='low' paired with a protection_method_suggestion other than
+    'leave_as_is': low combined risk means no protection is needed, so the method is forced to
+    'leave_as_is' and generalization_suggestion is reset to 'N/A'. ``raw`` is mutated in place.
+    A warning is logged per corrected entity so the miscalibration shows up in default logs.
+    """
+    entities = raw.get("sensitivity_disposition", [])
+    if not isinstance(entities, list):
+        return raw
+    for entity in entities:
+        if not isinstance(entity, dict):
+            continue
+        if entity.get("combined_risk_level") == "low" and entity.get("protection_method_suggestion") not in (
+            "leave_as_is",
+            None,
+        ):
+            logger.warning(
+                "Auto-correcting entity %s: combined_risk_level='low' + "
+                "protection_method_suggestion='%s' → 'leave_as_is'",
+                entity.get("id"),
+                entity.get("protection_method_suggestion"),
+            )
+            entity["protection_method_suggestion"] = "leave_as_is"
+            entity["generalization_suggestion"] = "N/A"
+    return raw
+
+
def parse_sensitivity_disposition(raw: Any) -> SensitivityDispositionSchema:
raw = normalize_payload(raw)
if isinstance(raw, SensitivityDispositionSchema): # catches StrictSensitivityDispositionSchema too
return raw
if isinstance(raw, dict):
+ raw = _correct_disposition_consistency(raw)
return SensitivityDispositionSchema.model_validate(raw)
raise ValueError(f"Cannot parse sensitivity disposition from {type(raw)}")
diff --git a/src/anonymizer/engine/rewrite/qa_generation.py b/src/anonymizer/engine/rewrite/qa_generation.py
index 03978657..edf25808 100644
--- a/src/anonymizer/engine/rewrite/qa_generation.py
+++ b/src/anonymizer/engine/rewrite/qa_generation.py
@@ -14,6 +14,7 @@
from anonymizer.engine.constants import (
COL_DOMAIN,
COL_DOMAIN_SUPPLEMENT,
+ COL_LATENT_ENTITIES,
COL_MEANING_UNITS,
COL_MEANING_UNITS_SERIALIZED,
COL_PRIVACY_QA,
@@ -25,7 +26,7 @@
)
from anonymizer.engine.ndd.model_loader import resolve_model_alias
from anonymizer.engine.prompt_utils import substitute_placeholders
-from anonymizer.engine.rewrite.parsers import parse_sensitivity_disposition
+from anonymizer.engine.rewrite.parsers import normalize_payload, parse_sensitivity_disposition
from anonymizer.engine.schemas import (
Domain,
DomainClassificationSchema,
@@ -51,19 +52,31 @@
# ---------------------------------------------------------------------------
-@custom_column_generator(required_columns=[COL_SENSITIVITY_DISPOSITION])
+@custom_column_generator(required_columns=[COL_SENSITIVITY_DISPOSITION, COL_LATENT_ENTITIES])
 def _format_disposition_block(row: dict[str, Any]) -> dict[str, Any]:
     """Serialize sensitivity disposition into a JSON block for the meaning unit extraction prompt."""
     disposition = parse_sensitivity_disposition(row.get(COL_SENSITIVITY_DISPOSITION, {}))
-    block = [
-        {
+    # Index latent evidence by (label, value); null lists/evidence and keyless entries mean "no evidence".
+    raw_latent = normalize_payload(row.get(COL_LATENT_ENTITIES)) or {}
+    latent_list = (raw_latent.get("latent_entities") or []) if isinstance(raw_latent, dict) else []
+    evidence_by_label_value: dict[tuple[str | None, str | None], list[str]] = {
+        (e.get("label"), e.get("value")): e.get("evidence") or [] for e in latent_list if isinstance(e, dict)
+    }
+
+    block = []
+    for e in disposition.sensitivity_disposition:
+        entry: dict[str, Any] = {
             "entity_value": e.entity_value,
             "does_need_protection": e.needs_protection,
             "protection_method_suggestion": e.protection_method_suggestion,
             "category": e.category,
         }
-        for e in disposition.sensitivity_disposition
-    ]
+        if e.protection_method_suggestion == "generalize" and e.generalization_suggestion != "N/A":  # N/A = no hint
+            entry["generalization_suggestion"] = e.generalization_suggestion
+        if e.protection_method_suggestion == "suppress_inference":
+            entry["evidence"] = evidence_by_label_value.get((e.entity_label, e.entity_value), [])
+        block.append(entry)
+
     row[COL_SENSITIVITY_DISPOSITION_BLOCK] = json.dumps(block, ensure_ascii=False)
     return row
@@ -108,23 +121,30 @@ def _get_meaning_unit_extraction_prompt() -> str:
(roles, relationships, high-level descriptions).
- If it cannot be expressed safely without carrying identifying detail, DROP the unit.
-B) TRANSFORM-ALLOWED (allowed only if generalized/suppress_inference)
-If an entry has:
- - does_need_protection = True
- AND protection_method_suggestion is "generalize" OR "suppress_inference"
-Then you MAY still capture the meaning, BUT you must NOT use the entity_value itself.
-Instead: preserve the semantic role while moving to a broader, less identifying level of abstraction.
-This may include:
-
- • Geographic hierarchy: city → state → region → country
- • Institutional hierarchy: named organization → organization type
- • Role hierarchy: specific specialty → broader profession
- • Temporal abstraction: exact date → approximate period
- • Quantitative abstraction: exact number → rough scale
- • Named program/product → generic descriptive category
-
-The generalized phrasing must prevent recovery or lookup of the original entity_value while
-still preserving the meaning needed for usefulness.
+B) TRANSFORM-ALLOWED, WITH DIFFERENT RULES FOR GENERALIZE VS SUPPRESS_INFERENCE
+
+If protection_method_suggestion is "generalize":
+ - You MAY preserve the fact as a meaning unit.
+ - Do NOT use entity_value itself.
+ - Use generalization_suggestion as the abstraction level.
+
+If protection_method_suggestion is "suppress_inference":
+ - Do NOT create a meaning unit whose purpose is to preserve that inferred entity.
+ - Do NOT treat the latent inference itself as utility-bearing merely because it is inferable.
+ - suppress_inference applies not only to the abstract inferred attribute, but also to
+ explicit details whose primary semantic role is to enable reconstruction of that attribute.
+ - The evidence field identifies text spans that support the inference. Treat these as
+ potentially sensitive reconstruction clues.
+ - Avoid preserving these clues verbatim, through close paraphrases, or in combinations
+ that would materially reconstruct the suppressed inference.
+ - If an explicit fact has substantial independent utility beyond the suppressed inference,
+ preserve it only at the broadest abstraction that retains that utility without
+ reconstructing the inference.
+ - If a fact's primary value is only to support the suppressed inference, DROP it.
+ - Evaluate meaning units collectively as well as individually. Multiple generalized facts
+ may still reconstruct a suppressed inference when combined.
+ - When in doubt, prefer preserving the single most utility-bearing generalized clue rather
+ than multiple supporting details.
C) SAFE / LEFT-AS-IS (no special avoidance required)
If an entry has:
diff --git a/src/anonymizer/engine/rewrite/rewrite_generation.py b/src/anonymizer/engine/rewrite/rewrite_generation.py
index 7892b1cf..d5f0c3c5 100644
--- a/src/anonymizer/engine/rewrite/rewrite_generation.py
+++ b/src/anonymizer/engine/rewrite/rewrite_generation.py
@@ -95,7 +95,8 @@ def _get_rewrite_prompt(privacy_goal: PrivacyGoal, data_summary: str | None = No
Apply each protection method as follows:
- "replace": Substitute the entity value with the corresponding synthetic value from the replacement map.
Use the synthetic value consistently for every occurrence.
-- "generalize": Replace with a broader category or range
+- "generalize": Replace with the provided generalization_suggestion when present.
+ If no suggestion is provided, replace with a broader category or range
(e.g., a specific city → "a city in the Pacific Northwest", exact age → "in their late 30s").
- "remove": Omit the detail entirely. Rewrite the surrounding sentence so it reads naturally without it.
- "suppress_inference": Modify the text so the attribute cannot be reliably inferred by a motivated reader.
@@ -135,15 +136,16 @@ def _format_rewrite_disposition_block(row: dict[str, Any]) -> dict[str, Any]:
if not e.needs_protection:
continue
d = e.model_dump(mode="json")
-        block.append(
-            {
-                "entity_label": d["entity_label"],
-                "entity_value": d["entity_value"],
-                "sensitivity": d["sensitivity"],
-                "protection_method_suggestion": d["protection_method_suggestion"],
-                "protection_reason": d["protection_reason"],
-            }
-        )
+        entry = {
+            "entity_label": d["entity_label"],
+            "entity_value": d["entity_value"],
+            "sensitivity": d["sensitivity"],
+            "protection_method_suggestion": d["protection_method_suggestion"],
+            "protection_reason": d["protection_reason"],
+        }
+        if d["protection_method_suggestion"] == "generalize" and d["generalization_suggestion"] != "N/A":  # no hint
+            entry["generalization_suggestion"] = d["generalization_suggestion"]
+        block.append(entry)
row[COL_REWRITE_DISPOSITION_BLOCK] = block
return row
diff --git a/src/anonymizer/engine/rewrite/sensitivity_disposition.py b/src/anonymizer/engine/rewrite/sensitivity_disposition.py
index fca24ba8..309694ed 100644
--- a/src/anonymizer/engine/rewrite/sensitivity_disposition.py
+++ b/src/anonymizer/engine/rewrite/sensitivity_disposition.py
@@ -213,6 +213,10 @@ def _get_sensitivity_disposition_prompt(
- For latent entities, "replace" is rarely appropriate (value not in text).
- For source="tagged": entity_value MUST match tag exactly.
- For source="latent": entity_label/value MUST match the provided latent entity.
+- generalization_suggestion: if protection_method_suggestion is "generalize", provide a
+ concise phrase showing exactly how this entity should be generalized in the rewritten text
+ (e.g., "a city in the Pacific Northwest", "late 1970s", "a public university").
+ Set to "N/A" for all other protection methods.
COVERAGE REQUIREMENTS:
- Include ONE entry for EVERY unique listed entity
diff --git a/src/anonymizer/engine/schemas/rewrite.py b/src/anonymizer/engine/schemas/rewrite.py
index 912281dd..1dbf7ebf 100644
--- a/src/anonymizer/engine/schemas/rewrite.py
+++ b/src/anonymizer/engine/schemas/rewrite.py
@@ -139,6 +139,7 @@ class EntityDispositionSchema(BaseModel):
protection_reason: str = Field(min_length=10, max_length=500)
protection_method_suggestion: ProtectionMethod
combined_risk_level: CombinedRiskLevel
+ generalization_suggestion: str = Field(default="N/A", min_length=1)
@property
def needs_protection(self) -> bool:
diff --git a/tests/engine/test_parsers.py b/tests/engine/test_parsers.py
index 618a233c..d5afb919 100644
--- a/tests/engine/test_parsers.py
+++ b/tests/engine/test_parsers.py
@@ -178,6 +178,7 @@ def test_parse_sensitivity_disposition_from_dict() -> None:
"protection_reason": "Direct identifier that enables re-identification",
"protection_method_suggestion": "replace",
"combined_risk_level": "high",
+ "generalization_suggestion": "N/A",
}
]
}
@@ -189,6 +190,52 @@ def test_parse_sensitivity_disposition_invalid_type() -> None:
parse_sensitivity_disposition("bad")
+def test_parse_sensitivity_disposition_corrects_low_with_generalize() -> None:
+ """LLM sometimes emits combined_risk_level='low' + protection_method_suggestion='generalize'.
+ The parser should auto-correct to 'leave_as_is' rather than dropping the record."""
+ raw = {
+ "sensitivity_disposition": [
+ {
+ "id": 1,
+ "source": "tagged",
+ "category": "quasi_identifier",
+ "sensitivity": "low",
+ "entity_label": "nationality",
+ "entity_value": "French",
+ "protection_reason": "Nationality adds little narrowing in this context",
+ "protection_method_suggestion": "generalize",
+ "combined_risk_level": "low",
+ "generalization_suggestion": "a European nationality",
+ }
+ ]
+ }
+ result = parse_sensitivity_disposition(raw)
+ entity = result.sensitivity_disposition[0]
+ assert entity.protection_method_suggestion == "leave_as_is"
+ assert entity.generalization_suggestion == "N/A"
+
+
+def test_parse_sensitivity_disposition_corrects_low_with_suppress_inference() -> None:
+ raw = {
+ "sensitivity_disposition": [
+ {
+ "id": 1,
+ "source": "latent",
+ "category": "latent_identifier",
+ "sensitivity": "low",
+ "entity_label": "religion",
+ "entity_value": "Catholic",
+ "protection_reason": "Inferable but low combined risk",
+ "protection_method_suggestion": "suppress_inference",
+ "combined_risk_level": "low",
+ "generalization_suggestion": "N/A",
+ }
+ ]
+ }
+ result = parse_sensitivity_disposition(raw)
+ assert result.sensitivity_disposition[0].protection_method_suggestion == "leave_as_is"
+
+
def test_parse_sensitivity_disposition_normalizes_numpy_array_payload() -> None:
raw = {
"sensitivity_disposition": np.array(
@@ -203,6 +250,7 @@ def test_parse_sensitivity_disposition_normalizes_numpy_array_payload() -> None:
"protection_reason": "Direct identifier that enables re-identification",
"protection_method_suggestion": "replace",
"combined_risk_level": "high",
+ "generalization_suggestion": "N/A",
}
],
dtype=object,
diff --git a/tests/engine/test_qa_generation.py b/tests/engine/test_qa_generation.py
index 74831f9d..b7718688 100644
--- a/tests/engine/test_qa_generation.py
+++ b/tests/engine/test_qa_generation.py
@@ -11,6 +11,7 @@
from anonymizer.engine.constants import (
COL_DOMAIN,
COL_DOMAIN_SUPPLEMENT,
+ COL_LATENT_ENTITIES,
COL_MEANING_UNITS,
COL_MEANING_UNITS_SERIALIZED,
COL_PRIVACY_QA,
@@ -56,6 +57,7 @@
protection_reason="Full name directly identifies the individual.",
protection_method_suggestion=ProtectionMethod.replace,
combined_risk_level="high",
+ generalization_suggestion="N/A",
),
EntityDispositionSchema(
id=2,
@@ -67,6 +69,7 @@
protection_reason="City alone does not create meaningful re-identification risk here.",
protection_method_suggestion=ProtectionMethod.leave_as_is,
combined_risk_level="low",
+ generalization_suggestion="N/A",
),
]
)
@@ -121,25 +124,68 @@ def test_qa_generator_alias_used(
def test_format_disposition_block_produces_valid_json() -> None:
- row = {COL_SENSITIVITY_DISPOSITION: _STUB_DISPOSITION}
+ row = {COL_SENSITIVITY_DISPOSITION: _STUB_DISPOSITION, COL_LATENT_ENTITIES: {}}
result = _format_disposition_block(row)
block = json.loads(result[COL_SENSITIVITY_DISPOSITION_BLOCK])
assert len(block) == 2
assert block[0]["entity_value"] == "Alice"
assert block[0]["does_need_protection"] is True
assert block[0]["protection_method_suggestion"] == "replace"
+ assert "generalization_suggestion" not in block[0]
assert block[1]["entity_value"] == "Portland"
assert block[1]["does_need_protection"] is False
+ assert "generalization_suggestion" not in block[1]
def test_format_disposition_block_accepts_dict_payload() -> None:
- row = {COL_SENSITIVITY_DISPOSITION: _STUB_DISPOSITION.model_dump(mode="python")}
+ row = {COL_SENSITIVITY_DISPOSITION: _STUB_DISPOSITION.model_dump(mode="python"), COL_LATENT_ENTITIES: {}}
result = _format_disposition_block(row)
block = json.loads(result[COL_SENSITIVITY_DISPOSITION_BLOCK])
assert len(block) == 2
assert block[0]["entity_value"] == "Alice"
+def test_format_disposition_block_includes_evidence_for_suppress_inference() -> None:
+ disposition = SensitivityDispositionSchema(
+ sensitivity_disposition=[
+ EntityDispositionSchema(
+ id=1,
+ source=EntitySource.tagged,
+ category=EntityCategory.direct_identifier,
+ sensitivity=SensitivityLevel.high,
+ entity_label="detention_duration",
+ entity_value="approximately 29 months",
+ protection_reason="Distinctive detention period aids re-identification.",
+ protection_method_suggestion=ProtectionMethod.suppress_inference,
+ combined_risk_level="high",
+ generalization_suggestion="N/A",
+ )
+ ]
+ )
+ latent_entities = {
+ "latent_entities": [
+ {
+ "label": "detention_duration",
+ "value": "approximately 29 months",
+ "evidence": ["On 8 June 2004 he was arrested", "on 20 November 2006 he was released"],
+ }
+ ]
+ }
+ row = {COL_SENSITIVITY_DISPOSITION: disposition, COL_LATENT_ENTITIES: latent_entities}
+ result = _format_disposition_block(row)
+ block = json.loads(result[COL_SENSITIVITY_DISPOSITION_BLOCK])
+ assert block[0]["protection_method_suggestion"] == "suppress_inference"
+ assert block[0]["evidence"] == ["On 8 June 2004 he was arrested", "on 20 November 2006 he was released"]
+
+
+def test_format_disposition_block_omits_evidence_for_non_suppress_inference() -> None:
+ row = {COL_SENSITIVITY_DISPOSITION: _STUB_DISPOSITION, COL_LATENT_ENTITIES: {}}
+ result = _format_disposition_block(row)
+ block = json.loads(result[COL_SENSITIVITY_DISPOSITION_BLOCK])
+ assert "evidence" not in block[0]
+ assert "evidence" not in block[1]
+
+
def test_serialize_meaning_units_produces_valid_json() -> None:
row = {COL_MEANING_UNITS: _STUB_MEANING_UNITS}
result = _serialize_meaning_units(row)
@@ -190,6 +236,7 @@ def test_generate_privacy_qa_column_no_protected_entities() -> None:
protection_reason="City alone does not create meaningful re-identification risk.",
protection_method_suggestion=ProtectionMethod.leave_as_is,
combined_risk_level="low",
+ generalization_suggestion="N/A",
)
]
)
@@ -220,6 +267,7 @@ def test_generate_privacy_qa_from_disposition_empty_when_nothing_to_protect() ->
protection_reason="City alone does not create meaningful re-identification risk.",
protection_method_suggestion=ProtectionMethod.leave_as_is,
combined_risk_level="low",
+ generalization_suggestion="N/A",
)
]
)
@@ -239,6 +287,7 @@ def test_generate_privacy_qa_from_disposition_ids_are_sequential() -> None:
protection_reason="Direct identifier.",
protection_method_suggestion=ProtectionMethod.replace,
combined_risk_level="high",
+ generalization_suggestion="N/A",
),
EntityDispositionSchema(
id=2,
@@ -250,6 +299,7 @@ def test_generate_privacy_qa_from_disposition_ids_are_sequential() -> None:
protection_reason="Direct identifier.",
protection_method_suggestion=ProtectionMethod.replace,
combined_risk_level="high",
+ generalization_suggestion="N/A",
),
]
)
@@ -269,7 +319,8 @@ def test_meaning_unit_prompt_preserves_gitlab_protection_branches() -> None:
prompt = _get_meaning_unit_extraction_prompt()
assert "does_need_protection = True" in prompt
assert 'protection_method_suggestion is "replace" OR "remove"' in prompt
- assert 'protection_method_suggestion is "generalize" OR "suppress_inference"' in prompt
+ assert 'protection_method_suggestion is "generalize"' in prompt
+ assert 'protection_method_suggestion is "suppress_inference"' in prompt
assert "does_need_protection = False" in prompt
diff --git a/tests/engine/test_rewrite_generation.py b/tests/engine/test_rewrite_generation.py
index be32ad44..3a157fc3 100644
--- a/tests/engine/test_rewrite_generation.py
+++ b/tests/engine/test_rewrite_generation.py
@@ -55,6 +55,7 @@ def stub_sensitivity_disposition() -> dict:
"protection_reason": "Full name uniquely identifies the subject",
"protection_method_suggestion": "replace",
"combined_risk_level": "high",
+ "generalization_suggestion": "N/A",
}
]
}
@@ -93,6 +94,7 @@ def test_format_rewrite_disposition_block_excludes_unprotected_entities() -> Non
"protection_reason": "Not identifying alone",
"protection_method_suggestion": "leave_as_is",
"combined_risk_level": "low",
+ "generalization_suggestion": "N/A",
}
]
}
@@ -117,6 +119,39 @@ def test_format_rewrite_disposition_block_serializes_required_fields(
}
+def test_format_rewrite_disposition_block_includes_generalization_suggestion_for_generalize() -> None:
+ disposition = {
+ "sensitivity_disposition": [
+ {
+ "id": 1,
+ "source": "tagged",
+ "category": "quasi_identifier",
+ "sensitivity": "medium",
+ "entity_label": "city",
+ "entity_value": "Portland",
+ "protection_reason": "City combined with other quasi-identifiers enables re-identification",
+ "protection_method_suggestion": "generalize",
+ "combined_risk_level": "medium",
+ "generalization_suggestion": "a city in the Pacific Northwest",
+ }
+ ]
+ }
+ row = {COL_SENSITIVITY_DISPOSITION: disposition}
+ result = _format_rewrite_disposition_block(row)
+ block = result[COL_REWRITE_DISPOSITION_BLOCK]
+ assert len(block) == 1
+ assert block[0]["generalization_suggestion"] == "a city in the Pacific Northwest"
+
+
+def test_format_rewrite_disposition_block_omits_generalization_suggestion_for_non_generalize(
+ stub_sensitivity_disposition: dict,
+) -> None:
+ row = {COL_SENSITIVITY_DISPOSITION: stub_sensitivity_disposition}
+ result = _format_rewrite_disposition_block(row)
+ block = result[COL_REWRITE_DISPOSITION_BLOCK]
+ assert "generalization_suggestion" not in block[0]
+
+
def test_format_rewrite_disposition_block_empty_when_no_protected_entities() -> None:
disposition = {
"sensitivity_disposition": [
@@ -130,6 +165,7 @@ def test_format_rewrite_disposition_block_empty_when_no_protected_entities() ->
"protection_reason": "Not identifying alone",
"protection_method_suggestion": "leave_as_is",
"combined_risk_level": "low",
+ "generalization_suggestion": "N/A",
}
]
}
diff --git a/tests/engine/test_schemas.py b/tests/engine/test_schemas.py
index 3ceada56..a03c5299 100644
--- a/tests/engine/test_schemas.py
+++ b/tests/engine/test_schemas.py
@@ -218,6 +218,7 @@ def _make_entity(**kwargs) -> dict:
"protection_reason": "Direct identifier that uniquely identifies the individual.",
"protection_method_suggestion": "replace",
"combined_risk_level": "high",
+ "generalization_suggestion": "N/A",
}
return {**defaults, **kwargs}
@@ -346,6 +347,7 @@ def test_sensitivity_disposition_format_for_rewrite_context_includes_low_when_pr
protection_method_suggestion="generalize",
combined_risk_level="medium",
protection_reason="City combined with other quasi-identifiers enables re-identification",
+ generalization_suggestion="a city in the Pacific Northwest",
),
]
}
@@ -451,6 +453,7 @@ def _make_strict_entity(**kwargs) -> dict:
"protection_reason": "Direct identifier that uniquely identifies the individual.",
"protection_method_suggestion": "replace",
"combined_risk_level": "high",
+ "generalization_suggestion": "N/A",
}
return {**defaults, **kwargs}
diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py
index 8578763c..2d1ae4d9 100644
--- a/tests/interface/test_display.py
+++ b/tests/interface/test_display.py
@@ -583,6 +583,7 @@ def test_render_record_html_rewrite_mode_with_disposition() -> None:
protection_reason="Direct identifier that uniquely identifies the subject.",
protection_method_suggestion="replace",
combined_risk_level="high",
+ generalization_suggestion="N/A",
),
]
).model_dump()