Needs Review: "
+ metric_parts.append(
+ f"{label}: "
f"{badge_text}"
)
+ if metric_parts:
+ section_rows.append("" + "".join(metric_parts) + "
")
+
+ # --- Row 2: detection validity inline with flagged-entities dropdown ---
+ detection_valid = row.get(COL_DETECTION_VALID)
+ if COL_DETECTION_VALID in row.index and (detection_valid is None or pd.isna(detection_valid)):
+ section_rows.append(
+ ""
+ "Detection Validity: "
+ "Unavailable
"
+ )
+ elif detection_valid is not None and not pd.isna(detection_valid):
+ det_span = (
+ f"Detection Validity: {float(detection_valid):.2f}"
+ )
+ details_html = ""
+ if float(detection_valid) < 1.0:
+ invalid_entries = _normalize_invalid_entities(row.get(COL_DETECTION_INVALID_ENTITIES))
+ if invalid_entries:
+ rows_html: list[str] = []
+ for entry in invalid_entries:
+ value = html.escape(str(entry.get("value", "")))
+ label = html.escape(str(entry.get("label", "")))
+ reasoning = html.escape(str(entry.get("reasoning", "")))
+ _, border_color = _color_for_label(entry.get("label", ""))
+ rows_html.append(
+ ""
+ f"| {value} | "
+ f""
+ f"{label} | "
+ f"{reasoning} | "
+ "
"
+ )
+ details_html = (
+ ""
+ f"Show {len(invalid_entries)} flagged "
+ "detection(s)
"
+ ""
+ ""
+ "| Value | "
+ "Label | "
+ "Reason | "
+ "
"
+ f"{''.join(rows_html)}"
+ "
"
+ " "
+ )
+ section_rows.append(
+ ""
+ f"{det_span}{details_html}
"
+ )
+ # --- Row 3: judge scores with highlighted criterion names ---
judge_raw = row.get(COL_JUDGE_EVALUATION)
judge_scores = _extract_judge_scores(judge_raw)
if isinstance(judge_raw, dict) and not judge_scores:
@@ -446,30 +503,42 @@ def _render_scores_section(row: pd.Series) -> str:
"Judge evaluation present but produced no scores (unexpected shape: %s)", type(judge_raw).__name__
)
if judge_scores:
- score_strs = [f"{name}: {score}/10" for name, score in judge_scores]
- parts.append(f"Judge: {html.escape(', '.join(score_strs))}")
+ score_parts = [
+ f""
+ f"{html.escape(str(name))}: {html.escape(str(score))}"
+ for name, score in judge_scores
+ ]
+ section_rows.append(
+ ""
+ "
Judge
"
+ "
" + "".join(score_parts) + "
"
+ "
"
+ )
- if not parts:
+ if not section_rows:
return "No scores available.
"
- return "" + "".join(parts) + "
"
+ return "" + "".join(section_rows) + "
"
-def _extract_judge_scores(raw: object) -> list[tuple[str, int]]:
+
+def _extract_judge_scores(raw: object) -> list[tuple[str, int | str]]:
"""Extract (name, score) pairs from the judge evaluation column.
LLMJudgeColumnConfig output is a plain dict keyed by rubric name, each
- value carrying ``{"score": <int>, "reasoning": "..."}``.
+ value carrying ``{"score": <int | str>, "reasoning": "..."}``. Scores are
+ returned as-is — callers must not assume int (rewrite mode uses strings).
"""
if not isinstance(raw, dict):
return []
- result: list[tuple[str, int]] = []
+ result: list[tuple[str, int | str]] = []
for name, value in raw.items():
if not isinstance(value, dict) or "score" not in value:
continue
- try:
- result.append((str(name), int(value["score"])))
- except (ValueError, TypeError):
+ score = value["score"]
+ if score is None:
continue
+ result.append((str(name), score))
return result
diff --git a/src/anonymizer/interface/results.py b/src/anonymizer/interface/results.py
index fa97c983..900653a6 100644
--- a/src/anonymizer/interface/results.py
+++ b/src/anonymizer/interface/results.py
@@ -8,6 +8,7 @@
import pandas as pd
from anonymizer.config.replace_strategies import ReplaceMethod
+from anonymizer.config.rewrite import PrivacyGoal
from anonymizer.engine.ndd.adapter import FailedRecord
from anonymizer.interface.display import render_record_html
@@ -59,6 +60,10 @@ class AnonymizerResult(_DisplayMixin):
``run()`` / ``preview()``; consumed by ``evaluate()`` to dispatch the
right judges. ``None`` on results that were constructed by hand or
loaded from a pre-strategy-tracking format.
+ rewrite_config: The privacy goal that produced this result when rewrite
+ mode was used. Set by ``run()`` / ``preview()``; consumed by
+ ``evaluate()`` to dispatch the rewrite judges. Mutually exclusive
+ with ``replace_method``.
"""
dataframe: pd.DataFrame
@@ -66,6 +71,7 @@ class AnonymizerResult(_DisplayMixin):
resolved_text_column: str
failed_records: list[FailedRecord]
replace_method: ReplaceMethod | None = None
+ rewrite_config: PrivacyGoal | None = None
_display_cycle_index: int = field(default=0, init=False, repr=False)
def __repr__(self) -> str:
@@ -96,6 +102,9 @@ class PreviewResult(_DisplayMixin):
``preview()``; consumed by ``evaluate()`` to dispatch the right
judges. ``None`` on results that were constructed by hand or loaded
from a pre-strategy-tracking format.
+ rewrite_config: The privacy goal that produced this preview when rewrite
+ mode was used. Set by ``preview()``; consumed by ``evaluate()`` to
+ dispatch the rewrite judges. Mutually exclusive with ``replace_method``.
"""
dataframe: pd.DataFrame
@@ -104,6 +113,7 @@ class PreviewResult(_DisplayMixin):
failed_records: list[FailedRecord]
preview_num_records: int
replace_method: ReplaceMethod | None = None
+ rewrite_config: PrivacyGoal | None = None
_display_cycle_index: int = field(default=0, init=False, repr=False)
def __repr__(self) -> str:
diff --git a/tests/conftest.py b/tests/conftest.py
index 8d374c95..391fb7ab 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -104,13 +104,13 @@ def stub_slim_model_selection() -> ModelSelection:
rewriter="known",
evaluator="known",
repairer="known",
- judge="known",
),
evaluate=EvaluateModelSelection(
detection_validity_judge="known",
replace_type_fidelity_judge="known",
replace_relational_consistency_judge="known",
replace_attribute_fidelity_judge="known",
+ rewrite_judge="known",
),
)
diff --git a/tests/engine/test_final_judge.py b/tests/engine/test_final_judge.py
index a33cb7c5..fd8c3ce0 100644
--- a/tests/engine/test_final_judge.py
+++ b/tests/engine/test_final_judge.py
@@ -3,24 +3,20 @@
from __future__ import annotations
-import pytest
-from data_designer.config.column_configs import CustomColumnConfig, LLMJudgeColumnConfig
+from data_designer.config.column_configs import LLMJudgeColumnConfig
-from anonymizer.config.models import RewriteModelSelection
-from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal
+from anonymizer.config.models import EvaluateModelSelection
+from anonymizer.config.rewrite import PrivacyGoal
from anonymizer.engine.constants import (
- COL_ANY_HIGH_LEAKED,
COL_JUDGE_EVALUATION,
- COL_LEAKAGE_MASS,
- COL_NEEDS_HUMAN_REVIEW,
COL_REWRITTEN_TEXT,
COL_TEXT,
- COL_UTILITY_SCORE,
)
from anonymizer.engine.rewrite.final_judge import (
+ PRIVACY_RUBRIC,
+ QUALITY_RUBRIC,
+ STYLE_RUBRIC,
FinalJudgeWorkflow,
- HumanReviewParams,
- _determine_needs_human_review,
_judge_prompt,
)
@@ -29,8 +25,6 @@
preserve="General utility, content quality, and semantic meaning of the original text",
)
-_STUB_EVALUATION = EvaluationCriteria()
-
# ---------------------------------------------------------------------------
# Tests: _judge_prompt
@@ -66,164 +60,73 @@ def test_judge_prompt_references_required_columns() -> None:
# ---------------------------------------------------------------------------
-def test_columns_returns_two_configs(
- stub_rewrite_model_selection: RewriteModelSelection,
+def test_columns_returns_one_config(
+ stub_evaluate_model_selection: EvaluateModelSelection,
) -> None:
wf = FinalJudgeWorkflow()
cols = wf.columns(
- selected_models=stub_rewrite_model_selection,
+ selected_models=stub_evaluate_model_selection,
privacy_goal=_STUB_PRIVACY_GOAL,
- evaluation=_STUB_EVALUATION,
)
- assert len(cols) == 2
+ assert len(cols) == 1
-def test_judge_column_uses_judge_alias(
- stub_rewrite_model_selection: RewriteModelSelection,
+def test_judge_column_uses_rewrite_judge_alias(
+ stub_evaluate_model_selection: EvaluateModelSelection,
) -> None:
wf = FinalJudgeWorkflow()
cols = wf.columns(
- selected_models=stub_rewrite_model_selection,
+ selected_models=stub_evaluate_model_selection,
privacy_goal=_STUB_PRIVACY_GOAL,
- evaluation=_STUB_EVALUATION,
)
judge_cols = [c for c in cols if isinstance(c, LLMJudgeColumnConfig)]
assert len(judge_cols) == 1
- assert judge_cols[0].model_alias == stub_rewrite_model_selection.judge
+ assert judge_cols[0].model_alias == stub_evaluate_model_selection.rewrite_judge
def test_judge_column_has_three_rubrics(
- stub_rewrite_model_selection: RewriteModelSelection,
+ stub_evaluate_model_selection: EvaluateModelSelection,
) -> None:
wf = FinalJudgeWorkflow()
cols = wf.columns(
- selected_models=stub_rewrite_model_selection,
+ selected_models=stub_evaluate_model_selection,
privacy_goal=_STUB_PRIVACY_GOAL,
- evaluation=_STUB_EVALUATION,
)
judge_col = next(c for c in cols if isinstance(c, LLMJudgeColumnConfig))
assert judge_col.name == COL_JUDGE_EVALUATION
score_names = {s.name for s in judge_col.scores}
- assert score_names == {"privacy", "quality", "naturalness"}
- for score in judge_col.scores:
- assert 1 in score.options
- assert 10 in score.options
-
-
-def test_needs_human_review_column_present(
- stub_rewrite_model_selection: RewriteModelSelection,
-) -> None:
- wf = FinalJudgeWorkflow()
- cols = wf.columns(
- selected_models=stub_rewrite_model_selection,
- privacy_goal=_STUB_PRIVACY_GOAL,
- evaluation=_STUB_EVALUATION,
- )
- custom_cols = [c for c in cols if isinstance(c, CustomColumnConfig)]
- assert len(custom_cols) == 1
- assert custom_cols[0].name == COL_NEEDS_HUMAN_REVIEW
+ assert score_names == {"privacy", "quality", "style"}
-def test_needs_human_review_column_uses_evaluation_thresholds(
- stub_rewrite_model_selection: RewriteModelSelection,
+def test_judge_rubrics_use_categorical_scores(
+ stub_evaluate_model_selection: EvaluateModelSelection,
) -> None:
wf = FinalJudgeWorkflow()
- evaluation = EvaluationCriteria(risk_tolerance="minimal")
cols = wf.columns(
- selected_models=stub_rewrite_model_selection,
+ selected_models=stub_evaluate_model_selection,
privacy_goal=_STUB_PRIVACY_GOAL,
- evaluation=evaluation,
)
- custom_col = next(c for c in cols if isinstance(c, CustomColumnConfig))
- params = HumanReviewParams.model_validate(custom_col.generator_params)
- assert params.flag_utility_below == 0.6
- assert params.flag_leakage_above == 1.0
-
-
-# ---------------------------------------------------------------------------
-# Tests: _determine_needs_human_review
-# ---------------------------------------------------------------------------
-
-
-def _make_row(
- rewritten_text: str | None = "some rewritten text",
- utility_score: float = 0.8,
- leakage_mass: float = 0.5,
- any_high_leaked: bool = False,
-) -> dict:
- return {
- COL_REWRITTEN_TEXT: rewritten_text,
- COL_UTILITY_SCORE: utility_score,
- COL_LEAKAGE_MASS: leakage_mass,
- COL_ANY_HIGH_LEAKED: any_high_leaked,
- }
-
-
-def test_needs_human_review_flags_none_rewrite() -> None:
- row = _make_row(rewritten_text=None)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is True
-
-
-def test_needs_human_review_flags_low_utility() -> None:
- row = _make_row(utility_score=0.3)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is True
-
-
-def test_needs_human_review_flags_high_leakage() -> None:
- row = _make_row(leakage_mass=3.0)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is True
-
-
-def test_needs_human_review_flags_any_high_leaked() -> None:
- row = _make_row(any_high_leaked=True)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is True
-
-
-def test_needs_human_review_false_when_all_good() -> None:
- row = _make_row(utility_score=0.8, leakage_mass=0.5, any_high_leaked=False)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is False
-
-
-def test_needs_human_review_none_thresholds_skip_checks() -> None:
- row = _make_row(utility_score=0.1, leakage_mass=10.0)
- params = HumanReviewParams(flag_utility_below=None, flag_leakage_above=None)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is False
-
-
-def test_needs_human_review_exact_threshold_utility() -> None:
- row = _make_row(utility_score=0.50)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is False
+ judge_col = next(c for c in cols if isinstance(c, LLMJudgeColumnConfig))
+ for score in judge_col.scores:
+ assert "low" in score.options
+ assert "medium" in score.options
+ assert "high" in score.options
-def test_needs_human_review_exact_threshold_leakage() -> None:
- row = _make_row(leakage_mass=2.0)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- result = _determine_needs_human_review(row, generator_params=params)
- assert result[COL_NEEDS_HUMAN_REVIEW] is False
+def test_rubric_names_match_constants() -> None:
+ assert PRIVACY_RUBRIC.name == "privacy"
+ assert QUALITY_RUBRIC.name == "quality"
+ assert STYLE_RUBRIC.name == "style"
-def test_needs_human_review_raises_on_invalid_utility_score() -> None:
- row = _make_row(utility_score=None)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- with pytest.raises(TypeError):
- _determine_needs_human_review(row, generator_params=params)
+def test_judge_prompt_references_style_not_naturalness() -> None:
+ prompt = _judge_prompt(_STUB_PRIVACY_GOAL)
+ assert "style" in prompt.lower()
+ assert "naturalness" not in prompt.lower()
-def test_needs_human_review_raises_on_invalid_leakage_mass() -> None:
- row = _make_row(leakage_mass=None)
- params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0)
- with pytest.raises(TypeError):
- _determine_needs_human_review(row, generator_params=params)
+def test_judge_prompt_references_categorical_scale() -> None:
+ prompt = _judge_prompt(_STUB_PRIVACY_GOAL)
+ assert "high" in prompt
+ assert "medium" in prompt
+ assert "low" in prompt
diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py
index 3b754dd4..818082ed 100644
--- a/tests/engine/test_model_loader.py
+++ b/tests/engine/test_model_loader.py
@@ -154,7 +154,7 @@ def test_load_default_model_selection_populates_all_workflows() -> None:
assert selection.detection.latent_detector
# Replace
assert selection.replace.replacement_generator
- # Rewrite — all 8 roles must be populated
+ # Rewrite — all 7 roles must be populated
assert selection.rewrite.domain_classifier
assert selection.rewrite.disposition_analyzer
assert selection.rewrite.meaning_extractor
@@ -162,7 +162,8 @@ def test_load_default_model_selection_populates_all_workflows() -> None:
assert selection.rewrite.rewriter
assert selection.rewrite.evaluator
assert selection.rewrite.repairer
- assert selection.rewrite.judge
+ # Evaluate — includes rewrite_judge
+ assert selection.evaluate.rewrite_judge
def test_parse_model_configs_none_uses_defaults() -> None:
diff --git a/tests/engine/test_rewrite_workflow.py b/tests/engine/test_rewrite_workflow.py
index 33bba839..8b77db8d 100644
--- a/tests/engine/test_rewrite_workflow.py
+++ b/tests/engine/test_rewrite_workflow.py
@@ -13,6 +13,8 @@
from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal
from anonymizer.engine.constants import (
COL_ANY_HIGH_LEAKED,
+ COL_DETECTION_INVALID_ENTITIES,
+ COL_DETECTION_VALID,
COL_DOMAIN,
COL_ENTITIES_BY_VALUE,
COL_JUDGE_EVALUATION,
@@ -27,7 +29,7 @@
COL_WEIGHTED_LEAKAGE_RATE,
)
from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, WorkflowRunResult
-from anonymizer.engine.rewrite.rewrite_workflow import RewriteWorkflow
+from anonymizer.engine.rewrite.rewrite_workflow import RewriteWorkflow, _detection_valid_fraction
_REPLACE_PATCH = "anonymizer.engine.rewrite.rewrite_workflow.LlmReplaceWorkflow"
@@ -136,22 +138,23 @@ def stub_eval_df(stub_pipeline_df: pd.DataFrame) -> pd.DataFrame:
@pytest.fixture
def stub_judge_df(stub_eval_df: pd.DataFrame) -> pd.DataFrame:
+ """Fixture for evaluate() tests: stub_eval_df plus an empty COL_JUDGE_EVALUATION, the only judge-produced column."""
df = stub_eval_df.copy()
df[COL_JUDGE_EVALUATION] = None
- df[COL_NEEDS_HUMAN_REVIEW] = False
return df
def _standard_side_effect(
pipeline_df: pd.DataFrame,
eval_df: pd.DataFrame,
- judge_df: pd.DataFrame,
) -> list[WorkflowRunResult]:
- """Happy-path adapter side_effect: pipeline, evaluate, judge."""
+ """Happy-path adapter side_effect for run(): pipeline then evaluate.
+
+ The final-judge no longer runs inside run() — it only runs via evaluate().
+ """
return [
WorkflowRunResult(dataframe=pipeline_df, failed_records=[]),
WorkflowRunResult(dataframe=eval_df, failed_records=[]),
- WorkflowRunResult(dataframe=judge_df, failed_records=[]),
]
@@ -213,8 +216,8 @@ def test_passthrough_defaults_populated(
assert df[COL_WEIGHTED_LEAKAGE_RATE].tolist() == [0.0, 0.0]
assert df[COL_ANY_HIGH_LEAKED].tolist() == [False, False]
assert df[COL_NEEDS_HUMAN_REVIEW].tolist() == [False, False]
- assert df[COL_JUDGE_EVALUATION].tolist() == [None, None]
assert df[COL_REPAIR_ITERATIONS].tolist() == [0, 0]
+ assert COL_JUDGE_EVALUATION not in df.columns
def test_has_entities_returns_true_when_present(stub_entities_by_value_with_entities: dict) -> None:
@@ -248,10 +251,9 @@ def test_calls_sub_workflows_in_order(
stub_replace_df: pd.DataFrame,
stub_pipeline_df: pd.DataFrame,
stub_eval_df: pd.DataFrame,
- stub_judge_df: pd.DataFrame,
) -> None:
adapter = Mock()
- adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df)
+ adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df)
with patch(_REPLACE_PATCH) as mock_replace_cls:
_mock_replace(mock_replace_cls, stub_replace_df)
@@ -268,7 +270,7 @@ def test_calls_sub_workflows_in_order(
workflow_names = [call.kwargs["workflow_name"] for call in adapter.run_workflow.call_args_list]
assert workflow_names[0] == "rewrite-pipeline"
assert workflow_names[1].startswith("rewrite-evaluate")
- assert workflow_names[-1] == "rewrite-final-judge"
+ assert "rewrite-final-judge" not in workflow_names
assert len(result.dataframe) == 1
@@ -286,17 +288,14 @@ def test_failed_records_accumulated_across_steps(
stub_replace_df: pd.DataFrame,
stub_pipeline_df: pd.DataFrame,
stub_eval_df: pd.DataFrame,
- stub_judge_df: pd.DataFrame,
) -> None:
failed_pipeline = FailedRecord(record_id="a", step="rewrite-pipeline", reason="timeout")
failed_eval = FailedRecord(record_id="b", step="rewrite-evaluate-0", reason="timeout")
- failed_judge = FailedRecord(record_id="c", step="rewrite-final-judge", reason="timeout")
adapter = Mock()
adapter.run_workflow.side_effect = [
WorkflowRunResult(dataframe=stub_pipeline_df, failed_records=[failed_pipeline]),
WorkflowRunResult(dataframe=stub_eval_df, failed_records=[failed_eval]),
- WorkflowRunResult(dataframe=stub_judge_df, failed_records=[failed_judge]),
]
with patch(_REPLACE_PATCH) as mock_replace_cls:
@@ -315,7 +314,7 @@ def test_failed_records_accumulated_across_steps(
)
record_ids = {f.record_id for f in result.failed_records}
- assert record_ids == {"a", "b", "c", "d"}
+ assert record_ids == {"a", "b", "d"}
# ---------------------------------------------------------------------------
@@ -325,96 +324,77 @@ def test_failed_records_accumulated_across_steps(
def test_judge_failure_does_not_propagate(
stub_model_configs: list[ModelConfig],
- stub_rewrite_model_selection: RewriteModelSelection,
- stub_replace_model_selection: ReplaceModelSelection,
- stub_df_with_entities: pd.DataFrame,
- stub_replace_df: pd.DataFrame,
- stub_pipeline_df: pd.DataFrame,
+ stub_evaluate_model_selection,
stub_eval_df: pd.DataFrame,
) -> None:
+ """evaluate() holistic judge failure is non-fatal; rows get COL_JUDGE_EVALUATION=None."""
adapter = Mock()
- adapter.run_workflow.side_effect = [
- WorkflowRunResult(dataframe=stub_pipeline_df, failed_records=[]),
- WorkflowRunResult(dataframe=stub_eval_df, failed_records=[]),
- RuntimeError("Judge LLM unavailable"),
- ]
- with patch(_REPLACE_PATCH) as mock_replace_cls:
- _mock_replace(mock_replace_cls, stub_replace_df)
- wf = RewriteWorkflow(adapter=adapter)
- result = wf.run(
- stub_df_with_entities,
- model_configs=stub_model_configs,
- selected_models=stub_rewrite_model_selection,
- replace_model_selection=stub_replace_model_selection,
- privacy_goal=_PRIVACY_GOAL,
- evaluation=_EVALUATION,
- )
+ wf = RewriteWorkflow(adapter=adapter)
+ # Mock detection judge to return successfully
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=stub_eval_df.copy(), failed_records=[])
+ # Make the holistic judge adapter call raise
+ adapter.run_workflow.side_effect = RuntimeError("Judge LLM unavailable")
+
+ result = wf.evaluate(
+ stub_eval_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
assert len(result.dataframe) == 1
- assert result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0]
assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is None
def test_judge_partial_row_loss_preserves_all_rows(
stub_model_configs: list[ModelConfig],
- stub_rewrite_model_selection: RewriteModelSelection,
- stub_replace_model_selection: ReplaceModelSelection,
+ stub_evaluate_model_selection,
stub_df_two_entities: pd.DataFrame,
) -> None:
- """Judge drops 1 of 2 rows -- surviving row gets scores, missing row gets defaults."""
- df = stub_df_two_entities
+ """evaluate() judge drops 1 of 2 rows — surviving row gets scores, missing row gets None."""
+ df = stub_df_two_entities.copy()
+ df["_anonymizer_record_id"] = ["rec-0", "rec-1"]
+
+ # Build a run()-style result dataframe with all required columns
+ run_result_df = df.copy()
+ run_result_df[COL_REWRITTEN_TEXT] = ["Maria works here", "Rob works there"]
+ run_result_df[COL_NEEDS_REPAIR] = False
+ run_result_df[COL_UTILITY_SCORE] = [0.9, 0.8]
+ run_result_df[COL_LEAKAGE_MASS] = [0.1, 0.2]
+ run_result_df[COL_ANY_HIGH_LEAKED] = False
+ run_result_df[COL_NEEDS_HUMAN_REVIEW] = False
+ run_result_df[COL_REPAIR_ITERATIONS] = 0
adapter = Mock()
- pre_gen_df = df.copy()
- pre_gen_df[COL_DOMAIN] = "BIOGRAPHY_PROFILE"
- pre_gen_df["_anonymizer_row_order"] = [0, 1]
- pre_gen_df["_anonymizer_record_id"] = ["rec-0", "rec-1"]
-
- rewrite_gen_df = pre_gen_df.copy()
- rewrite_gen_df[COL_REWRITTEN_TEXT] = ["Maria works here", "Rob works there"]
- rewrite_gen_df[COL_REPAIR_ITERATIONS] = 0
-
- eval_df = rewrite_gen_df.copy()
- eval_df[COL_NEEDS_REPAIR] = False
- eval_df[COL_UTILITY_SCORE] = [0.9, 0.8]
- eval_df[COL_LEAKAGE_MASS] = [0.1, 0.2]
- eval_df[COL_ANY_HIGH_LEAKED] = False
-
- judge_df = eval_df.iloc[[0]].copy().reset_index(drop=True)
- judge_df[COL_JUDGE_EVALUATION] = [{"privacy": {"score": 8}, "quality": {"score": 9}, "naturalness": {"score": 7}}]
- judge_df[COL_NEEDS_HUMAN_REVIEW] = False
-
- replace_df = df.copy()
- replace_df["_replacement_map"] = [{"replacements": []}, {"replacements": []}]
-
- adapter.run_workflow.side_effect = [
- WorkflowRunResult(dataframe=rewrite_gen_df, failed_records=[]),
- WorkflowRunResult(dataframe=eval_df, failed_records=[]),
- WorkflowRunResult(
- dataframe=judge_df,
- failed_records=[FailedRecord(record_id="rec-1", step="rewrite-final-judge", reason="timeout")],
- ),
+ # detection judge returns both rows
+ det_df = run_result_df.copy()
+ # holistic judge returns only first row
+ judge_df = run_result_df.iloc[[0]].copy().reset_index(drop=True)
+ judge_df[COL_JUDGE_EVALUATION] = [
+ {"privacy": {"score": "high"}, "quality": {"score": "high"}, "style": {"score": "medium"}}
]
- with patch(_REPLACE_PATCH) as mock_replace_cls:
- _mock_replace(mock_replace_cls, replace_df)
- wf = RewriteWorkflow(adapter=adapter)
- result = wf.run(
- df,
- model_configs=stub_model_configs,
- selected_models=stub_rewrite_model_selection,
- replace_model_selection=stub_replace_model_selection,
- privacy_goal=_PRIVACY_GOAL,
- evaluation=_EVALUATION,
- )
+ wf = RewriteWorkflow(adapter=adapter)
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+ adapter.run_workflow.return_value = WorkflowRunResult(
+ dataframe=judge_df,
+ failed_records=[FailedRecord(record_id="rec-1", step="rewrite-final-judge", reason="timeout")],
+ )
+
+ result = wf.evaluate(
+ run_result_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
assert len(result.dataframe) == 2
assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is not None
- assert not result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0]
assert result.dataframe[COL_JUDGE_EVALUATION].iloc[1] is None
- assert result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[1]
# ---------------------------------------------------------------------------
@@ -430,10 +410,9 @@ def test_repair_loop_exits_early_when_no_rows_need_repair(
stub_replace_df: pd.DataFrame,
stub_pipeline_df: pd.DataFrame,
stub_eval_df: pd.DataFrame,
- stub_judge_df: pd.DataFrame,
) -> None:
adapter = Mock()
- adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df)
+ adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df)
with patch(_REPLACE_PATCH) as mock_replace_cls:
_mock_replace(mock_replace_cls, stub_replace_df)
@@ -662,10 +641,9 @@ def test_zero_max_repair_iterations_still_evaluates(
stub_replace_df: pd.DataFrame,
stub_pipeline_df: pd.DataFrame,
stub_eval_df: pd.DataFrame,
- stub_judge_df: pd.DataFrame,
) -> None:
adapter = Mock()
- adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df)
+ adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df)
with patch(_REPLACE_PATCH) as mock_replace_cls:
_mock_replace(mock_replace_cls, stub_replace_df)
@@ -887,3 +865,233 @@ def test_passthrough_rows_get_defaults(
assert df[COL_UTILITY_SCORE].iloc[1] == 1.0
assert df[COL_LEAKAGE_MASS].iloc[1] == 0.0
assert not df[COL_NEEDS_HUMAN_REVIEW].iloc[1]
+
+
+# ---------------------------------------------------------------------------
+# Tests: evaluate() — happy path and column presence
+# ---------------------------------------------------------------------------
+
+
+def test_evaluate_produces_judge_evaluation_column(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection,
+ stub_eval_df: pd.DataFrame,
+) -> None:
+ adapter = Mock()
+ wf = RewriteWorkflow(adapter=adapter)
+
+ det_df = stub_eval_df.copy()
+ det_df[COL_DETECTION_VALID] = True
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+
+ judge_df = stub_eval_df.copy()
+ judge_df[COL_JUDGE_EVALUATION] = [
+ {"privacy": {"score": "high"}, "quality": {"score": "high"}, "style": {"score": "medium"}}
+ ]
+ adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[])
+
+ result = wf.evaluate(
+ stub_eval_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
+
+ assert COL_JUDGE_EVALUATION in result.dataframe.columns
+ assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is not None
+
+
+def test_evaluate_produces_detection_valid_column(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection,
+ stub_eval_df: pd.DataFrame,
+) -> None:
+ adapter = Mock()
+ wf = RewriteWorkflow(adapter=adapter)
+
+ det_df = stub_eval_df.copy()
+ det_df[COL_DETECTION_VALID] = True
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+
+ judge_df = stub_eval_df.copy()
+ judge_df[COL_JUDGE_EVALUATION] = [None]
+ adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[])
+
+ result = wf.evaluate(
+ stub_eval_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
+
+ assert COL_DETECTION_VALID in result.dataframe.columns
+
+
+# ---------------------------------------------------------------------------
+# Tests: evaluate() — passthrough rows
+# ---------------------------------------------------------------------------
+
+
+def test_evaluate_skips_passthrough_rows(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection,
+ stub_eval_df: pd.DataFrame,
+) -> None:
+ """evaluate() must only send entity rows to the judges, not passthrough rows."""
+ passthrough_row = stub_eval_df.iloc[0].to_dict()
+ passthrough_row[COL_ENTITIES_BY_VALUE] = {"entities_by_value": []}
+ mixed_df = pd.concat([stub_eval_df, pd.DataFrame([passthrough_row])], ignore_index=True)
+
+ adapter = Mock()
+ wf = RewriteWorkflow(adapter=adapter)
+
+ det_df = stub_eval_df.copy()
+ det_df[COL_DETECTION_VALID] = True
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+
+ judge_df = stub_eval_df.copy()
+ judge_df[COL_JUDGE_EVALUATION] = [None]
+ adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[])
+
+ result = wf.evaluate(
+ mixed_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
+
+ detection_call_df = wf._detection_judge_wf.evaluate.call_args.args[0]
+ assert len(detection_call_df) == 1
+ assert len(result.dataframe) == 2
+
+
+def test_evaluate_passthrough_rows_get_none_judge_defaults(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection,
+ stub_eval_df: pd.DataFrame,
+) -> None:
+ """Passthrough rows must have COL_JUDGE_EVALUATION=None and COL_DETECTION_VALID=1.0 (trivially valid)."""
+ passthrough_row = stub_eval_df.iloc[0].to_dict()
+ passthrough_row[COL_ENTITIES_BY_VALUE] = {"entities_by_value": []}
+ mixed_df = pd.concat([stub_eval_df, pd.DataFrame([passthrough_row])], ignore_index=True)
+
+ adapter = Mock()
+ wf = RewriteWorkflow(adapter=adapter)
+
+ det_df = stub_eval_df.copy()
+ det_df[COL_DETECTION_VALID] = True
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+
+ judge_df = stub_eval_df.copy()
+ judge_df[COL_JUDGE_EVALUATION] = [None]
+ adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[])
+
+ result = wf.evaluate(
+ mixed_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
+
+ passthrough_result = result.dataframe[
+ result.dataframe[COL_ENTITIES_BY_VALUE].apply(lambda x: len(x.get("entities_by_value", [])) == 0)
+ ]
+ assert passthrough_result[COL_JUDGE_EVALUATION].iloc[0] is None
+ assert passthrough_result[COL_DETECTION_VALID].iloc[0] == 1.0
+
+
+# ---------------------------------------------------------------------------
+# Tests: needs_human_review not overwritten by evaluate()
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# Tests: _detection_valid_fraction
+# ---------------------------------------------------------------------------
+
+
+def test_detection_valid_fraction_returns_none_when_valid_is_none() -> None:
+ row = pd.Series({COL_DETECTION_VALID: None, COL_DETECTION_INVALID_ENTITIES: [], COL_ENTITIES_BY_VALUE: {}})
+ assert _detection_valid_fraction(row) is None
+
+
+def test_detection_valid_fraction_returns_1_when_all_valid() -> None:
+ row = pd.Series(
+ {
+ COL_DETECTION_VALID: True,
+ COL_DETECTION_INVALID_ENTITIES: [],
+ COL_ENTITIES_BY_VALUE: {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]},
+ }
+ )
+ assert _detection_valid_fraction(row) == 1.0
+
+
+def test_detection_valid_fraction_computes_correct_fraction() -> None:
+ entities = {"entities_by_value": [{"value": "Alice", "labels": ["first_name", "full_name"]}]}
+ row = pd.Series(
+ {
+ COL_DETECTION_VALID: False,
+ COL_DETECTION_INVALID_ENTITIES: [{"value": "Alice", "label": "full_name", "reasoning": "wrong label"}],
+ COL_ENTITIES_BY_VALUE: entities,
+ }
+ )
+ result = _detection_valid_fraction(row)
+ assert result == pytest.approx(0.5)
+
+
+def test_detection_valid_fraction_returns_none_on_parse_failure() -> None:
+ row = pd.Series(
+ {
+ COL_DETECTION_VALID: False,
+ COL_DETECTION_INVALID_ENTITIES: [{"value": "x", "label": "y", "reasoning": "z"}],
+ COL_ENTITIES_BY_VALUE: "not a valid schema payload <<<",
+ }
+ )
+ assert _detection_valid_fraction(row) is None
+
+
+def test_detection_valid_fraction_returns_none_when_total_is_zero_and_valid_false() -> None:
+ """valid=False with an empty entity list — judge flagged invalid but no entities found."""
+ row = pd.Series(
+ {
+ COL_DETECTION_VALID: False,
+ COL_DETECTION_INVALID_ENTITIES: [],
+ COL_ENTITIES_BY_VALUE: {"entities_by_value": []},
+ }
+ )
+ assert _detection_valid_fraction(row) is None
+
+
+def test_run_needs_human_review_not_overwritten_by_evaluate(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection,
+ stub_eval_df: pd.DataFrame,
+) -> None:
+ """COL_NEEDS_HUMAN_REVIEW set during run() must not be modified by evaluate()."""
+ run_df = stub_eval_df.copy()
+ run_df[COL_NEEDS_HUMAN_REVIEW] = True
+
+ adapter = Mock()
+ wf = RewriteWorkflow(adapter=adapter)
+
+ det_df = run_df.copy()
+ det_df[COL_DETECTION_VALID] = True
+ wf._detection_judge_wf = Mock()
+ wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[])
+
+ judge_df = run_df.copy()
+ judge_df[COL_JUDGE_EVALUATION] = [{"privacy": {"score": "high"}}]
+ adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[])
+
+ result = wf.evaluate(
+ run_df,
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ privacy_goal=_PRIVACY_GOAL,
+ )
+
+ assert bool(result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0]) is True
diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py
index f892285c..a0db7308 100644
--- a/tests/interface/test_anonymizer_interface.py
+++ b/tests/interface/test_anonymizer_interface.py
@@ -16,7 +16,9 @@
from anonymizer.config.replace_strategies import Redact, Substitute
from anonymizer.engine.constants import (
COL_DETECTED_ENTITIES,
+ COL_DETECTION_VALID,
COL_FINAL_ENTITIES,
+ COL_JUDGE_EVALUATION,
COL_REPLACED_TEXT,
COL_REPLACEMENT_MAP,
COL_REWRITTEN_TEXT,
@@ -25,6 +27,7 @@
)
from anonymizer.engine.detection.detection_workflow import EntityDetectionResult, EntityDetectionWorkflow
from anonymizer.engine.ndd.adapter import FailedRecord
+from anonymizer.engine.ndd.model_loader import validate_model_alias_references
from anonymizer.engine.replace.replace_runner import ReplacementResult, ReplacementWorkflow
from anonymizer.engine.rewrite.rewrite_workflow import RewriteResult, RewriteWorkflow
from anonymizer.interface.anonymizer import Anonymizer, _resolve_model_providers
@@ -705,3 +708,134 @@ def test_evaluate_raises_value_error_on_legacy_result_without_replace_method() -
with pytest.raises(ValueError, match="replace_method"):
anonymizer.evaluate(legacy_result) # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Tests: Anonymizer.evaluate() for rewrite results
+# ---------------------------------------------------------------------------
+
+
+def test_run_rewrite_does_not_include_judge_in_user_dataframe(stub_input: AnonymizerInput) -> None:
+    """COL_JUDGE_EVALUATION must be absent from run() output; the column is expected only after evaluate()."""
+    rewrite_config = AnonymizerConfig(rewrite=Rewrite())
+    anonymizer, _, _, _ = _make_anonymizer()
+
+    run_output = anonymizer.run(config=rewrite_config, data=stub_input)
+    output_columns = run_output.dataframe.columns
+
+    assert COL_JUDGE_EVALUATION not in output_columns
+
+
+def test_evaluate_rewrite_result_adds_judge_columns(stub_input: AnonymizerInput) -> None:
+    """Evaluating a rewrite result must produce a dataframe that carries COL_JUDGE_EVALUATION."""
+    rewrite_config = AnonymizerConfig(rewrite=Rewrite())
+    anonymizer, _, _, rewrite_runner = _make_anonymizer()
+
+    rewrite_result = anonymizer.run(config=rewrite_config, data=stub_input)
+
+    # Single-row frame the stubbed rewrite runner returns from evaluate().
+    evaluated_columns = {
+        COL_TEXT: ["Alice works at Acme"],
+        COL_REWRITTEN_TEXT: ["Beth works at Globex"],
+        "utility_score": [0.85],
+        "leakage_mass": [0.3],
+        "weighted_leakage_rate": [0.23],
+        "any_high_leaked": [False],
+        "needs_human_review": [False],
+        COL_JUDGE_EVALUATION: [{"privacy": {"score": "high"}}],
+        COL_DETECTION_VALID: [1.0],
+    }
+    rewrite_runner.evaluate.return_value = RewriteResult(
+        dataframe=pd.DataFrame(evaluated_columns),
+        failed_records=[],
+    )
+
+    evaluated = anonymizer.evaluate(rewrite_result)
+
+    assert COL_JUDGE_EVALUATION in evaluated.dataframe.columns
+
+
+def test_evaluate_rewrite_result_adds_detection_valid(stub_input: AnonymizerInput) -> None:
+    """Evaluating a rewrite result must produce a dataframe that carries COL_DETECTION_VALID."""
+    rewrite_config = AnonymizerConfig(rewrite=Rewrite())
+    anonymizer, _, _, rewrite_runner = _make_anonymizer()
+
+    rewrite_result = anonymizer.run(config=rewrite_config, data=stub_input)
+
+    # Single-row frame the stubbed rewrite runner returns from evaluate();
+    # the judge column is present but empty, detection validity is 0.9.
+    evaluated_columns = {
+        COL_TEXT: ["Alice works at Acme"],
+        COL_REWRITTEN_TEXT: ["Beth works at Globex"],
+        "utility_score": [0.85],
+        "leakage_mass": [0.3],
+        "weighted_leakage_rate": [0.23],
+        "any_high_leaked": [False],
+        "needs_human_review": [False],
+        COL_JUDGE_EVALUATION: [None],
+        COL_DETECTION_VALID: [0.9],
+    }
+    rewrite_runner.evaluate.return_value = RewriteResult(
+        dataframe=pd.DataFrame(evaluated_columns),
+        failed_records=[],
+    )
+
+    evaluated = anonymizer.evaluate(rewrite_result)
+
+    assert COL_DETECTION_VALID in evaluated.dataframe.columns
+
+
+def test_evaluate_rewrite_raises_without_rewrite_config() -> None:
+    """A result with ``rewrite_config=None`` and no ``replace_method`` attribute is rejected with ValueError."""
+    anonymizer, _, _, _ = _make_anonymizer()
+    result_fields = {
+        "dataframe": pd.DataFrame(),
+        "trace_dataframe": pd.DataFrame(),
+        "resolved_text_column": "text",
+        "rewrite_config": None,
+    }
+    unconfigured_result = SimpleNamespace(**result_fields)
+
+    with pytest.raises(ValueError):
+        anonymizer.evaluate(unconfigured_result)  # type: ignore[arg-type]
+
+
+def test_evaluate_rewrite_calls_validate_with_check_rewrite_false(stub_input: AnonymizerInput) -> None:
+    """Pin the validation flags evaluate() passes for a rewrite result.
+
+    Every ``validate_model_alias_references`` call made with ``check_evaluate=True``
+    must also carry ``check_rewrite=False`` and ``check_rewrite_evaluate=True``.
+    ``check_rewrite=True`` would demand rewrite-pipeline aliases (domain-classifier /
+    rewrite-generator) that post-hoc evaluation does not use, which would block
+    users who only configured evaluation.
+    """
+    from unittest.mock import patch as _patch
+
+    rewrite_config = AnonymizerConfig(rewrite=Rewrite())
+    anonymizer, _, _, rewrite_runner = _make_anonymizer()
+
+    rewrite_result = anonymizer.run(config=rewrite_config, data=stub_input)
+
+    # Single-row frame the stubbed rewrite runner returns from evaluate().
+    evaluated_columns = {
+        COL_TEXT: ["Alice works at Acme"],
+        COL_REWRITTEN_TEXT: ["Beth works at Globex"],
+        "utility_score": [0.85],
+        "leakage_mass": [0.3],
+        "weighted_leakage_rate": [0.23],
+        "any_high_leaked": [False],
+        "needs_human_review": [False],
+        COL_JUDGE_EVALUATION: [None],
+        COL_DETECTION_VALID: [1.0],
+    }
+    rewrite_runner.evaluate.return_value = RewriteResult(
+        dataframe=pd.DataFrame(evaluated_columns),
+        failed_records=[],
+    )
+
+    # wraps= keeps the real validator running while its calls are recorded.
+    patch_target = "anonymizer.interface.anonymizer.validate_model_alias_references"
+    with _patch(patch_target, wraps=validate_model_alias_references) as validate_spy:
+        anonymizer.evaluate(rewrite_result)
+
+    evaluate_calls = [
+        recorded for recorded in validate_spy.call_args_list if recorded.kwargs.get("check_evaluate") is True
+    ]
+    assert evaluate_calls, "validate_model_alias_references was not called with check_evaluate=True"
+
+    for recorded in evaluate_calls:
+        flags = recorded.kwargs
+        assert flags.get("check_rewrite") is False, (
+            "evaluate() on a rewrite result must pass check_rewrite=False to avoid "
+            "requiring rewrite pipeline model aliases that are unused during evaluation"
+        )
+        assert flags.get("check_rewrite_evaluate") is True, (
+            "evaluate() on a rewrite result must pass check_rewrite_evaluate=True "
+            "so that an invalid rewrite_judge alias is caught at validation time"
+        )
diff --git a/tests/interface/test_anonymizer_telemetry.py b/tests/interface/test_anonymizer_telemetry.py
index 9e4c2b05..a4864866 100644
--- a/tests/interface/test_anonymizer_telemetry.py
+++ b/tests/interface/test_anonymizer_telemetry.py
@@ -272,10 +272,11 @@ def test_rewrite_populates_rewrite_models(
event = captured_events[0]
assert event.transformation_type == "rewrite"
assert event.rewriter_model != NOT_APPLICABLE
- assert event.judge_model != NOT_APPLICABLE
assert event.repairer_model != NOT_APPLICABLE
assert event.max_repair_iterations == 2
assert event.strict_entity_protection is True
+ # judge runs in evaluate(), not run() — stays not_applicable here
+ assert event.judge_model == NOT_APPLICABLE
# Substitute-only field stays not_applicable
assert event.replacement_generator_model == NOT_APPLICABLE
diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py
index fddc6028..6242fc0b 100644
--- a/tests/interface/test_display.py
+++ b/tests/interface/test_display.py
@@ -11,16 +11,18 @@
from anonymizer.engine.constants import (
COL_DETECTED_ENTITIES,
+ COL_DETECTION_VALID,
COL_FINAL_ENTITIES,
COL_JUDGE_EVALUATION,
COL_REPLACEMENT_MAP,
COL_SENSITIVITY_DISPOSITION,
)
-from anonymizer.engine.rewrite.final_judge import NATURALNESS_RUBRIC, PRIVACY_RUBRIC, QUALITY_RUBRIC
+from anonymizer.engine.rewrite.final_judge import PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC
from anonymizer.engine.schemas import EntitiesSchema, EntitySchema
from anonymizer.engine.schemas.rewrite import EntityDispositionSchema, SensitivityDispositionSchema
from anonymizer.interface.display import (
_build_replaced_entities,
+ _extract_judge_scores,
_normalize_replacement_map,
_render_highlighted_text,
_verdict_badge,
@@ -548,9 +550,9 @@ def test_render_record_html_rewrite_mode_shows_rewrite_layout() -> None:
def test_render_record_html_rewrite_mode_with_judge_scores() -> None:
# Derive keys from the actual rubric configs so test↔runtime drift is impossible.
judge_eval = {
- PRIVACY_RUBRIC.name: {"score": 8, "reasoning": "good privacy"},
- QUALITY_RUBRIC.name: {"score": 9, "reasoning": "high quality"},
- NATURALNESS_RUBRIC.name: {"score": 7, "reasoning": "mostly natural"},
+ PRIVACY_RUBRIC.name: {"score": "high", "reasoning": "good privacy"},
+ QUALITY_RUBRIC.name: {"score": "high", "reasoning": "high quality"},
+ STYLE_RUBRIC.name: {"score": "medium", "reasoning": "mostly natural"},
}
row = pd.Series(
{
@@ -564,9 +566,9 @@ def test_render_record_html_rewrite_mode_with_judge_scores() -> None:
}
)
result = render_record_html(row, record_index=0)
- assert f"{PRIVACY_RUBRIC.name}: 8/10" in result
- assert f"{QUALITY_RUBRIC.name}: 9/10" in result
- assert f"{NATURALNESS_RUBRIC.name}: 7/10" in result
+ assert f"{PRIVACY_RUBRIC.name}: high" in result
+ assert f"{QUALITY_RUBRIC.name}: high" in result
+ assert f"{STYLE_RUBRIC.name}: medium" in result
def test_render_record_html_rewrite_mode_nan_judge_column_does_not_warn(
@@ -665,3 +667,117 @@ def test_render_record_html_replace_mode_unchanged_when_no_rewritten_column() ->
assert "Replacement Map" in result
assert "Rewritten" not in result
assert "Scores" not in result
+
+
+# ---------------------------------------------------------------------------
+# Tests: _extract_judge_scores
+# ---------------------------------------------------------------------------
+
+
+def test_extract_judge_scores_returns_string_scores() -> None:
+    """String scores come back as (criterion, score) pairs, in the order the criteria were given."""
+    expected_pairs = [("privacy", "high"), ("quality", "medium"), ("style", "low")]
+    reasonings = ("good", "ok", "rough")
+    judge_payload = {
+        criterion: {"score": score, "reasoning": reasoning}
+        for (criterion, score), reasoning in zip(expected_pairs, reasonings)
+    }
+    assert _extract_judge_scores(judge_payload) == expected_pairs
+
+
+def test_extract_judge_scores_categorical_not_silently_empty() -> None:
+    """A string score must survive extraction instead of being dropped (old int() cast raised ValueError)."""
+    scores = _extract_judge_scores({"privacy": {"score": "high", "reasoning": "..."}})
+    assert len(scores) == 1
+    only_entry = scores[0]
+    assert only_entry == ("privacy", "high")
+
+
+# ---------------------------------------------------------------------------
+# Tests: detection_valid and label rendering
+# ---------------------------------------------------------------------------
+
+
+def test_detection_valid_rendered_in_main_scores_section() -> None:
+    """With COL_DETECTION_VALID = 0.75 the rendered HTML contains both the label and the value."""
+    record_fields = {
+        "text": "Alice works at Acme",
+        "text_rewritten": "Beth works at Globex",
+        COL_DETECTED_ENTITIES: {"entities": []},
+        "utility_score": 0.9,
+        "leakage_mass": 0.1,
+        "needs_human_review": False,
+        COL_DETECTION_VALID: 0.75,
+    }
+    rendered = render_record_html(pd.Series(record_fields), record_index=0)
+    for expected_fragment in ("Detection Validity", "0.75"):
+        assert expected_fragment in rendered
+
+
+def test_render_record_html_rewrite_mode_detection_valid_none_shows_unavailable() -> None:
+    """When the COL_DETECTION_VALID column exists but holds None, render 'Unavailable', not a score.
+
+    Checks that the label and the word 'Unavailable' are both present and that
+    no number directly follows the label once markup is skipped.
+    """
+    import re
+
+    row = pd.Series(
+        {
+            "text": "Alice works at Acme",
+            "text_rewritten": "Beth works at Globex",
+            COL_DETECTED_ENTITIES: {"entities": []},
+            "utility_score": 0.9,
+            "leakage_mass": 0.1,
+            "needs_human_review": False,
+            COL_DETECTION_VALID: None,
+        }
+    )
+    result = render_record_html(row, record_index=0)
+    assert "Detection Validity" in result
+    assert "Unavailable" in result
+    # str.split("") raises ValueError (empty separator), so the text after the
+    # label cannot be isolated that way. Instead skip whitespace, colons and
+    # tags after the label and require that what follows is not a digit.
+    # Skipping whole tags keeps numbers inside style attributes from matching.
+    assert re.search(r"Detection Validity(?:\s|:|<[^>]*>)*\d", result) is None
+
+
+def test_render_record_html_rewrite_mode_detection_valid_nan_shows_unavailable() -> None:
+    """A NaN in COL_DETECTION_VALID is rendered as 'Unavailable' under the 'Detection Validity' label."""
+    record_fields = {
+        "text": "Alice works at Acme",
+        "text_rewritten": "Beth works at Globex",
+        COL_DETECTED_ENTITIES: {"entities": []},
+        "utility_score": 0.9,
+        "leakage_mass": 0.1,
+        "needs_human_review": False,
+        COL_DETECTION_VALID: np.nan,
+    }
+    rendered = render_record_html(pd.Series(record_fields), record_index=0)
+    for expected_fragment in ("Detection Validity", "Unavailable"):
+        assert expected_fragment in rendered
+
+
+def test_render_record_html_rewrite_mode_no_detection_valid_column_omits_section() -> None:
+    """Without a COL_DETECTION_VALID key on the row, 'Detection Validity' must not appear in the output."""
+    record_fields = {
+        "text": "Alice works at Acme",
+        "text_rewritten": "Beth works at Globex",
+        COL_DETECTED_ENTITIES: {"entities": []},
+        "utility_score": 0.9,
+        "leakage_mass": 0.1,
+        "needs_human_review": False,
+    }
+    rendered = render_record_html(pd.Series(record_fields), record_index=0)
+    assert "Detection Validity" not in rendered
+
+
+def test_rewrite_needs_human_review_label_is_rewrite_need_review() -> None:
+    """A flagged rewrite row is labelled 'Rewrite Need Review'; the 'Needs Review:' wording must not appear."""
+    record_fields = {
+        "text": "Alice works at Acme",
+        "text_rewritten": "Beth works at Globex",
+        COL_DETECTED_ENTITIES: {"entities": []},
+        "utility_score": 0.9,
+        "leakage_mass": 0.1,
+        "needs_human_review": True,
+    }
+    rendered = render_record_html(pd.Series(record_fields), record_index=0)
+    assert "Rewrite Need Review" in rendered
+    assert "Needs Review:" not in rendered