From b23de7c5aea738d5a816cf0f741b97d871458a1a Mon Sep 17 00:00:00 2001 From: memadi Date: Tue, 9 Jun 2026 16:44:49 -0700 Subject: [PATCH 01/15] add a plan for updating anonymizer rewrite evaluation Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 340 +++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 plans/rewrite-evaluation/plan.md diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md new file mode 100644 index 00000000..31be2c86 --- /dev/null +++ b/plans/rewrite-evaluation/plan.md @@ -0,0 +1,340 @@ +# Rewrite Evaluation Improvements — Implementation Plan + +## Problem + +The rewrite evaluation has four related issues: + +- **Evaluation is baked into `run()` / `preview()`** — the final judge (holistic privacy / quality / fluency scores) runs unconditionally as part of the rewrite pipeline. Replace mode separates this into a dedicated `anonymizer.evaluate()` call, letting users skip it during fast iteration and run it deliberately. Rewrite has no equivalent. +- **No detection validity score in rewrite mode** — `anonymizer.evaluate()` produces a `detection_valid` column for replace mode (via `DetectionJudgeWorkflow`). Rewrite mode runs the same GLiNER + LLM detection pipeline but never scores its accuracy. +- **Judge scores are 1–10 integers that saturate** — observed outputs cluster at the extremes (8–10 or 1–2), making the middle of the scale nearly unused. A 3-level categorical (`low` / `medium` / `high`) better matches the distribution, removes false precision, and makes rubric definitions more actionable. +- **"Naturalness" is an ambiguous name** — the dimension measures fluency and readability of the output text, not faithfulness to the original or any other concept the word might suggest. `fluency` is the standard NLP term for this and is unambiguous. 
+ +--- + +## Design Decision: categorical scale + +**`low / medium / high`** is preferred over a boolean (`pass / fail`): + +- Boolean formally acknowledges the binary behaviour but loses the middle bucket entirely. A rewrite that is "mostly fluent but has one awkward sentence" must be called either passing or failing, which is worse for human review triage. +- `low / medium / high` gives reviewers actionable signal: `medium` means "worth a look but not a clear failure", which is exactly the category that surfaces regressions in prompt or model changes. +- The `Score` class already accepts string option keys, so no framework changes are needed. + +Rubric anchors for each dimension: + +| Score | Privacy | Quality | Fluency | +|---|---|---|---| +| `high` | Original direct identifiers removed; remaining quasi-identifiers create low linkage risk | Important meaning, facts, and structure fully preserved | Reads as fluent, coherent, human-written prose | +| `medium` | No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk | Most content preserved; minor details lost or slightly distorted | Mostly readable; isolated awkward phrasing or stiff transitions | +| `low` | One or more original direct identifiers or near-equivalents remain, or the record is easily linkable | Material loss of important information, contradictions, or distorted core meaning | Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel | + +--- + +## Scope + +No new public API symbols beyond extending `EvaluateConfig` and `EvaluateModelSelection`. +All changes are backwards-compatible for replace-mode users. 
+ +--- + +## Files Changed + +| File | Change | +|---|---| +| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `FLUENCY_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list | +| `src/anonymizer/engine/rewrite/rewrite_workflow.py` | Remove `_run_final_judge` call from `run()`; add `evaluate()` method that runs detection judge + final judge | +| `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to handle rewrite results; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | +| `src/anonymizer/config/models.py` | Add `rewrite_judge` alias to `EvaluateModelSelection`; remove `judge` from `RewriteModelSelection` (or keep as deprecated with a note) | +| `src/anonymizer/config/anonymizer_config.py` | `EvaluateConfig` is no longer a placeholder — add a docstring clarifying it covers both replace and rewrite evaluation | +| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; update `_extract_judge_scores` if it assumes integer scores | +| `src/anonymizer/engine/schemas/rewrite.py` | Update any schema or docstring that references the 1–10 scale or "naturalness" | +| `docs/concepts/rewrite.md` | Update judge score documentation (rename naturalness → fluency, describe categorical scale, move judge to evaluate step) | +| `skills/anonymizer/SKILL.md` | Update evaluate workflow section with rewrite evaluate example | +| `tests/engine/rewrite/test_final_judge.py` | Update rubric tests; add tests for categorical output parsing | +| `tests/interface/test_anonymizer.py` | Add tests for `evaluate()` on a rewrite result | + +`src/anonymizer/engine/evaluation/detection_judge.py` — no changes needed; already works on any trace dataframe that has `COL_TEXT` and `COL_ENTITIES_BY_VALUE`. 
+ +--- + +## Step 1 — Rename and recategorise the judge rubrics (`final_judge.py`) + +### Rename + +```python +# before +NATURALNESS_RUBRIC = Score(name="naturalness", ...) + +# after +FLUENCY_RUBRIC = Score(name="fluency", ...) +``` + +Update `scores=` list in `FinalJudgeWorkflow.columns()`: + +```python +scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, FLUENCY_RUBRIC], +``` + +### Change options to `low / medium / high` + +Replace the integer-keyed `options` dicts with string keys on all three rubrics: + +```python +PRIVACY_RUBRIC = Score( + name="privacy", + description="Privacy protection — how well the rewrite removes linkage risk to the original record.", + options={ + "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", + "medium": "No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk.", + "low": "One or more original direct identifiers or near-equivalents remain, or the record is easily linkable.", + }, +) + +QUALITY_RUBRIC = Score( + name="quality", + description="Content quality — how well important meaning, facts, and structure are preserved.", + options={ + "high": "Important meaning, facts, and structure fully preserved.", + "medium": "Most content preserved; minor details lost or slightly distorted.", + "low": "Material loss of important information, contradictions, or distorted core meaning.", + }, +) + +FLUENCY_RUBRIC = Score( + name="fluency", + description="Writing fluency — does the rewritten text read as natural, grammatically correct, human-written prose?", + options={ + "high": "Reads as fluent, coherent, human-written prose.", + "medium": "Mostly readable; isolated awkward phrasing or stiff transitions.", + "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", + }, +) +``` + +### Update `_judge_prompt` + +Replace the three `<*_scoring_instructions>` blocks to match the new categorical rubric 
anchors. The core guidance (assess independently, don't penalise necessary changes, etc.) is preserved — only the scale reference changes: + +``` +<privacy_scoring_instructions> + ...existing contextual guidance (linkage risk, quasi-identifiers, etc.) preserved verbatim... + + Score as: + - high — original direct identifiers removed; remaining details create low linkage risk + - medium — no obvious direct identifiers, but a distinctive quasi-identifier bundle creates + noticeable linkage risk + - low — one or more direct identifiers or near-equivalents remain, or easily linkable +</privacy_scoring_instructions> + +<quality_scoring_instructions> + ...existing guidance preserved... + + Score as: + - high — important meaning, facts, and structure fully preserved + - medium — most content preserved; minor details lost or slightly distorted + - low — material loss of important information, contradictions, or distorted core meaning +</quality_scoring_instructions> + +<fluency_scoring_instructions> + ...naturalness guidance renamed and preserved... + + Score as: + - high — fluent, coherent, human-written prose + - medium — mostly readable; isolated awkward phrasing or stiff transitions + - low — noticeably unnatural; broken grammar, placeholder-like language, or machine feel +</fluency_scoring_instructions> +``` + +The `` block changes "naturalness of writing" to "fluency of writing". + +--- + +## Step 2 — Move final judge out of `run()` (`rewrite_workflow.py`) + +Remove the `_run_final_judge` call from `RewriteWorkflow.run()` and the `COL_JUDGE_EVALUATION` default from `_PASSTHROUGH_DEFAULTS`. + +Add a standalone `evaluate()` method on `RewriteWorkflow`: + +```python +def evaluate( + self, + df: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: EvaluateModelSelection, + privacy_goal: PrivacyGoal, + preview_num_records: int | None = None, +) -> RewriteResult: + """Run detection validity judge and final holistic judge on a completed rewrite result. 
+ + Mirrors ReplacementWorkflow.evaluate(): takes the trace dataframe from a + prior run() / preview() and appends COL_DETECTION_VALID, + COL_DETECTION_INVALID_ENTITIES, and COL_JUDGE_EVALUATION. + """ +``` + +Inside `evaluate()`: +1. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. +2. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) for privacy / quality / fluency scores. +3. Merge results and return a new `RewriteResult`. + +`COL_NEEDS_HUMAN_REVIEW` is **not** re-computed here — it was set during `run()` based on objective metrics and should not be overwritten by the evaluate step. + +--- + +## Step 3 — Wire `Anonymizer.evaluate()` for rewrite results (`anonymizer.py`) + +The existing `evaluate()` currently raises if `output.replace_method` is `None`: + +```python +# before +replace_method = getattr(output, "replace_method", None) +if replace_method is None: + raise ValueError(...) +``` + +Extend the dispatch: + +```python +rewrite_config = getattr(output, "rewrite_config", None) +replace_method = getattr(output, "replace_method", None) + +if rewrite_config is not None: + # Rewrite evaluate path + ...call self._rewrite_runner.evaluate(...) +elif replace_method is not None: + # Replace evaluate path (unchanged) + ... +else: + raise ValueError(...) +``` + +`AnonymizerResult` / `PreviewResult` in `results.py` need a `rewrite_config` field (carrying `PrivacyGoal`) set during `run()` in rewrite mode — analogous to how `replace_method` is set in replace mode. 
+ +### Update `_build_user_dataframe` + +Add `COL_JUDGE_EVALUATION`, `COL_DETECTION_VALID`, and `COL_DETECTION_INVALID_ENTITIES` to the rewrite allowed set: + +```python +if f"{text_col}_rewritten" in t.columns: + allowed = { + text_col, + f"{text_col}_rewritten", + COL_UTILITY_SCORE, + COL_LEAKAGE_MASS, + COL_WEIGHTED_LEAKAGE_RATE, + COL_ANY_HIGH_LEAKED, + COL_NEEDS_HUMAN_REVIEW, + COL_JUDGE_EVALUATION, # ← new, only present after evaluate() + COL_DETECTION_VALID, # ← new, only present after evaluate() + COL_DETECTION_INVALID_ENTITIES,# ← new, only present after evaluate() + } +``` + +--- + +## Step 4 — Update model selection (`models.py`) + +Move the `judge` alias out of `RewriteModelSelection` and into `EvaluateModelSelection`: + +```python +class EvaluateModelSelection(BaseModel): + detection_validity_judge: str + replace_type_fidelity_judge: str + replace_relational_consistency_judge: str + replace_attribute_fidelity_judge: str + rewrite_judge: str # ← new: holistic privacy/quality/fluency judge for rewrite evaluate +``` + +`RewriteModelSelection.judge` is removed (or kept with a deprecation note if model YAML defaults need a phased migration). + +Update `engine/ndd/model_loader.py` validation to check `evaluate.rewrite_judge` when `check_evaluate=True` and the output is a rewrite result. + +--- + +## Step 5 — Fix display rendering (`display.py`) + +Line 449 currently renders: + +```python +score_strs = [f"{name}: {score}/10" for name, score in judge_scores] +``` + +Change to: + +```python +score_strs = [f"{name}: {score}" for name, score in judge_scores] +``` + +`_extract_judge_scores` returns `list[tuple[str, int]]` — update the return type to `list[tuple[str, int | str]]` since scores are now strings. 
+ +--- + +## Step 6 — Docs and skills + +### `docs/concepts/rewrite.md` + +- Output columns table: remove `judge evaluation` from the `run()` output section; add a new **Evaluation** subsection (parallel to the existing replace evaluate docs) showing the `evaluate()` call pattern and what columns it adds. +- Update the judge score description: rename "naturalness" → "fluency", describe `low/medium/high` scale. +- Model roles table: move `judge` from the rewrite pipeline roles to the evaluate roles. + +### `skills/anonymizer/SKILL.md` + +Add a rewrite evaluate workflow example alongside the existing replace evaluate example: + +```python +# after rewrite run / preview: +evaluated = anonymizer.evaluate(result) +evaluated.display_record(0) +# → adds detection_valid, judge evaluation (privacy/quality/fluency: low/medium/high) +``` + +--- + +## Step 7 — Tests + +### Update existing tests + +- `tests/engine/rewrite/test_final_judge.py` — update rubric option assertions for `low/medium/high`; update any test that checks score parsing for integer values; rename all `naturalness` references to `fluency`. +- `tests/interface/test_anonymizer.py` — update assertions that check `COL_JUDGE_EVALUATION` is in the `run()` output (it now only appears after `evaluate()`). 
+ +### New tests to add + +``` +# final_judge.py +test_fluency_rubric_has_low_medium_high_options +test_privacy_rubric_has_low_medium_high_options +test_quality_rubric_has_low_medium_high_options +test_judge_prompt_references_fluency_not_naturalness +test_judge_prompt_references_categorical_scale + +# rewrite_workflow.py +test_run_does_not_produce_judge_evaluation_column +test_evaluate_produces_judge_evaluation_column +test_evaluate_produces_detection_valid_column + +# anonymizer.py +test_evaluate_rewrite_result_adds_judge_columns +test_evaluate_rewrite_result_adds_detection_valid +test_evaluate_rewrite_raises_without_rewrite_config +test_run_rewrite_does_not_include_judge_in_user_dataframe + +# display.py +test_render_scores_section_categorical_no_slash_10 +test_extract_judge_scores_returns_string_scores +``` + +All new tests construct result objects directly — no real pipeline or LLM calls. + +--- + +## Implementation Order + +1. Update rubrics and prompt in `final_judge.py` (rename naturalness → fluency, 1-10 → low/medium/high) +2. Move `_run_final_judge` out of `RewriteWorkflow.run()`; add `RewriteWorkflow.evaluate()` +3. Add `rewrite_config` field to `AnonymizerResult` / `PreviewResult`; wire `Anonymizer.evaluate()` for rewrite +4. Move `judge` alias from `RewriteModelSelection` to `EvaluateModelSelection` (as `rewrite_judge`); update model loader validation +5. Update `_build_user_dataframe` allowed columns for rewrite +6. Fix `display.py` score rendering +7. Update `docs/concepts/rewrite.md` and `skills/anonymizer/SKILL.md` +8. Update existing tests; add new tests +9. 
Run `make format && make typecheck && make test` From 33084798fd8ef9d935364dee509bed0b850938e0 Mon Sep 17 00:00:00 2001 From: memadi Date: Tue, 9 Jun 2026 16:59:22 -0700 Subject: [PATCH 02/15] nit Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index 31be2c86..3a081fca 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -85,7 +85,7 @@ PRIVACY_RUBRIC = Score( options={ "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", "medium": "No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk.", - "low": "One or more original direct identifiers or near-equivalents remain, or the record is easily linkable.", + "low" : "The record is easily or near-certainly linkable back to the original: key direct identifiers remain, or enough identifying detail survives that re-identification requires minimal effort regardless of how many entities were successfully transformed.", }, ) From df3f4f247ea92a1051426d250b935d074724e4e3 Mon Sep 17 00:00:00 2001 From: memadi Date: Tue, 9 Jun 2026 17:52:12 -0700 Subject: [PATCH 03/15] address gliner feedback Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index 3a081fca..a222cf8c 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -42,10 +42,14 @@ All changes are backwards-compatible for replace-mode users. 
|---|---| | `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `FLUENCY_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list | | `src/anonymizer/engine/rewrite/rewrite_workflow.py` | Remove `_run_final_judge` call from `run()`; add `evaluate()` method that runs detection judge + final judge | -| `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to handle rewrite results; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | -| `src/anonymizer/config/models.py` | Add `rewrite_judge` alias to `EvaluateModelSelection`; remove `judge` from `RewriteModelSelection` (or keep as deprecated with a note) | +| `src/anonymizer/interface/results.py` | Add `rewrite_config: PrivacyGoal \| None = None` field to `AnonymizerResult` and `PreviewResult`; set it during rewrite `run()` analogous to `replace_method` | +| `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to dispatch on `rewrite_config`; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | +| `src/anonymizer/config/models.py` | Add `rewrite_judge` alias to `EvaluateModelSelection`; remove `judge` from `RewriteModelSelection` | | `src/anonymizer/config/anonymizer_config.py` | `EvaluateConfig` is no longer a placeholder — add a docstring clarifying it covers both replace and rewrite evaluation | -| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; update `_extract_judge_scores` if it assumes integer scores | +| `src/anonymizer/config/default_model_configs/evaluate.yaml` | Add `rewrite_judge: nemotron-30b-thinking` — required to avoid a Pydantic startup crash when the new field lands in `EvaluateModelSelection` | +| `src/anonymizer/config/default_model_configs/rewrite.yaml` | Remove `judge` entry — it moves to `evaluate.yaml` | +| 
`src/anonymizer/engine/ndd/model_loader.py` | Update `validate_model_alias_references` to check `evaluate.rewrite_judge` when `check_evaluate=True` on a rewrite result | +| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; update `_extract_judge_scores` return type to `list[tuple[str, int \| str]]` | | `src/anonymizer/engine/schemas/rewrite.py` | Update any schema or docstring that references the 1–10 scale or "naturalness" | | `docs/concepts/rewrite.md` | Update judge score documentation (rename naturalness → fluency, describe categorical scale, move judge to evaluate step) | | `skills/anonymizer/SKILL.md` | Update evaluate workflow section with rewrite evaluate example | From 24b4185bde99984ed94b3e9e85a3e35e35ca45ec Mon Sep 17 00:00:00 2001 From: memadi Date: Tue, 9 Jun 2026 18:05:29 -0700 Subject: [PATCH 04/15] address greptile feedback-part 2 Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index a222cf8c..171cc88c 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -40,7 +40,7 @@ All changes are backwards-compatible for replace-mode users. 
| File | Change | |---|---| -| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `FLUENCY_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list | +| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `FLUENCY_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list; update `FinalJudgeWorkflow.columns()` signature to accept `EvaluateModelSelection` instead of `RewriteModelSelection`; remove `COL_NEEDS_HUMAN_REVIEW` from its column output (see Step 2) | | `src/anonymizer/engine/rewrite/rewrite_workflow.py` | Remove `_run_final_judge` call from `run()`; add `evaluate()` method that runs detection judge + final judge | | `src/anonymizer/interface/results.py` | Add `rewrite_config: PrivacyGoal \| None = None` field to `AnonymizerResult` and `PreviewResult`; set it during rewrite `run()` analogous to `replace_method` | | `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to dispatch on `rewrite_config`; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | @@ -49,7 +49,7 @@ All changes are backwards-compatible for replace-mode users. 
| `src/anonymizer/config/default_model_configs/evaluate.yaml` | Add `rewrite_judge: nemotron-30b-thinking` — required to avoid a Pydantic startup crash when the new field lands in `EvaluateModelSelection` | | `src/anonymizer/config/default_model_configs/rewrite.yaml` | Remove `judge` entry — it moves to `evaluate.yaml` | | `src/anonymizer/engine/ndd/model_loader.py` | Update `validate_model_alias_references` to check `evaluate.rewrite_judge` when `check_evaluate=True` on a rewrite result | -| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; update `_extract_judge_scores` return type to `list[tuple[str, int \| str]]` | +| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; fix `_extract_judge_scores` to not cast `value["score"]` through `int()` — for string keys `"low"/"medium"/"high"` this raises `ValueError` which is silently swallowed, causing the judge section to never render; update return type to `list[tuple[str, int \| str]]` | | `src/anonymizer/engine/schemas/rewrite.py` | Update any schema or docstring that references the 1–10 scale or "naturalness" | | `docs/concepts/rewrite.md` | Update judge score documentation (rename naturalness → fluency, describe categorical scale, move judge to evaluate step) | | `skills/anonymizer/SKILL.md` | Update evaluate workflow section with rewrite evaluate example | @@ -177,11 +177,26 @@ def evaluate( ``` Inside `evaluate()`: -1. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. -2. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) for privacy / quality / fluency scores. -3. Merge results and return a new `RewriteResult`. -`COL_NEEDS_HUMAN_REVIEW` is **not** re-computed here — it was set during `run()` based on objective metrics and should not be overwritten by the evaluate step. +1. 
**Split entity vs passthrough rows** using the same `split_rows` / `_has_entities` pattern as `run()`. Passthrough rows (no detected entities) must be excluded from both judge calls — running `DetectionJudgeWorkflow` on them produces vacuously-valid scores (nothing to validate → trivially satisfied), and running `FinalJudgeWorkflow` on them produces misleadingly-high scores for records that were never anonymized. Passthrough rows receive `COL_DETECTION_VALID = None` and `COL_JUDGE_EVALUATION = None` as defaults. +2. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. +3. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) against entity rows only, for privacy / quality / fluency scores. +4. Merge entity and passthrough rows and return a new `RewriteResult`. + +### `COL_NEEDS_HUMAN_REVIEW` must not be overwritten + +`FinalJudgeWorkflow.columns()` currently emits a `CustomColumnConfig` for `COL_NEEDS_HUMAN_REVIEW` (via `_determine_needs_human_review`). This flag was already set correctly during `run()` based on objective metrics. If `evaluate()` calls `FinalJudgeWorkflow` as-is, `_join_judge_columns` unconditionally resets it to `True` for all rows before re-assigning from the judge output, silently clobbering the run-time value. + +Fix: remove `COL_NEEDS_HUMAN_REVIEW` from `FinalJudgeWorkflow.columns()` entirely. The `CustomColumnConfig` for it and `_determine_needs_human_review` move to `RewriteWorkflow._run_final_judge()` — but since that method is being removed, the column is instead produced at the end of the evaluate-repair loop in `_run_evaluate_repair_loop()`, where `HumanReviewParams` is already available. `evaluate()` never touches `COL_NEEDS_HUMAN_REVIEW`. 
+ +### Telemetry — `rewrite.judge` reference + +`anonymizer.py` references `rewrite.judge` in two places that will break when the field is removed from `RewriteModelSelection`: + +- `_collect_step_models()` — `"judge": rewrite.judge if has_rewrite else NOT_APPLICABLE` +- `_build_telemetry_event()` — `judge_model=models["judge"]` + +Resolution: drop the `judge` key from the `_collect_step_models` rewrite block and from `_build_telemetry_event`. The judge is now an evaluate-time role, not a run-time role, so it doesn't belong in run telemetry. `anonymizer.py` is already in the Files Changed table for the `evaluate()` dispatch; this telemetry fix is part of the same change. --- @@ -253,6 +268,8 @@ class EvaluateModelSelection(BaseModel): Update `engine/ndd/model_loader.py` validation to check `evaluate.rewrite_judge` when `check_evaluate=True` and the output is a rewrite result. +Update `FinalJudgeWorkflow.columns()` to accept `EvaluateModelSelection` instead of `RewriteModelSelection`, and resolve the judge alias via `evaluate.rewrite_judge` instead of `rewrite.judge`. + --- ## Step 5 — Fix display rendering (`display.py`) @@ -325,6 +342,15 @@ test_run_rewrite_does_not_include_judge_in_user_dataframe # display.py test_render_scores_section_categorical_no_slash_10 test_extract_judge_scores_returns_string_scores +test_extract_judge_scores_categorical_not_silently_empty + +# rewrite_workflow.py — passthrough + needs_human_review +test_evaluate_skips_passthrough_rows +test_evaluate_passthrough_rows_get_none_judge_defaults +test_run_needs_human_review_not_overwritten_by_evaluate + +# anonymizer.py — telemetry +test_run_rewrite_telemetry_has_no_judge_field ``` All new tests construct result objects directly — no real pipeline or LLM calls. 
From 0a6f0bfc38f864816489b8cb771e1c9063cc4ad0 Mon Sep 17 00:00:00 2001 From: memadi Date: Wed, 10 Jun 2026 15:04:10 -0700 Subject: [PATCH 05/15] nit Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 54 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index 171cc88c..04de8a26 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -7,7 +7,7 @@ The rewrite evaluation has four related issues: - **Evaluation is baked into `run()` / `preview()`** — the final judge (holistic privacy / quality / fluency scores) runs unconditionally as part of the rewrite pipeline. Replace mode separates this into a dedicated `anonymizer.evaluate()` call, letting users skip it during fast iteration and run it deliberately. Rewrite has no equivalent. - **No detection validity score in rewrite mode** — `anonymizer.evaluate()` produces a `detection_valid` column for replace mode (via `DetectionJudgeWorkflow`). Rewrite mode runs the same GLiNER + LLM detection pipeline but never scores its accuracy. - **Judge scores are 1–10 integers that saturate** — observed outputs cluster at the extremes (8–10 or 1–2), making the middle of the scale nearly unused. A 3-level categorical (`low` / `medium` / `high`) better matches the distribution, removes false precision, and makes rubric definitions more actionable. -- **"Naturalness" is an ambiguous name** — the dimension measures fluency and readability of the output text, not faithfulness to the original or any other concept the word might suggest. `fluency` is the standard NLP term for this and is unambiguous. +- **"Naturalness" is an ambiguous name** — the dimension measures writing style and readability of the output text, not faithfulness to the original or any other concept the word might suggest. `style` is a clearer term for this and is unambiguous. 
--- @@ -21,9 +21,9 @@ The rewrite evaluation has four related issues: Rubric anchors for each dimension: -| Score | Privacy | Quality | Fluency | +| Score | Privacy | Quality | Style | |---|---|---|---| -| `high` | Original direct identifiers removed; remaining quasi-identifiers create low linkage risk | Important meaning, facts, and structure fully preserved | Reads as fluent, coherent, human-written prose | +| `high` | Original direct identifiers removed; remaining quasi-identifiers create low linkage risk | Important meaning, facts, and structure fully preserved | Reads as natural, coherent, human-written prose | | `medium` | No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk | Most content preserved; minor details lost or slightly distorted | Mostly readable; isolated awkward phrasing or stiff transitions | | `low` | One or more original direct identifiers or near-equivalents remain, or the record is easily linkable | Material loss of important information, contradictions, or distorted core meaning | Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel | @@ -40,7 +40,7 @@ All changes are backwards-compatible for replace-mode users. 
| File | Change | |---|---| -| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `FLUENCY_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list; update `FinalJudgeWorkflow.columns()` signature to accept `EvaluateModelSelection` instead of `RewriteModelSelection`; remove `COL_NEEDS_HUMAN_REVIEW` from its column output (see Step 2) | +| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `STYLE_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list; update `FinalJudgeWorkflow.columns()` signature to accept `EvaluateModelSelection` instead of `RewriteModelSelection`; remove `COL_NEEDS_HUMAN_REVIEW` from its column output (see Step 2) | | `src/anonymizer/engine/rewrite/rewrite_workflow.py` | Remove `_run_final_judge` call from `run()`; add `evaluate()` method that runs detection judge + final judge | | `src/anonymizer/interface/results.py` | Add `rewrite_config: PrivacyGoal \| None = None` field to `AnonymizerResult` and `PreviewResult`; set it during rewrite `run()` analogous to `replace_method` | | `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to dispatch on `rewrite_config`; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | @@ -69,13 +69,13 @@ All changes are backwards-compatible for replace-mode users. NATURALNESS_RUBRIC = Score(name="naturalness", ...) # after -FLUENCY_RUBRIC = Score(name="fluency", ...) +STYLE_RUBRIC = Score(name="style", ...) 
``` Update `scores=` list in `FinalJudgeWorkflow.columns()`: ```python -scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, FLUENCY_RUBRIC], +scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC], ``` ### Change options to `low / medium / high` @@ -103,11 +103,11 @@ QUALITY_RUBRIC = Score( }, ) -FLUENCY_RUBRIC = Score( - name="fluency", - description="Writing fluency — does the rewritten text read as natural, grammatically correct, human-written prose?", +STYLE_RUBRIC = Score( + name="style", + description="Writing style — does the rewritten text read as natural, grammatically correct, human-written prose?", options={ - "high": "Reads as fluent, coherent, human-written prose.", + "high": "Reads as natural, coherent, human-written prose.", "medium": "Mostly readable; isolated awkward phrasing or stiff transitions.", "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", }, @@ -138,17 +138,17 @@ Replace the three `<*_scoring_instructions>` blocks to match the new categorical - low — material loss of important information, contradictions, or distorted core meaning - + ...naturalness guidance renamed and preserved... Score as: - - high — fluent, coherent, human-written prose + - high — natural, coherent, human-written prose - medium — mostly readable; isolated awkward phrasing or stiff transitions - low — noticeably unnatural; broken grammar, placeholder-like language, or machine feel - + ``` -The `` block changes "naturalness of writing" to "fluency of writing". +The `` block changes "naturalness of writing" to "style of writing". --- @@ -179,8 +179,8 @@ def evaluate( Inside `evaluate()`: 1. **Split entity vs passthrough rows** using the same `split_rows` / `_has_entities` pattern as `run()`. 
Passthrough rows (no detected entities) must be excluded from both judge calls — running `DetectionJudgeWorkflow` on them produces vacuously-valid scores (nothing to validate → trivially satisfied), and running `FinalJudgeWorkflow` on them produces misleadingly-high scores for records that were never anonymized. Passthrough rows receive `COL_DETECTION_VALID = None` and `COL_JUDGE_EVALUATION = None` as defaults. -2. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. -3. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) against entity rows only, for privacy / quality / fluency scores. +2. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. The detection validity score is surfaced as a **0–1 value** (the LLM alignment score normalised from its raw percentage), matching the scale of `utility_score` and `leakage_mass`. This is the same score as in replace-mode evaluation, with the only difference being the 0–1 normalisation instead of exposing a raw percentage. `COL_DETECTION_VALID` is a **main output score** — it appears alongside `utility_score` and `leakage_mass` in the user dataframe and display, **not** grouped with the judge scores (privacy/quality/style). It does **not** influence `COL_NEEDS_HUMAN_REVIEW`; human review is determined solely by `leakage_mass` and `utility_score` thresholds, unchanged from the repair loop. +3. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) against entity rows only, for privacy / quality / style scores. 4. Merge entity and passthrough rows and return a new `RewriteResult`. 
### `COL_NEEDS_HUMAN_REVIEW` must not be overwritten @@ -288,6 +288,14 @@ score_strs = [f"{name}: {score}" for name, score in judge_scores] `_extract_judge_scores` returns `list[tuple[str, int]]` — update the return type to `list[tuple[str, int | str]]` since scores are now strings. +### Detection validity placement + +`COL_DETECTION_VALID` must be rendered in the **main scores section** (alongside `utility_score` and `leakage_mass`), not inside the judge scores block. Update `_render_scores_section` to include it there when present. The value is already 0–1 so no `/10` suffix is needed and no additional scaling is required. + +### "Rewrite Need Review" label + +In rewrite mode the `COL_NEEDS_HUMAN_REVIEW` column must be displayed as **"Rewrite Need Review"** (not the generic "Needs Review" used in replace mode). Update the label resolution in `display.py` to emit the rewrite-specific label when rendering a rewrite result. + --- ## Step 6 — Docs and skills @@ -295,7 +303,7 @@ score_strs = [f"{name}: {score}" for name, score in judge_scores] ### `docs/concepts/rewrite.md` - Output columns table: remove `judge evaluation` from the `run()` output section; add a new **Evaluation** subsection (parallel to the existing replace evaluate docs) showing the `evaluate()` call pattern and what columns it adds. -- Update the judge score description: rename "naturalness" → "fluency", describe `low/medium/high` scale. +- Update the judge score description: rename "naturalness" → "style", describe `low/medium/high` scale; note detection validity appears in the main scores section (0–1) not judge scores; note `COL_NEEDS_HUMAN_REVIEW` is labelled "Rewrite Need Review" in the output column table. - Model roles table: move `judge` from the rewrite pipeline roles to the evaluate roles. 
### `skills/anonymizer/SKILL.md` @@ -306,7 +314,7 @@ Add a rewrite evaluate workflow example alongside the existing replace evaluate # after rewrite run / preview: evaluated = anonymizer.evaluate(result) evaluated.display_record(0) -# → adds detection_valid, judge evaluation (privacy/quality/fluency: low/medium/high) +# → adds detection_valid (0–1, main scores section), judge evaluation (privacy/quality/style: low/medium/high) ``` --- @@ -315,17 +323,17 @@ evaluated.display_record(0) ### Update existing tests -- `tests/engine/rewrite/test_final_judge.py` — update rubric option assertions for `low/medium/high`; update any test that checks score parsing for integer values; rename all `naturalness` references to `fluency`. +- `tests/engine/rewrite/test_final_judge.py` — update rubric option assertions for `low/medium/high`; update any test that checks score parsing for integer values; rename all `naturalness` references to `style`. - `tests/interface/test_anonymizer.py` — update assertions that check `COL_JUDGE_EVALUATION` is in the `run()` output (it now only appears after `evaluate()`). 
### New tests to add ``` # final_judge.py -test_fluency_rubric_has_low_medium_high_options +test_style_rubric_has_low_medium_high_options test_privacy_rubric_has_low_medium_high_options test_quality_rubric_has_low_medium_high_options -test_judge_prompt_references_fluency_not_naturalness +test_judge_prompt_references_style_not_naturalness test_judge_prompt_references_categorical_scale # rewrite_workflow.py @@ -343,6 +351,8 @@ test_run_rewrite_does_not_include_judge_in_user_dataframe test_render_scores_section_categorical_no_slash_10 test_extract_judge_scores_returns_string_scores test_extract_judge_scores_categorical_not_silently_empty +test_detection_valid_rendered_in_main_scores_section +test_rewrite_needs_human_review_label_is_rewrite_need_review # rewrite_workflow.py — passthrough + needs_human_review test_evaluate_skips_passthrough_rows @@ -359,7 +369,7 @@ All new tests construct result objects directly — no real pipeline or LLM call ## Implementation Order -1. Update rubrics and prompt in `final_judge.py` (rename naturalness → fluency, 1-10 → low/medium/high) +1. Update rubrics and prompt in `final_judge.py` (rename naturalness → style, 1-10 → low/medium/high) 2. Move `_run_final_judge` out of `RewriteWorkflow.run()`; add `RewriteWorkflow.evaluate()` 3. Add `rewrite_config` field to `AnonymizerResult` / `PreviewResult`; wire `Anonymizer.evaluate()` for rewrite 4. 
Move `judge` alias from `RewriteModelSelection` to `EvaluateModelSelection` (as `rewrite_judge`); update model loader validation From d3474633e49500e2de04b1cddc9c527c0c3eabf5 Mon Sep 17 00:00:00 2001 From: memadi Date: Wed, 10 Jun 2026 17:08:41 -0700 Subject: [PATCH 06/15] implement the plan Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 29 ++-- .../default_model_configs/evaluate.yaml | 6 + .../config/default_model_configs/rewrite.yaml | 1 - src/anonymizer/config/models.py | 2 +- src/anonymizer/engine/ndd/model_loader.py | 2 + src/anonymizer/engine/rewrite/final_judge.py | 162 +++++------------- .../engine/rewrite/rewrite_workflow.py | 104 +++++++---- src/anonymizer/interface/anonymizer.py | 52 ++++-- src/anonymizer/interface/display.py | 25 ++- src/anonymizer/interface/results.py | 10 ++ 10 files changed, 209 insertions(+), 184 deletions(-) diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index 04de8a26..cb3fc497 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -4,7 +4,7 @@ The rewrite evaluation has four related issues: -- **Evaluation is baked into `run()` / `preview()`** — the final judge (holistic privacy / quality / fluency scores) runs unconditionally as part of the rewrite pipeline. Replace mode separates this into a dedicated `anonymizer.evaluate()` call, letting users skip it during fast iteration and run it deliberately. Rewrite has no equivalent. +- **Judge scoring is baked into `run()` / `preview()`** — the final holistic judge (privacy / quality / style scores) runs unconditionally as part of the rewrite pipeline. Replace mode separates this into a dedicated `anonymizer.evaluate()` call. Rewrite has no equivalent, forcing users to pay the judge cost on every run even during fast iteration. - **No detection validity score in rewrite mode** — `anonymizer.evaluate()` produces a `detection_valid` column for replace mode (via `DetectionJudgeWorkflow`). 
Rewrite mode runs the same GLiNER + LLM detection pipeline but never scores its accuracy. - **Judge scores are 1–10 integers that saturate** — observed outputs cluster at the extremes (8–10 or 1–2), making the middle of the scale nearly unused. A 3-level categorical (`low` / `medium` / `high`) better matches the distribution, removes false precision, and makes rubric definitions more actionable. - **"Naturalness" is an ambiguous name** — the dimension measures writing style and readability of the output text, not faithfulness to the original or any other concept the word might suggest. `style` is a clearer term for this and is unambiguous. @@ -29,6 +29,15 @@ Rubric anchors for each dimension: --- +## Design Decision: `run()` vs `evaluate()` separation + +- `run()` outputs `utility_score`, `leakage_mass`, `weighted_leakage_rate`, `any_high_leaked`, and `needs_human_review` unchanged — the repair loop requires them and they are immediately useful after a run. +- `evaluate()` adds `detection_valid` (0–1) and the holistic judge scores (`privacy` / `quality` / `style`) on top of the existing `run()` output. + +This avoids re-running the repair loop in `evaluate()`: if `needs_human_review=True`, that is already the exhausted repair state and `evaluate()` simply reads the metrics already present. + +--- + ## Scope No new public API symbols beyond extending `EvaluateConfig` and `EvaluateModelSelection`. @@ -178,16 +187,14 @@ def evaluate( Inside `evaluate()`: -1. **Split entity vs passthrough rows** using the same `split_rows` / `_has_entities` pattern as `run()`. Passthrough rows (no detected entities) must be excluded from both judge calls — running `DetectionJudgeWorkflow` on them produces vacuously-valid scores (nothing to validate → trivially satisfied), and running `FinalJudgeWorkflow` on them produces misleadingly-high scores for records that were never anonymized. Passthrough rows receive `COL_DETECTION_VALID = None` and `COL_JUDGE_EVALUATION = None` as defaults. 
-2. Run `DetectionJudgeWorkflow` (already in `engine/evaluation/detection_judge.py`) against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. The detection validity score is surfaced as a **0–1 value** (the LLM alignment score normalised from its raw percentage), matching the scale of `utility_score` and `leakage_mass`. This is the same score as in replace-mode evaluation, with the only difference being the 0–1 normalisation instead of exposing a raw percentage. `COL_DETECTION_VALID` is a **main output score** — it appears alongside `utility_score` and `leakage_mass` in the user dataframe and display, **not** grouped with the judge scores (privacy/quality/style). It does **not** influence `COL_NEEDS_HUMAN_REVIEW`; human review is determined solely by `leakage_mass` and `utility_score` thresholds, unchanged from the repair loop. -3. Run `FinalJudgeWorkflow` (already in `engine/rewrite/final_judge.py`) against entity rows only, for privacy / quality / style scores. +1. **Split entity vs passthrough rows** using the same `split_rows` / `_has_entities` pattern as `run()`. Passthrough rows receive `COL_DETECTION_VALID = None` and `COL_JUDGE_EVALUATION = None` as defaults — running either judge on them produces vacuously correct or misleadingly high scores for records that were never anonymized. +2. Run `DetectionJudgeWorkflow` against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. The score is surfaced as a **0–1 value** matching the scale of `utility_score` and `leakage_mass`. `COL_DETECTION_VALID` appears alongside the objective scores in the user dataframe and display, **not** grouped with the judge scores (privacy/quality/style). It does **not** influence `COL_NEEDS_HUMAN_REVIEW`. +3. Run `FinalJudgeWorkflow` against entity rows only, for privacy / quality / style scores. 4. Merge entity and passthrough rows and return a new `RewriteResult`. 
### `COL_NEEDS_HUMAN_REVIEW` must not be overwritten -`FinalJudgeWorkflow.columns()` currently emits a `CustomColumnConfig` for `COL_NEEDS_HUMAN_REVIEW` (via `_determine_needs_human_review`). This flag was already set correctly during `run()` based on objective metrics. If `evaluate()` calls `FinalJudgeWorkflow` as-is, `_join_judge_columns` unconditionally resets it to `True` for all rows before re-assigning from the judge output, silently clobbering the run-time value. - -Fix: remove `COL_NEEDS_HUMAN_REVIEW` from `FinalJudgeWorkflow.columns()` entirely. The `CustomColumnConfig` for it and `_determine_needs_human_review` move to `RewriteWorkflow._run_final_judge()` — but since that method is being removed, the column is instead produced at the end of the evaluate-repair loop in `_run_evaluate_repair_loop()`, where `HumanReviewParams` is already available. `evaluate()` never touches `COL_NEEDS_HUMAN_REVIEW`. +`COL_NEEDS_HUMAN_REVIEW` is set correctly during `run()` based on objective metrics (utility/leakage thresholds). `evaluate()` must never touch it — judge scores and detection validity do not influence the human review decision. Remove `COL_NEEDS_HUMAN_REVIEW` from `FinalJudgeWorkflow.columns()` and produce it instead at the end of `_run_evaluate_repair_loop()` in `rewrite_workflow.py`, where the threshold params are already available. ### Telemetry — `rewrite.judge` reference @@ -231,7 +238,7 @@ else: ### Update `_build_user_dataframe` -Add `COL_JUDGE_EVALUATION`, `COL_DETECTION_VALID`, and `COL_DETECTION_INVALID_ENTITIES` to the rewrite allowed set: +Add `COL_JUDGE_EVALUATION`, `COL_DETECTION_VALID`, and `COL_DETECTION_INVALID_ENTITIES` to the rewrite allowed set. The objective metrics (`utility_score`, `leakage_mass`, etc.) are already in the allowed set and remain there — they are present after `run()`. The new columns are only present after `evaluate()` and are silently omitted until then. 
```python if f"{text_col}_rewritten" in t.columns: @@ -243,9 +250,9 @@ if f"{text_col}_rewritten" in t.columns: COL_WEIGHTED_LEAKAGE_RATE, COL_ANY_HIGH_LEAKED, COL_NEEDS_HUMAN_REVIEW, - COL_JUDGE_EVALUATION, # ← new, only present after evaluate() - COL_DETECTION_VALID, # ← new, only present after evaluate() - COL_DETECTION_INVALID_ENTITIES,# ← new, only present after evaluate() + COL_DETECTION_VALID, # ← new, only present after evaluate() + COL_DETECTION_INVALID_ENTITIES, # ← new, only present after evaluate() + COL_JUDGE_EVALUATION, # ← new, only present after evaluate() } ``` diff --git a/src/anonymizer/config/default_model_configs/evaluate.yaml b/src/anonymizer/config/default_model_configs/evaluate.yaml index b0e97302..37ad43db 100644 --- a/src/anonymizer/config/default_model_configs/evaluate.yaml +++ b/src/anonymizer/config/default_model_configs/evaluate.yaml @@ -6,7 +6,13 @@ # consume them. They are only resolved when the user opts into evaluation. selected_models: + # --- Shared --- detection_validity_judge: gpt-oss-120b + + # --- Replace evaluation --- replace_type_fidelity_judge: gpt-oss-120b replace_relational_consistency_judge: gpt-oss-120b replace_attribute_fidelity_judge: gpt-oss-120b + + # --- Rewrite evaluation --- + rewrite_judge: nemotron-30b-thinking diff --git a/src/anonymizer/config/default_model_configs/rewrite.yaml b/src/anonymizer/config/default_model_configs/rewrite.yaml index 447a589a..bfa04e49 100644 --- a/src/anonymizer/config/default_model_configs/rewrite.yaml +++ b/src/anonymizer/config/default_model_configs/rewrite.yaml @@ -9,4 +9,3 @@ selected_models: rewriter: gpt-oss-120b evaluator: nemotron-30b-thinking repairer: gpt-oss-120b - judge: nemotron-30b-thinking diff --git a/src/anonymizer/config/models.py b/src/anonymizer/config/models.py index 1cd03e13..0dc130d9 100644 --- a/src/anonymizer/config/models.py +++ b/src/anonymizer/config/models.py @@ -90,7 +90,6 @@ class RewriteModelSelection(BaseModel): rewriter: str evaluator: str 
repairer: str - judge: str class EvaluateModelSelection(BaseModel): @@ -106,6 +105,7 @@ class EvaluateModelSelection(BaseModel): replace_type_fidelity_judge: str replace_relational_consistency_judge: str replace_attribute_fidelity_judge: str + rewrite_judge: str class ModelSelection(BaseModel): diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index 01262687..febedd33 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -260,6 +260,8 @@ def validate_model_alias_references( "replace_attribute_fidelity_judge", ): _collect_role(roles_to_check, f"evaluate.{role}", evaluate_roles[role]) + if check_rewrite: + _collect_role(roles_to_check, "evaluate.rewrite_judge", evaluate_roles["rewrite_judge"]) unknown = {path: alias for path, alias in roles_to_check.items() if alias not in known_aliases} if unknown: diff --git a/src/anonymizer/engine/rewrite/final_judge.py b/src/anonymizer/engine/rewrite/final_judge.py index 6f039999..507605b3 100644 --- a/src/anonymizer/engine/rewrite/final_judge.py +++ b/src/anonymizer/engine/rewrite/final_judge.py @@ -3,38 +3,20 @@ from __future__ import annotations -from typing import Any - -from data_designer.config import custom_column_generator -from data_designer.config.column_configs import CustomColumnConfig, LLMJudgeColumnConfig, Score +from data_designer.config.column_configs import LLMJudgeColumnConfig, Score from data_designer.config.column_types import ColumnConfigT -from pydantic import BaseModel -from anonymizer.config.models import RewriteModelSelection -from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal +from anonymizer.config.models import EvaluateModelSelection +from anonymizer.config.rewrite import PrivacyGoal from anonymizer.engine.constants import ( - COL_ANY_HIGH_LEAKED, COL_JUDGE_EVALUATION, - COL_LEAKAGE_MASS, - COL_NEEDS_HUMAN_REVIEW, COL_REWRITTEN_TEXT, COL_TEXT, - COL_UTILITY_SCORE, _jinja, ) from 
anonymizer.engine.ndd.model_loader import resolve_model_alias from anonymizer.engine.prompt_utils import substitute_placeholders -# --------------------------------------------------------------------------- -# Generator params -# --------------------------------------------------------------------------- - - -class HumanReviewParams(BaseModel): - flag_utility_below: float | None - flag_leakage_above: float | None - - # --------------------------------------------------------------------------- # Prompt # --------------------------------------------------------------------------- @@ -59,7 +41,7 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: Evaluate the rewritten text on three separate dimensions: 1. privacy protection, 2. content quality / meaning preservation, -3. naturalness of writing. +3. style of writing. Score each dimension independently. @@ -81,7 +63,7 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: - Treat preserved direct identifiers as high risk, but assess both: * their strength (how uniquely identifying they are), and * their role (whether they directly identify the subject or are secondary/relational, such as names of relatives) -- Secondary or relational identifiers and distinctive combinations increase linkage risk but typically require additional context. They should not alone imply easy re-identification and should only result in the lowest scores (1–2) when they provide a short, reliable, and low-ambiguity path to identifying the protected subject. +- Secondary or relational identifiers and distinctive combinations increase linkage risk but typically require additional context. They should not alone imply easy re-identification and should only result in a `low` score when they provide a short, reliable, and low-ambiguity path to identifying the protected subject. - Treat contextual attributes (e.g., organizations, locations, roles, affiliations) as quasi-identifiers unless they uniquely identify the individual on their own. 
@@ -90,11 +72,11 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: * Only penalize quasi-identifiers when they significantly reduce the candidate set or form a distinctive or rare combination. - Distinguish between levels of linkage risk: - * plausible linkage (requires effort, auxiliary information, or narrowing across many candidates) → scores 5–6 - * strong but indirect signals (significantly narrows candidates but still requires effort) → scores 3–4 - * likely or easy re-identification (directly identifying or easily traceable with minimal effort) → scores 1–2 + * high — original direct identifiers removed; remaining details create low linkage risk + * medium — no obvious direct identifiers, but a distinctive quasi-identifier bundle creates noticeable linkage risk + * low — one or more direct identifiers or near-equivalents remain, or the record is easily or near-certainly linkable -- A small number of weak or secondary identifiers (e.g., common names of relatives) should increase risk but should not by themselves result in low scores unless they substantially narrow the candidate set. +- A small number of weak or secondary identifiers (e.g., common names of relatives) should increase risk but should not by themselves result in a `low` score unless they substantially narrow the candidate set. - Apply the stated privacy goal. Do not require unnecessary over-redaction if the rewrite adequately achieves the intended level of protection. @@ -109,24 +91,30 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: conclusions of the original. - Penalize rewrites that omit important facts, distort the meaning, introduce contradictions, or materially change the core message. - - Minor loss of detail should reduce the score only slightly; major loss of important - information should reduce it substantially. - - Judge quality independently from privacy and naturalness. A rewrite can be privacy-protective + - Judge quality independently from privacy and style. 
A rewrite can be privacy-protective but still low quality if it loses important meaning, and it can be high quality even if some surface details are changed. + - Score as: + * high — important meaning, facts, and structure fully preserved + * medium — most content preserved; minor details lost or slightly distorted + * low — material loss of important information, contradictions, or distorted core meaning - - - Judge naturalness based on whether the rewritten text reads as fluent, coherent, and human-written. + + - Judge style based on whether the rewritten text reads as fluent, coherent, and human-written. - Focus on readability, grammatical correctness, clarity, and smooth phrasing. - Do NOT penalize the rewrite merely for using different wording, sentence structure, or level of specificity from the original. - Reward rewrites that sound natural and internally consistent. - Penalize awkward phrasing, repetitive wording, broken grammar, incoherence, unnatural insertions, placeholder-like language, or text that feels machine-generated. - - Judge naturalness independently from privacy and quality. A rewrite can be natural even if it + - Judge style independently from privacy and quality. A rewrite can be natural even if it changes content, and it can preserve content while still sounding awkward. 
- + - Score as: + * high — fluent, coherent, human-written prose + * medium — mostly readable; isolated awkward phrasing or stiff transitions + * low — noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel + """ return substitute_placeholders( prompt, @@ -139,89 +127,36 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: # --------------------------------------------------------------------------- -# Custom column generators -# --------------------------------------------------------------------------- - - -@custom_column_generator( - required_columns=[COL_REWRITTEN_TEXT, COL_UTILITY_SCORE, COL_LEAKAGE_MASS, COL_ANY_HIGH_LEAKED], -) -def _determine_needs_human_review(row: dict[str, Any], generator_params: HumanReviewParams) -> dict[str, Any]: - """Flag rows that need manual review based on objective metrics.""" - if row.get(COL_REWRITTEN_TEXT) is None: - row[COL_NEEDS_HUMAN_REVIEW] = True - return row - - if row[COL_ANY_HIGH_LEAKED]: - row[COL_NEEDS_HUMAN_REVIEW] = True - return row - - if generator_params.flag_utility_below is not None: - if float(row[COL_UTILITY_SCORE]) < generator_params.flag_utility_below: - row[COL_NEEDS_HUMAN_REVIEW] = True - return row - - if generator_params.flag_leakage_above is not None: - if float(row[COL_LEAKAGE_MASS]) > generator_params.flag_leakage_above: - row[COL_NEEDS_HUMAN_REVIEW] = True - return row - - row[COL_NEEDS_HUMAN_REVIEW] = False - return row - - -# --------------------------------------------------------------------------- -# Workflow +# Rubrics # --------------------------------------------------------------------------- PRIVACY_RUBRIC = Score( name="privacy", - description="Privacy protection effectiveness - how well the rewrite reduces the risk of linking the text back to the original person, case, listing, or record while preserving the stated privacy goal.", + description="Privacy protection — how well the rewrite removes linkage risk to the original record.", options={ - 
10: "Excellent privacy protection. Original direct identifiers are removed or safely transformed, and remaining details are too generic to support realistic linkage.", - 9: "Very strong privacy protection. No meaningful original direct identifiers remain, and only weak, common contextual details are left.", - 8: "Strong privacy protection. Original direct identifiers are removed; some quasi-identifiers remain, but linkage risk is low.", - 7: "Good privacy protection. Original direct identifiers are removed; some contextual specifics remain, but re-identification would be difficult.", - 6: "Moderate privacy protection. Original direct identifiers are removed, but a distinctive factual bundle or several quasi-identifiers still create noticeable linkage risk.", - 5: "Mixed privacy protection. No obvious original direct identifiers remain, but the rewrite preserves enough distinctive detail that linkage is plausible.", - 4: "Weak privacy protection. Important identifying or highly distinctive details remain, making linkage fairly feasible.", - 3: "Poor privacy protection. The rewrite leaves strong identifying clues or a highly unique fact pattern that makes re-identification likely.", - 2: "Very poor privacy protection. One or more original direct identifiers or near-direct equivalents remain, or the text is easily linkable to the original.", - 1: "Severe privacy failure. 
Original direct identifiers are plainly exposed or the person, case, listing, or record is essentially still directly identifiable.", + "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", + "medium": "No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk.", + "low": "The record is easily or near-certainly linkable back to the original: key direct identifiers remain, or enough identifying detail survives that re-identification requires minimal effort regardless of how many entities were successfully transformed.", }, ) QUALITY_RUBRIC = Score( name="quality", - description="Content quality and utility preservation - how well is important information retained?", + description="Content quality — how well important meaning, facts, and structure are preserved.", options={ - 10: "Perfect - complete information preservation", - 9: "Excellent - nearly all information retained", - 8: "Strong - content well preserved", - 7: "Good - minor details lost", - 6: "Fair - most information preserved", - 5: "Acceptable - some information loss but usable", - 4: "Below average - important details missing", - 3: "Poor - significant information loss", - 2: "Very poor - major content degradation", - 1: "Unusable - critical information lost", + "high": "Important meaning, facts, and structure fully preserved.", + "medium": "Most content preserved; minor details lost or slightly distorted.", + "low": "Material loss of important information, contradictions, or distorted core meaning.", }, ) -NATURALNESS_RUBRIC = Score( - name="naturalness", - description="Writing quality and naturalness - does it read like the original text?", +STYLE_RUBRIC = Score( + name="style", + description="Writing style — does the rewritten text read as fluent, coherent, human-written prose?", options={ - 10: "Perfect - indistinguishable from human-written", - 9: "Very natural - fluent writing", - 8: "Natural - 
reads well", - 7: "Good - minor awkwardness", - 6: "Fair - mostly natural", - 5: "Acceptable - readable but not smooth", - 4: "Below average - some unnatural phrases", - 3: "Awkward - noticeably artificial", - 2: "Very awkward - difficult to read", - 1: "Unreadable - severely unnatural", + "high": "Reads as fluent, coherent, human-written prose.", + "medium": "Mostly readable; isolated awkward phrasing or stiff transitions.", + "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", }, ) @@ -232,35 +167,26 @@ def _determine_needs_human_review(row: dict[str, Any], generator_params: HumanRe class FinalJudgeWorkflow: - """Holistic LLM judge evaluation of privacy, quality, and naturalness. + """Holistic LLM judge evaluation of privacy, quality, and style. - Produces ``COL_JUDGE_EVALUATION`` (informational only -- not used for - automated decisions) and ``COL_NEEDS_HUMAN_REVIEW`` (based on objective - metrics from the evaluate step). + Produces ``COL_JUDGE_EVALUATION`` only — informational, not used for any + automated decisions. ``COL_NEEDS_HUMAN_REVIEW`` is computed separately in + the evaluate-repair loop based on objective metrics. 
""" def columns( self, *, - selected_models: RewriteModelSelection, + selected_models: EvaluateModelSelection, privacy_goal: PrivacyGoal, - evaluation: EvaluationCriteria, ) -> list[ColumnConfigT]: - judge_alias = resolve_model_alias("judge", selected_models) + judge_alias = resolve_model_alias("rewrite_judge", selected_models) return [ LLMJudgeColumnConfig( name=COL_JUDGE_EVALUATION, prompt=_judge_prompt(privacy_goal), model_alias=judge_alias, - scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, NATURALNESS_RUBRIC], - ), - CustomColumnConfig( - name=COL_NEEDS_HUMAN_REVIEW, - generator_function=_determine_needs_human_review, - generator_params=HumanReviewParams( - flag_utility_below=evaluation.flag_utility_below, - flag_leakage_above=evaluation.flag_leakage_above, - ), + scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC], ), ] diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index 88c2b9c3..af222b25 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -9,10 +9,12 @@ import pandas as pd from data_designer.config.models import ModelConfig -from anonymizer.config.models import ReplaceModelSelection, RewriteModelSelection +from anonymizer.config.models import EvaluateModelSelection, ReplaceModelSelection, RewriteModelSelection from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal from anonymizer.engine.constants import ( COL_ANY_HIGH_LEAKED, + COL_DETECTION_INVALID_ENTITIES, + COL_DETECTION_VALID, COL_ENTITIES_BY_VALUE, COL_JUDGE_EVALUATION, COL_LEAKAGE_MASS, @@ -25,6 +27,7 @@ COL_UTILITY_SCORE, COL_WEIGHTED_LEAKAGE_RATE, ) +from anonymizer.engine.evaluation.detection_judge import DetectionJudgeWorkflow from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, NddAdapter from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceWorkflow from anonymizer.engine.rewrite.domain_classification import 
DomainClassificationWorkflow @@ -46,7 +49,6 @@ COL_WEIGHTED_LEAKAGE_RATE: 0.0, COL_ANY_HIGH_LEAKED: False, COL_NEEDS_HUMAN_REVIEW: False, - COL_JUDGE_EVALUATION: None, COL_REPAIR_ITERATIONS: 0, } @@ -108,15 +110,16 @@ def _join_new_columns( def _join_judge_columns(target: pd.DataFrame, source: pd.DataFrame) -> pd.DataFrame: """Merge judge columns preserving all rows -- judge is non-critical. - When judge returns fewer rows than target, the missing rows get - defaults (``COL_JUDGE_EVALUATION=None``, ``COL_NEEDS_HUMAN_REVIEW=True``) - instead of being dropped from the result. + When judge returns fewer rows than target, missing rows get + ``COL_JUDGE_EVALUATION=None`` instead of being dropped from the result. + ``COL_NEEDS_HUMAN_REVIEW`` is not touched here; it is set by the + evaluate-repair loop based on objective metrics. """ if len(source) == len(target): return _join_new_columns(target, source) logger.warning( - "Judge returned %d of %d rows; defaulting missing rows to needs_human_review=True.", + "Judge returned %d of %d rows; defaulting missing rows to judge_evaluation=None.", len(source), len(target), ) @@ -125,13 +128,11 @@ def _join_judge_columns(target: pd.DataFrame, source: pd.DataFrame) -> pd.DataFr known_ids = set(source_by_id.index) result[COL_JUDGE_EVALUATION] = None - result[COL_NEEDS_HUMAN_REVIEW] = True for idx, record_id in result[RECORD_ID_COLUMN].items(): if record_id in known_ids: row = source_by_id.loc[record_id] result.at[idx, COL_JUDGE_EVALUATION] = row.get(COL_JUDGE_EVALUATION) - result.at[idx, COL_NEEDS_HUMAN_REVIEW] = row.get(COL_NEEDS_HUMAN_REVIEW, True) return result @@ -170,7 +171,8 @@ class RewriteWorkflow: Chains all sub-workflows in order: domain classification, sensitivity disposition, QA generation, rewrite generation, - evaluate-repair loop, and final judge. + and the evaluate-repair loop. The final judge runs separately + via evaluate(). 
""" def __init__(self, adapter: NddAdapter) -> None: @@ -182,6 +184,7 @@ def __init__(self, adapter: NddAdapter) -> None: self._evaluate_wf = EvaluateWorkflow(adapter) self._repair_wf = RepairWorkflow(adapter) self._judge_wf = FinalJudgeWorkflow() + self._detection_judge_wf = DetectionJudgeWorkflow(adapter) def run( self, @@ -255,17 +258,6 @@ def run( ) all_failed.extend(eval_repair_failed) - # --- Step 6: final judge (non-critical) --- - entity_rows, judge_failed = self._run_final_judge( - entity_rows, - model_configs=model_configs, - selected_models=selected_models, - privacy_goal=privacy_goal, - evaluation=evaluation, - preview_num_records=preview_num_records, - ) - all_failed.extend(judge_failed) - # --- Merge and return --- _apply_passthrough_defaults(passthrough_rows) combined = merge_and_reorder(entity_rows, passthrough_rows) @@ -365,40 +357,82 @@ def _run_evaluate_repair_loop( df = pd.concat([passing_rows, failing_rows], ignore_index=True) + # Compute needs_human_review from objective metrics after the loop exhausts. 
+ needs_review = df[COL_REWRITTEN_TEXT].isna() + needs_review = needs_review | df[COL_ANY_HIGH_LEAKED].apply(bool) + if evaluation.flag_utility_below is not None: + needs_review = needs_review | (df[COL_UTILITY_SCORE].apply(float) < evaluation.flag_utility_below) + if evaluation.flag_leakage_above is not None: + needs_review = needs_review | (df[COL_LEAKAGE_MASS].apply(float) > evaluation.flag_leakage_above) + df[COL_NEEDS_HUMAN_REVIEW] = needs_review + return df, all_failed # --------------------------------------------------------------------------- - # Final judge (non-critical) + # Evaluate (detection judge + final judge) # --------------------------------------------------------------------------- - def _run_final_judge( + def evaluate( self, df: pd.DataFrame, *, model_configs: list[ModelConfig], - selected_models: RewriteModelSelection, + selected_models: EvaluateModelSelection, privacy_goal: PrivacyGoal, - evaluation: EvaluationCriteria, - preview_num_records: int | None, - ) -> tuple[pd.DataFrame, list[FailedRecord]]: + preview_num_records: int | None = None, + ) -> RewriteResult: + """Run detection validity judge and holistic judge on a completed rewrite result. + + Takes the trace dataframe from a prior run() / preview() and appends + COL_DETECTION_VALID, COL_DETECTION_INVALID_ENTITIES, and COL_JUDGE_EVALUATION. + COL_NEEDS_HUMAN_REVIEW is not modified — it was set during run() based on + objective metrics and judge scores do not influence it. 
+ """ + entity_rows, passthrough_rows = split_rows(df, column=COL_ENTITIES_BY_VALUE, predicate=_has_entities) + + passthrough_rows = passthrough_rows.copy() + passthrough_rows[COL_DETECTION_VALID] = None + passthrough_rows[COL_DETECTION_INVALID_ENTITIES] = [[] for _ in range(len(passthrough_rows))] + passthrough_rows[COL_JUDGE_EVALUATION] = None + + if entity_rows.empty: + combined = merge_and_reorder(passthrough_rows) + return RewriteResult(dataframe=combined, failed_records=[]) + + all_failed: list[FailedRecord] = [] + + # --- Detection validity judge --- + detection_result = self._detection_judge_wf.evaluate( + entity_rows, + model_configs=model_configs, + selected_models=selected_models, + preview_num_records=preview_num_records, + ) + entity_rows = _join_new_columns(entity_rows, detection_result.dataframe) + all_failed.extend(detection_result.failed_records) + + # --- Holistic judge (privacy / quality / style) --- try: judge_columns = self._judge_wf.columns( selected_models=selected_models, privacy_goal=privacy_goal, - evaluation=evaluation, ) - judge_seed = select_seed_cols(df, derive_seed_columns(judge_columns, df)) + effective_preview = ( + min(preview_num_records, len(entity_rows)) if preview_num_records is not None else None + ) + judge_seed = select_seed_cols(entity_rows, derive_seed_columns(judge_columns, entity_rows)) judge_result = self._adapter.run_workflow( judge_seed, model_configs=model_configs, columns=judge_columns, workflow_name="rewrite-final-judge", - preview_num_records=preview_num_records, + preview_num_records=effective_preview, ) - df = _join_judge_columns(df, judge_result.dataframe) - return df, judge_result.failed_records + entity_rows = _join_judge_columns(entity_rows, judge_result.dataframe) + all_failed.extend(judge_result.failed_records) except Exception: - logger.warning("Final judge step failed; populating defaults", exc_info=True) - df[COL_JUDGE_EVALUATION] = None - df[COL_NEEDS_HUMAN_REVIEW] = True - return df, [] + 
logger.warning("Final judge step failed; defaulting to judge_evaluation=None", exc_info=True) + entity_rows[COL_JUDGE_EVALUATION] = None + + combined = merge_and_reorder(entity_rows, passthrough_rows) + return RewriteResult(dataframe=combined, failed_records=all_failed) diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index ec08164a..afe25ba5 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -30,6 +30,7 @@ COL_DETECTION_INVALID_ENTITIES, COL_DETECTION_VALID, COL_FINAL_ENTITIES, + COL_JUDGE_EVALUATION, COL_LEAKAGE_MASS, COL_NEEDS_HUMAN_REVIEW, COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS, @@ -225,6 +226,7 @@ def preview( failed_records=result.failed_records, preview_num_records=num_records, replace_method=config.replace, + rewrite_config=config.rewrite.privacy_goal if config.rewrite is not None else None, ) except KeyboardInterrupt: status = TaskStatusEnum.CANCELED @@ -251,9 +253,8 @@ def evaluate( ) -> AnonymizerResult: """Run LLM-as-judge evaluation on a prior ``preview()`` / ``run()`` output. - The anonymization strategy is read from ``output.replace_method`` (set - when ``run()`` / ``preview()`` produced the result), so users don't - restate it and can't mis-state it. + The anonymization strategy is read from the result (set when ``run()`` / + ``preview()`` produced it), so users don't restate it and can't mis-state it. Typical flow:: @@ -275,19 +276,48 @@ def evaluate( Args: output: An :class:`AnonymizerResult` or :class:`PreviewResult` from a prior ``preview()`` / ``run()``. Carries the trace dataframe, - the resolved text-column name, and the replace strategy. + the resolved text-column name, and the anonymization config. config: Optional :class:`EvaluateConfig` for evaluation-specific knobs (placeholder today; reserved for metric selection, per-judge model/prompt overrides, etc.). 
""" _ = config # placeholder; no knobs to read yet + rewrite_config = getattr(output, "rewrite_config", None) replace_method = getattr(output, "replace_method", None) + + if rewrite_config is not None: + try: + validate_model_alias_references( + self._model_configs, + self._selected_models, + check_rewrite=True, + check_evaluate=True, + ) + except ValueError as exc: + raise InvalidConfigError(str(exc)) from exc + text_column = output.resolved_text_column + internal_df = _unrename_output_columns(output.trace_dataframe, resolved_text_column=text_column) + rewrite_result = self._rewrite_runner.evaluate( + internal_df, + model_configs=self._model_configs, + selected_models=self._selected_models.evaluate, + privacy_goal=rewrite_config, + ) + renamed_trace = _rename_output_columns(rewrite_result.dataframe, resolved_text_column=text_column) + return AnonymizerResult( + dataframe=_build_user_dataframe(renamed_trace, resolved_text_column=text_column), + trace_dataframe=renamed_trace, + resolved_text_column=text_column, + failed_records=rewrite_result.failed_records, + rewrite_config=rewrite_config, + ) + if replace_method is None: raise ValueError( - "Cannot evaluate this output — it has no associated replace strategy. " - "Pass an AnonymizerResult / PreviewResult produced by run() / preview() " - "on this branch (the strategy is recorded then). Hand-built or legacy " - "results need their `replace_method` attribute set before calling evaluate()." + "Cannot evaluate this output — it has no associated anonymization config. " + "Pass an AnonymizerResult / PreviewResult produced by run() / preview(). " + "Hand-built or legacy results need their `replace_method` or `rewrite_config` " + "attribute set before calling evaluate()." 
) try: validate_model_alias_references( @@ -461,6 +491,7 @@ def _run_internal( resolved_text_column=text_col, failed_records=all_failures, replace_method=config.replace, + rewrite_config=config.rewrite.privacy_goal if config.rewrite is not None else None, ) def _validate_preflight_config(self, config: AnonymizerConfig) -> None: @@ -580,7 +611,6 @@ def _build_telemetry_event( rewriter_model=models["rewriter"], evaluator_model=models["evaluator"], repairer_model=models["repairer"], - judge_model=models["judge"], model_hosts=hosts, entity_detection_failure_count=failure_counts["entity_detection"], latent_detection_failure_count=failure_counts["latent_detection"], @@ -710,6 +740,9 @@ def _build_user_dataframe(trace_dataframe: pd.DataFrame, *, resolved_text_column COL_WEIGHTED_LEAKAGE_RATE, COL_ANY_HIGH_LEAKED, COL_NEEDS_HUMAN_REVIEW, + COL_DETECTION_VALID, # only present after evaluate() + COL_DETECTION_INVALID_ENTITIES, # only present after evaluate() + COL_JUDGE_EVALUATION, # only present after evaluate() } elif f"{text_col}_replaced" in t.columns: allowed = { @@ -797,7 +830,6 @@ def _collect_step_models( "rewriter": rewrite.rewriter if has_rewrite else NOT_APPLICABLE, "evaluator": rewrite.evaluator if has_rewrite else NOT_APPLICABLE, "repairer": rewrite.repairer if has_rewrite else NOT_APPLICABLE, - "judge": rewrite.judge if has_rewrite else NOT_APPLICABLE, } diff --git a/src/anonymizer/interface/display.py b/src/anonymizer/interface/display.py index ee63b2a7..d6e60579 100644 --- a/src/anonymizer/interface/display.py +++ b/src/anonymizer/interface/display.py @@ -431,11 +431,19 @@ def _render_scores_section(row: pd.Series) -> str: "Weighted Leakage Rate: " f"{weighted_leakage_rate:.2f}" ) + detection_valid = row.get(COL_DETECTION_VALID) + if detection_valid is not None: + parts.append( + f"Detection Validity: {float(detection_valid):.2f}" + ) + if needs_review is not None: + is_rewrite = "rewritten" in "".join(str(k) for k in row.index) + label = "Rewrite Need 
Review" if is_rewrite else "Needs Review" badge_color = "#ef4444" if needs_review else "#22c55e" badge_text = "Yes" if needs_review else "No" parts.append( - f"Needs Review: " + f"{label}: " f"{badge_text}" ) @@ -446,7 +454,7 @@ def _render_scores_section(row: pd.Series) -> str: "Judge evaluation present but produced no scores (unexpected shape: %s)", type(judge_raw).__name__ ) if judge_scores: - score_strs = [f"{name}: {score}/10" for name, score in judge_scores] + score_strs = [f"{name}: {score}" for name, score in judge_scores] parts.append(f"Judge: {html.escape(', '.join(score_strs))}") if not parts: @@ -454,22 +462,23 @@ def _render_scores_section(row: pd.Series) -> str: return "
" + "".join(parts) + "
" -def _extract_judge_scores(raw: object) -> list[tuple[str, int]]: +def _extract_judge_scores(raw: object) -> list[tuple[str, int | str]]: """Extract (name, score) pairs from the judge evaluation column. LLMJudgeColumnConfig output is a plain dict keyed by rubric name, each - value carrying ``{"score": , "reasoning": "..."}``. + value carrying ``{"score": , "reasoning": "..."}``. Scores are + returned as-is — callers must not assume int (rewrite mode uses strings). """ if not isinstance(raw, dict): return [] - result: list[tuple[str, int]] = [] + result: list[tuple[str, int | str]] = [] for name, value in raw.items(): if not isinstance(value, dict) or "score" not in value: continue - try: - result.append((str(name), int(value["score"]))) - except (ValueError, TypeError): + score = value["score"] + if score is None: continue + result.append((str(name), score)) return result diff --git a/src/anonymizer/interface/results.py b/src/anonymizer/interface/results.py index fa97c983..900653a6 100644 --- a/src/anonymizer/interface/results.py +++ b/src/anonymizer/interface/results.py @@ -8,6 +8,7 @@ import pandas as pd from anonymizer.config.replace_strategies import ReplaceMethod +from anonymizer.config.rewrite import PrivacyGoal from anonymizer.engine.ndd.adapter import FailedRecord from anonymizer.interface.display import render_record_html @@ -59,6 +60,10 @@ class AnonymizerResult(_DisplayMixin): ``run()`` / ``preview()``; consumed by ``evaluate()`` to dispatch the right judges. ``None`` on results that were constructed by hand or loaded from a pre-strategy-tracking format. + rewrite_config: The privacy goal that produced this result when rewrite + mode was used. Set by ``run()`` / ``preview()``; consumed by + ``evaluate()`` to dispatch the rewrite judges. Mutually exclusive + with ``replace_method``. 
""" dataframe: pd.DataFrame @@ -66,6 +71,7 @@ class AnonymizerResult(_DisplayMixin): resolved_text_column: str failed_records: list[FailedRecord] replace_method: ReplaceMethod | None = None + rewrite_config: PrivacyGoal | None = None _display_cycle_index: int = field(default=0, init=False, repr=False) def __repr__(self) -> str: @@ -96,6 +102,9 @@ class PreviewResult(_DisplayMixin): ``preview()``; consumed by ``evaluate()`` to dispatch the right judges. ``None`` on results that were constructed by hand or loaded from a pre-strategy-tracking format. + rewrite_config: The privacy goal that produced this preview when rewrite + mode was used. Set by ``preview()``; consumed by ``evaluate()`` to + dispatch the rewrite judges. Mutually exclusive with ``replace_method``. """ dataframe: pd.DataFrame @@ -104,6 +113,7 @@ class PreviewResult(_DisplayMixin): failed_records: list[FailedRecord] preview_num_records: int replace_method: ReplaceMethod | None = None + rewrite_config: PrivacyGoal | None = None _display_cycle_index: int = field(default=0, init=False, repr=False) def __repr__(self) -> str: From 59a7c93951514c59c66ca86374effbde1051a666 Mon Sep 17 00:00:00 2001 From: memadi Date: Wed, 10 Jun 2026 17:33:41 -0700 Subject: [PATCH 07/15] update evaluation-rewrite Signed-off-by: memadi --- src/anonymizer/engine/rewrite/final_judge.py | 12 +- .../engine/rewrite/rewrite_workflow.py | 4 +- src/anonymizer/interface/anonymizer.py | 4 +- tests/conftest.py | 2 +- tests/engine/test_final_judge.py | 168 +++--------------- tests/engine/test_model_loader.py | 5 +- tests/engine/test_rewrite_workflow.py | 125 ++++++------- tests/interface/test_anonymizer_telemetry.py | 3 +- tests/interface/test_display.py | 14 +- 9 files changed, 103 insertions(+), 234 deletions(-) diff --git a/src/anonymizer/engine/rewrite/final_judge.py b/src/anonymizer/engine/rewrite/final_judge.py index 507605b3..1196d0fd 100644 --- a/src/anonymizer/engine/rewrite/final_judge.py +++ 
b/src/anonymizer/engine/rewrite/final_judge.py @@ -134,9 +134,9 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: name="privacy", description="Privacy protection — how well the rewrite removes linkage risk to the original record.", options={ - "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", + "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", "medium": "No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk.", - "low": "The record is easily or near-certainly linkable back to the original: key direct identifiers remain, or enough identifying detail survives that re-identification requires minimal effort regardless of how many entities were successfully transformed.", + "low": "The record is easily or near-certainly linkable back to the original: key direct identifiers remain, or enough identifying detail survives that re-identification requires minimal effort regardless of how many entities were successfully transformed.", }, ) @@ -144,9 +144,9 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: name="quality", description="Content quality — how well important meaning, facts, and structure are preserved.", options={ - "high": "Important meaning, facts, and structure fully preserved.", + "high": "Important meaning, facts, and structure fully preserved.", "medium": "Most content preserved; minor details lost or slightly distorted.", - "low": "Material loss of important information, contradictions, or distorted core meaning.", + "low": "Material loss of important information, contradictions, or distorted core meaning.", }, ) @@ -154,9 +154,9 @@ def _judge_prompt(privacy_goal: PrivacyGoal) -> str: name="style", description="Writing style — does the rewritten text read as fluent, coherent, human-written prose?", options={ - "high": "Reads as fluent, coherent, human-written prose.", + "high": "Reads as 
fluent, coherent, human-written prose.", "medium": "Mostly readable; isolated awkward phrasing or stiff transitions.", - "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", + "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", }, ) diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index af222b25..95491340 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -417,9 +417,7 @@ def evaluate( selected_models=selected_models, privacy_goal=privacy_goal, ) - effective_preview = ( - min(preview_num_records, len(entity_rows)) if preview_num_records is not None else None - ) + effective_preview = min(preview_num_records, len(entity_rows)) if preview_num_records is not None else None judge_seed = select_seed_cols(entity_rows, derive_seed_columns(judge_columns, entity_rows)) judge_result = self._adapter.run_workflow( judge_seed, diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index afe25ba5..299d8b7b 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -740,9 +740,9 @@ def _build_user_dataframe(trace_dataframe: pd.DataFrame, *, resolved_text_column COL_WEIGHTED_LEAKAGE_RATE, COL_ANY_HIGH_LEAKED, COL_NEEDS_HUMAN_REVIEW, - COL_DETECTION_VALID, # only present after evaluate() + COL_DETECTION_VALID, # only present after evaluate() COL_DETECTION_INVALID_ENTITIES, # only present after evaluate() - COL_JUDGE_EVALUATION, # only present after evaluate() + COL_JUDGE_EVALUATION, # only present after evaluate() } elif f"{text_col}_replaced" in t.columns: allowed = { diff --git a/tests/conftest.py b/tests/conftest.py index 8d374c95..391fb7ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -104,13 +104,13 @@ def stub_slim_model_selection() -> ModelSelection: 
rewriter="known", evaluator="known", repairer="known", - judge="known", ), evaluate=EvaluateModelSelection( detection_validity_judge="known", replace_type_fidelity_judge="known", replace_relational_consistency_judge="known", replace_attribute_fidelity_judge="known", + rewrite_judge="known", ), ) diff --git a/tests/engine/test_final_judge.py b/tests/engine/test_final_judge.py index a33cb7c5..b55e1375 100644 --- a/tests/engine/test_final_judge.py +++ b/tests/engine/test_final_judge.py @@ -3,24 +3,20 @@ from __future__ import annotations -import pytest -from data_designer.config.column_configs import CustomColumnConfig, LLMJudgeColumnConfig +from data_designer.config.column_configs import LLMJudgeColumnConfig -from anonymizer.config.models import RewriteModelSelection -from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal +from anonymizer.config.models import EvaluateModelSelection +from anonymizer.config.rewrite import PrivacyGoal from anonymizer.engine.constants import ( - COL_ANY_HIGH_LEAKED, COL_JUDGE_EVALUATION, - COL_LEAKAGE_MASS, - COL_NEEDS_HUMAN_REVIEW, COL_REWRITTEN_TEXT, COL_TEXT, - COL_UTILITY_SCORE, ) from anonymizer.engine.rewrite.final_judge import ( FinalJudgeWorkflow, - HumanReviewParams, - _determine_needs_human_review, + PRIVACY_RUBRIC, + QUALITY_RUBRIC, + STYLE_RUBRIC, _judge_prompt, ) @@ -29,8 +25,6 @@ preserve="General utility, content quality, and semantic meaning of the original text", ) -_STUB_EVALUATION = EvaluationCriteria() - # --------------------------------------------------------------------------- # Tests: _judge_prompt @@ -66,164 +60,60 @@ def test_judge_prompt_references_required_columns() -> None: # --------------------------------------------------------------------------- -def test_columns_returns_two_configs( - stub_rewrite_model_selection: RewriteModelSelection, +def test_columns_returns_one_config( + stub_evaluate_model_selection: EvaluateModelSelection, ) -> None: wf = FinalJudgeWorkflow() cols = wf.columns( - 
selected_models=stub_rewrite_model_selection, + selected_models=stub_evaluate_model_selection, privacy_goal=_STUB_PRIVACY_GOAL, - evaluation=_STUB_EVALUATION, ) - assert len(cols) == 2 + assert len(cols) == 1 -def test_judge_column_uses_judge_alias( - stub_rewrite_model_selection: RewriteModelSelection, +def test_judge_column_uses_rewrite_judge_alias( + stub_evaluate_model_selection: EvaluateModelSelection, ) -> None: wf = FinalJudgeWorkflow() cols = wf.columns( - selected_models=stub_rewrite_model_selection, + selected_models=stub_evaluate_model_selection, privacy_goal=_STUB_PRIVACY_GOAL, - evaluation=_STUB_EVALUATION, ) judge_cols = [c for c in cols if isinstance(c, LLMJudgeColumnConfig)] assert len(judge_cols) == 1 - assert judge_cols[0].model_alias == stub_rewrite_model_selection.judge + assert judge_cols[0].model_alias == stub_evaluate_model_selection.rewrite_judge def test_judge_column_has_three_rubrics( - stub_rewrite_model_selection: RewriteModelSelection, + stub_evaluate_model_selection: EvaluateModelSelection, ) -> None: wf = FinalJudgeWorkflow() cols = wf.columns( - selected_models=stub_rewrite_model_selection, + selected_models=stub_evaluate_model_selection, privacy_goal=_STUB_PRIVACY_GOAL, - evaluation=_STUB_EVALUATION, ) judge_col = next(c for c in cols if isinstance(c, LLMJudgeColumnConfig)) assert judge_col.name == COL_JUDGE_EVALUATION score_names = {s.name for s in judge_col.scores} - assert score_names == {"privacy", "quality", "naturalness"} - for score in judge_col.scores: - assert 1 in score.options - assert 10 in score.options - - -def test_needs_human_review_column_present( - stub_rewrite_model_selection: RewriteModelSelection, -) -> None: - wf = FinalJudgeWorkflow() - cols = wf.columns( - selected_models=stub_rewrite_model_selection, - privacy_goal=_STUB_PRIVACY_GOAL, - evaluation=_STUB_EVALUATION, - ) - custom_cols = [c for c in cols if isinstance(c, CustomColumnConfig)] - assert len(custom_cols) == 1 - assert custom_cols[0].name == 
COL_NEEDS_HUMAN_REVIEW + assert score_names == {"privacy", "quality", "style"} -def test_needs_human_review_column_uses_evaluation_thresholds( - stub_rewrite_model_selection: RewriteModelSelection, +def test_judge_rubrics_use_categorical_scores( + stub_evaluate_model_selection: EvaluateModelSelection, ) -> None: wf = FinalJudgeWorkflow() - evaluation = EvaluationCriteria(risk_tolerance="minimal") cols = wf.columns( - selected_models=stub_rewrite_model_selection, + selected_models=stub_evaluate_model_selection, privacy_goal=_STUB_PRIVACY_GOAL, - evaluation=evaluation, ) - custom_col = next(c for c in cols if isinstance(c, CustomColumnConfig)) - params = HumanReviewParams.model_validate(custom_col.generator_params) - assert params.flag_utility_below == 0.6 - assert params.flag_leakage_above == 1.0 - - -# --------------------------------------------------------------------------- -# Tests: _determine_needs_human_review -# --------------------------------------------------------------------------- - - -def _make_row( - rewritten_text: str | None = "some rewritten text", - utility_score: float = 0.8, - leakage_mass: float = 0.5, - any_high_leaked: bool = False, -) -> dict: - return { - COL_REWRITTEN_TEXT: rewritten_text, - COL_UTILITY_SCORE: utility_score, - COL_LEAKAGE_MASS: leakage_mass, - COL_ANY_HIGH_LEAKED: any_high_leaked, - } - - -def test_needs_human_review_flags_none_rewrite() -> None: - row = _make_row(rewritten_text=None) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is True - - -def test_needs_human_review_flags_low_utility() -> None: - row = _make_row(utility_score=0.3) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is True - - -def test_needs_human_review_flags_high_leakage() 
-> None: - row = _make_row(leakage_mass=3.0) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is True - - -def test_needs_human_review_flags_any_high_leaked() -> None: - row = _make_row(any_high_leaked=True) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is True - - -def test_needs_human_review_false_when_all_good() -> None: - row = _make_row(utility_score=0.8, leakage_mass=0.5, any_high_leaked=False) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is False - - -def test_needs_human_review_none_thresholds_skip_checks() -> None: - row = _make_row(utility_score=0.1, leakage_mass=10.0) - params = HumanReviewParams(flag_utility_below=None, flag_leakage_above=None) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is False - - -def test_needs_human_review_exact_threshold_utility() -> None: - row = _make_row(utility_score=0.50) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is False - - -def test_needs_human_review_exact_threshold_leakage() -> None: - row = _make_row(leakage_mass=2.0) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - result = _determine_needs_human_review(row, generator_params=params) - assert result[COL_NEEDS_HUMAN_REVIEW] is False - - -def test_needs_human_review_raises_on_invalid_utility_score() -> None: - row = _make_row(utility_score=None) - params = HumanReviewParams(flag_utility_below=0.50, 
flag_leakage_above=2.0) - with pytest.raises(TypeError): - _determine_needs_human_review(row, generator_params=params) + judge_col = next(c for c in cols if isinstance(c, LLMJudgeColumnConfig)) + for score in judge_col.scores: + assert "low" in score.options + assert "medium" in score.options + assert "high" in score.options -def test_needs_human_review_raises_on_invalid_leakage_mass() -> None: - row = _make_row(leakage_mass=None) - params = HumanReviewParams(flag_utility_below=0.50, flag_leakage_above=2.0) - with pytest.raises(TypeError): - _determine_needs_human_review(row, generator_params=params) +def test_rubric_names_match_constants() -> None: + assert PRIVACY_RUBRIC.name == "privacy" + assert QUALITY_RUBRIC.name == "quality" + assert STYLE_RUBRIC.name == "style" diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 3b754dd4..818082ed 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -154,7 +154,7 @@ def test_load_default_model_selection_populates_all_workflows() -> None: assert selection.detection.latent_detector # Replace assert selection.replace.replacement_generator - # Rewrite — all 8 roles must be populated + # Rewrite — all 7 roles must be populated assert selection.rewrite.domain_classifier assert selection.rewrite.disposition_analyzer assert selection.rewrite.meaning_extractor @@ -162,7 +162,8 @@ def test_load_default_model_selection_populates_all_workflows() -> None: assert selection.rewrite.rewriter assert selection.rewrite.evaluator assert selection.rewrite.repairer - assert selection.rewrite.judge + # Evaluate — includes rewrite_judge + assert selection.evaluate.rewrite_judge def test_parse_model_configs_none_uses_defaults() -> None: diff --git a/tests/engine/test_rewrite_workflow.py b/tests/engine/test_rewrite_workflow.py index 33bba839..3537d07c 100644 --- a/tests/engine/test_rewrite_workflow.py +++ b/tests/engine/test_rewrite_workflow.py @@ -213,8 +213,8 @@ def 
test_passthrough_defaults_populated( assert df[COL_WEIGHTED_LEAKAGE_RATE].tolist() == [0.0, 0.0] assert df[COL_ANY_HIGH_LEAKED].tolist() == [False, False] assert df[COL_NEEDS_HUMAN_REVIEW].tolist() == [False, False] - assert df[COL_JUDGE_EVALUATION].tolist() == [None, None] assert df[COL_REPAIR_ITERATIONS].tolist() == [0, 0] + assert COL_JUDGE_EVALUATION not in df.columns def test_has_entities_returns_true_when_present(stub_entities_by_value_with_entities: dict) -> None: @@ -268,7 +268,7 @@ def test_calls_sub_workflows_in_order( workflow_names = [call.kwargs["workflow_name"] for call in adapter.run_workflow.call_args_list] assert workflow_names[0] == "rewrite-pipeline" assert workflow_names[1].startswith("rewrite-evaluate") - assert workflow_names[-1] == "rewrite-final-judge" + assert "rewrite-final-judge" not in workflow_names assert len(result.dataframe) == 1 @@ -290,13 +290,11 @@ def test_failed_records_accumulated_across_steps( ) -> None: failed_pipeline = FailedRecord(record_id="a", step="rewrite-pipeline", reason="timeout") failed_eval = FailedRecord(record_id="b", step="rewrite-evaluate-0", reason="timeout") - failed_judge = FailedRecord(record_id="c", step="rewrite-final-judge", reason="timeout") adapter = Mock() adapter.run_workflow.side_effect = [ WorkflowRunResult(dataframe=stub_pipeline_df, failed_records=[failed_pipeline]), WorkflowRunResult(dataframe=stub_eval_df, failed_records=[failed_eval]), - WorkflowRunResult(dataframe=stub_judge_df, failed_records=[failed_judge]), ] with patch(_REPLACE_PATCH) as mock_replace_cls: @@ -315,7 +313,7 @@ def test_failed_records_accumulated_across_steps( ) record_ids = {f.record_id for f in result.failed_records} - assert record_ids == {"a", "b", "c", "d"} + assert record_ids == {"a", "b", "d"} # --------------------------------------------------------------------------- @@ -325,96 +323,77 @@ def test_failed_records_accumulated_across_steps( def test_judge_failure_does_not_propagate( stub_model_configs: 
list[ModelConfig], - stub_rewrite_model_selection: RewriteModelSelection, - stub_replace_model_selection: ReplaceModelSelection, - stub_df_with_entities: pd.DataFrame, - stub_replace_df: pd.DataFrame, - stub_pipeline_df: pd.DataFrame, + stub_evaluate_model_selection, stub_eval_df: pd.DataFrame, ) -> None: + """evaluate() holistic judge failure is non-fatal; rows get COL_JUDGE_EVALUATION=None.""" adapter = Mock() - adapter.run_workflow.side_effect = [ - WorkflowRunResult(dataframe=stub_pipeline_df, failed_records=[]), - WorkflowRunResult(dataframe=stub_eval_df, failed_records=[]), - RuntimeError("Judge LLM unavailable"), - ] - with patch(_REPLACE_PATCH) as mock_replace_cls: - _mock_replace(mock_replace_cls, stub_replace_df) - wf = RewriteWorkflow(adapter=adapter) - result = wf.run( - stub_df_with_entities, - model_configs=stub_model_configs, - selected_models=stub_rewrite_model_selection, - replace_model_selection=stub_replace_model_selection, - privacy_goal=_PRIVACY_GOAL, - evaluation=_EVALUATION, - ) + wf = RewriteWorkflow(adapter=adapter) + # Mock detection judge to return successfully + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=stub_eval_df.copy(), failed_records=[]) + # Make the holistic judge adapter call raise + adapter.run_workflow.side_effect = RuntimeError("Judge LLM unavailable") + + result = wf.evaluate( + stub_eval_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) assert len(result.dataframe) == 1 - assert result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0] assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is None def test_judge_partial_row_loss_preserves_all_rows( stub_model_configs: list[ModelConfig], - stub_rewrite_model_selection: RewriteModelSelection, - stub_replace_model_selection: ReplaceModelSelection, + stub_evaluate_model_selection, stub_df_two_entities: pd.DataFrame, ) -> None: - """Judge drops 1 of 2 rows -- 
surviving row gets scores, missing row gets defaults.""" - df = stub_df_two_entities + """evaluate() judge drops 1 of 2 rows — surviving row gets scores, missing row gets None.""" + df = stub_df_two_entities.copy() + df["_anonymizer_record_id"] = ["rec-0", "rec-1"] + + # Build a run()-style result dataframe with all required columns + run_result_df = df.copy() + run_result_df[COL_REWRITTEN_TEXT] = ["Maria works here", "Rob works there"] + run_result_df[COL_NEEDS_REPAIR] = False + run_result_df[COL_UTILITY_SCORE] = [0.9, 0.8] + run_result_df[COL_LEAKAGE_MASS] = [0.1, 0.2] + run_result_df[COL_ANY_HIGH_LEAKED] = False + run_result_df[COL_NEEDS_HUMAN_REVIEW] = False + run_result_df[COL_REPAIR_ITERATIONS] = 0 adapter = Mock() - pre_gen_df = df.copy() - pre_gen_df[COL_DOMAIN] = "BIOGRAPHY_PROFILE" - pre_gen_df["_anonymizer_row_order"] = [0, 1] - pre_gen_df["_anonymizer_record_id"] = ["rec-0", "rec-1"] - - rewrite_gen_df = pre_gen_df.copy() - rewrite_gen_df[COL_REWRITTEN_TEXT] = ["Maria works here", "Rob works there"] - rewrite_gen_df[COL_REPAIR_ITERATIONS] = 0 - - eval_df = rewrite_gen_df.copy() - eval_df[COL_NEEDS_REPAIR] = False - eval_df[COL_UTILITY_SCORE] = [0.9, 0.8] - eval_df[COL_LEAKAGE_MASS] = [0.1, 0.2] - eval_df[COL_ANY_HIGH_LEAKED] = False - - judge_df = eval_df.iloc[[0]].copy().reset_index(drop=True) - judge_df[COL_JUDGE_EVALUATION] = [{"privacy": {"score": 8}, "quality": {"score": 9}, "naturalness": {"score": 7}}] - judge_df[COL_NEEDS_HUMAN_REVIEW] = False - - replace_df = df.copy() - replace_df["_replacement_map"] = [{"replacements": []}, {"replacements": []}] - - adapter.run_workflow.side_effect = [ - WorkflowRunResult(dataframe=rewrite_gen_df, failed_records=[]), - WorkflowRunResult(dataframe=eval_df, failed_records=[]), - WorkflowRunResult( - dataframe=judge_df, - failed_records=[FailedRecord(record_id="rec-1", step="rewrite-final-judge", reason="timeout")], - ), + # detection judge returns both rows + det_df = run_result_df.copy() + # holistic judge 
returns only first row + judge_df = run_result_df.iloc[[0]].copy().reset_index(drop=True) + judge_df[COL_JUDGE_EVALUATION] = [ + {"privacy": {"score": "high"}, "quality": {"score": "high"}, "style": {"score": "medium"}} ] - with patch(_REPLACE_PATCH) as mock_replace_cls: - _mock_replace(mock_replace_cls, replace_df) - wf = RewriteWorkflow(adapter=adapter) - result = wf.run( - df, - model_configs=stub_model_configs, - selected_models=stub_rewrite_model_selection, - replace_model_selection=stub_replace_model_selection, - privacy_goal=_PRIVACY_GOAL, - evaluation=_EVALUATION, - ) + wf = RewriteWorkflow(adapter=adapter) + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + adapter.run_workflow.return_value = WorkflowRunResult( + dataframe=judge_df, + failed_records=[FailedRecord(record_id="rec-1", step="rewrite-final-judge", reason="timeout")], + ) + + result = wf.evaluate( + run_result_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) assert len(result.dataframe) == 2 assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is not None - assert not result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0] assert result.dataframe[COL_JUDGE_EVALUATION].iloc[1] is None - assert result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[1] # --------------------------------------------------------------------------- diff --git a/tests/interface/test_anonymizer_telemetry.py b/tests/interface/test_anonymizer_telemetry.py index 9e4c2b05..a4864866 100644 --- a/tests/interface/test_anonymizer_telemetry.py +++ b/tests/interface/test_anonymizer_telemetry.py @@ -272,10 +272,11 @@ def test_rewrite_populates_rewrite_models( event = captured_events[0] assert event.transformation_type == "rewrite" assert event.rewriter_model != NOT_APPLICABLE - assert event.judge_model != NOT_APPLICABLE assert event.repairer_model != NOT_APPLICABLE assert event.max_repair_iterations 
== 2 assert event.strict_entity_protection is True + # judge runs in evaluate(), not run() — stays not_applicable here + assert event.judge_model == NOT_APPLICABLE # Substitute-only field stays not_applicable assert event.replacement_generator_model == NOT_APPLICABLE diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py index fddc6028..199d1d86 100644 --- a/tests/interface/test_display.py +++ b/tests/interface/test_display.py @@ -16,7 +16,7 @@ COL_REPLACEMENT_MAP, COL_SENSITIVITY_DISPOSITION, ) -from anonymizer.engine.rewrite.final_judge import NATURALNESS_RUBRIC, PRIVACY_RUBRIC, QUALITY_RUBRIC +from anonymizer.engine.rewrite.final_judge import PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC from anonymizer.engine.schemas import EntitiesSchema, EntitySchema from anonymizer.engine.schemas.rewrite import EntityDispositionSchema, SensitivityDispositionSchema from anonymizer.interface.display import ( @@ -548,9 +548,9 @@ def test_render_record_html_rewrite_mode_shows_rewrite_layout() -> None: def test_render_record_html_rewrite_mode_with_judge_scores() -> None: # Derive keys from the actual rubric configs so test↔runtime drift is impossible. 
judge_eval = { - PRIVACY_RUBRIC.name: {"score": 8, "reasoning": "good privacy"}, - QUALITY_RUBRIC.name: {"score": 9, "reasoning": "high quality"}, - NATURALNESS_RUBRIC.name: {"score": 7, "reasoning": "mostly natural"}, + PRIVACY_RUBRIC.name: {"score": "high", "reasoning": "good privacy"}, + QUALITY_RUBRIC.name: {"score": "high", "reasoning": "high quality"}, + STYLE_RUBRIC.name: {"score": "medium", "reasoning": "mostly natural"}, } row = pd.Series( { @@ -564,9 +564,9 @@ def test_render_record_html_rewrite_mode_with_judge_scores() -> None: } ) result = render_record_html(row, record_index=0) - assert f"{PRIVACY_RUBRIC.name}: 8/10" in result - assert f"{QUALITY_RUBRIC.name}: 9/10" in result - assert f"{NATURALNESS_RUBRIC.name}: 7/10" in result + assert f"{PRIVACY_RUBRIC.name}: high" in result + assert f"{QUALITY_RUBRIC.name}: high" in result + assert f"{STYLE_RUBRIC.name}: medium" in result def test_render_record_html_rewrite_mode_nan_judge_column_does_not_warn( From 763c3a6855d673afeb17fa39a88a36b2f942d00c Mon Sep 17 00:00:00 2001 From: memadi Date: Thu, 11 Jun 2026 12:27:36 -0700 Subject: [PATCH 08/15] nit Signed-off-by: memadi --- .../engine/rewrite/rewrite_workflow.py | 29 +++++++++++++ src/anonymizer/interface/display.py | 42 +++++++++++++++++-- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index 95491340..23920bbc 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -29,6 +29,7 @@ ) from anonymizer.engine.evaluation.detection_judge import DetectionJudgeWorkflow from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, NddAdapter +from anonymizer.engine.schemas import EntitiesByValueSchema from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceWorkflow from anonymizer.engine.rewrite.domain_classification import DomainClassificationWorkflow from 
anonymizer.engine.rewrite.evaluate import EvaluateWorkflow @@ -53,6 +54,31 @@ } +def _detection_valid_fraction(row: pd.Series) -> float | None: + """Convert bool COL_DETECTION_VALID to a 0–1 fraction for rewrite evaluate output. + + The detection judge stores a bool (all_valid) but rewrite evaluate surfaces + a fraction so it sits on the same scale as utility_score and leakage_mass. + """ + valid = row.get(COL_DETECTION_VALID) + if valid is None: + return None + if bool(valid): + return 1.0 + invalid = row.get(COL_DETECTION_INVALID_ENTITIES) + invalid_count = len(invalid) if isinstance(invalid, list) else 0 + try: + total = sum( + len(e.labels) + for e in EntitiesByValueSchema.from_raw(row.get(COL_ENTITIES_BY_VALUE)).entities_by_value + ) + except Exception: + total = 0 + if total == 0: + return 1.0 + return max(0.0, (total - invalid_count) / total) + + def _has_entities(entities_by_value: object) -> bool: """Return True if this record has at least one detected entity.""" entities_by_value = normalize_payload(entities_by_value) @@ -410,6 +436,9 @@ def evaluate( ) entity_rows = _join_new_columns(entity_rows, detection_result.dataframe) all_failed.extend(detection_result.failed_records) + # Convert bool all_valid → 0–1 fraction so detection validity sits on the + # same scale as utility_score and leakage_mass in the rewrite scores section. 
+ entity_rows[COL_DETECTION_VALID] = entity_rows.apply(_detection_valid_fraction, axis=1) # --- Holistic judge (privacy / quality / style) --- try: diff --git a/src/anonymizer/interface/display.py b/src/anonymizer/interface/display.py index d6e60579..296e3e9f 100644 --- a/src/anonymizer/interface/display.py +++ b/src/anonymizer/interface/display.py @@ -454,12 +454,48 @@ def _render_scores_section(row: pd.Series) -> str: "Judge evaluation present but produced no scores (unexpected shape: %s)", type(judge_raw).__name__ ) if judge_scores: - score_strs = [f"{name}: {score}" for name, score in judge_scores] - parts.append(f"Judge: {html.escape(', '.join(score_strs))}") + score_strs = [f"{html.escape(str(name))}: {html.escape(str(score))}" for name, score in judge_scores] + parts.append(f"Judge: {', '.join(score_strs)}") if not parts: return "

No scores available.

" - return "
" + "".join(parts) + "
" + + scores_html = "
" + "".join(parts) + "
" + + if detection_valid is not None and float(detection_valid) < 1.0: + invalid_entries = _normalize_invalid_entities(row.get(COL_DETECTION_INVALID_ENTITIES)) + if invalid_entries: + rows_html: list[str] = [] + for entry in invalid_entries: + value = html.escape(str(entry.get("value", ""))) + label = html.escape(str(entry.get("label", ""))) + reasoning = html.escape(str(entry.get("reasoning", ""))) + _, border_color = _color_for_label(entry.get("label", "")) + rows_html.append( + "" + f"{value}" + f"" + f"{label}" + f"{reasoning}" + "" + ) + scores_html += ( + "
" + f"Show {len(invalid_entries)} flagged " + "detection(s)" + "" + "" + "" + "" + "" + "" + f"{''.join(rows_html)}" + "
ValueLabelReason
" + "
" + ) + + return scores_html def _extract_judge_scores(raw: object) -> list[tuple[str, int | str]]: From fb37234e9a915c1343d4630b4cc37044538ad9f8 Mon Sep 17 00:00:00 2001 From: memadi Date: Thu, 11 Jun 2026 12:27:53 -0700 Subject: [PATCH 09/15] make format Signed-off-by: memadi --- src/anonymizer/engine/rewrite/rewrite_workflow.py | 5 ++--- tests/engine/test_final_judge.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index 23920bbc..8807bb96 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -29,7 +29,6 @@ ) from anonymizer.engine.evaluation.detection_judge import DetectionJudgeWorkflow from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, NddAdapter -from anonymizer.engine.schemas import EntitiesByValueSchema from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceWorkflow from anonymizer.engine.rewrite.domain_classification import DomainClassificationWorkflow from anonymizer.engine.rewrite.evaluate import EvaluateWorkflow @@ -41,6 +40,7 @@ from anonymizer.engine.rewrite.sensitivity_disposition import SensitivityDispositionWorkflow from anonymizer.engine.rewrite.workflow_utils import derive_seed_columns, select_seed_cols from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows +from anonymizer.engine.schemas import EntitiesByValueSchema logger = logging.getLogger("anonymizer.rewrite.workflow") @@ -69,8 +69,7 @@ def _detection_valid_fraction(row: pd.Series) -> float | None: invalid_count = len(invalid) if isinstance(invalid, list) else 0 try: total = sum( - len(e.labels) - for e in EntitiesByValueSchema.from_raw(row.get(COL_ENTITIES_BY_VALUE)).entities_by_value + len(e.labels) for e in EntitiesByValueSchema.from_raw(row.get(COL_ENTITIES_BY_VALUE)).entities_by_value ) except Exception: total = 0 diff --git 
a/tests/engine/test_final_judge.py b/tests/engine/test_final_judge.py index b55e1375..344fa1e8 100644 --- a/tests/engine/test_final_judge.py +++ b/tests/engine/test_final_judge.py @@ -13,10 +13,10 @@ COL_TEXT, ) from anonymizer.engine.rewrite.final_judge import ( - FinalJudgeWorkflow, PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC, + FinalJudgeWorkflow, _judge_prompt, ) From 28cb50f666e7797c6267754289f3fa67143de011 Mon Sep 17 00:00:00 2001 From: memadi Date: Thu, 11 Jun 2026 16:42:42 -0700 Subject: [PATCH 10/15] update notebooks according to the change Signed-off-by: memadi --- docs/concepts/evaluation.md | 133 ++++++++++++- docs/concepts/rewrite.md | 30 +-- .../04_rewriting_biographies.py | 18 +- .../05_rewriting_legal_documents.py | 14 ++ docs/notebooks/04_rewriting_biographies.ipynb | 41 ++++- .../05_rewriting_legal_documents.ipynb | 33 +++- plans/rewrite-evaluation/plan.md | 4 + src/anonymizer/interface/display.py | 121 +++++++----- tests/engine/test_final_judge.py | 13 ++ tests/engine/test_rewrite_workflow.py | 174 ++++++++++++++++++ tests/interface/test_anonymizer_interface.py | 85 +++++++++ tests/interface/test_display.py | 69 ++++++- 12 files changed, 654 insertions(+), 81 deletions(-) diff --git a/docs/concepts/evaluation.md b/docs/concepts/evaluation.md index b45661a2..25021b23 100644 --- a/docs/concepts/evaluation.md +++ b/docs/concepts/evaluation.md @@ -8,7 +8,7 @@ Anonymizer provides LLM-as-judge evaluation for both modes, replace and rewrite, | Mode | How evaluation runs | |------|---------------------| | **Replace** | Post-hoc, via a separate `Anonymizer.evaluate()` call after `run()` / `preview()`. | -| **Rewrite** | Runs automatically as part of every `run()` / `preview()` call. A dedicated post-hoc `evaluate()` call, matching replace mode, is planned for a future release. | +| **Rewrite** | Automatic leakage/utility scoring runs as part of every `run()` / `preview()` call. A separate `Anonymizer.evaluate()` call adds LLM-as-judge quality scoring. 
| --- @@ -161,6 +161,133 @@ selected_models: ## Rewrite Evaluation -Rewrite evaluation is part of the pipeline and runs automatically — there is no separate call. After the rewritten text is generated, an evaluate–repair loop scores each record for **utility** (how much semantic content was preserved) and **leakage mass** (how much sensitive information survived). Records that exceed the leakage threshold are sent back for repair, up to `max_repair_iterations` times. A final judge then produces a qualitative assessment and flags records that still need human review. +Rewrite evaluation has two layers: -The key output columns are `utility_score`, `leakage_mass`, `weighted_leakage_rate`, `any_high_leaked`, and `needs_human_review`. See [Rewrite](rewrite.md) for more details. +1. **Automatic (always runs)** — leakage mass, utility score, weighted leakage rate, and `needs_human_review` are computed as part of every `run()` / `preview()` call. See [Rewrite](rewrite.md) for the repair loop and output columns. + +2. **Post-hoc LLM judges (optional)** — call `Anonymizer.evaluate()` on a completed rewrite result to add the entity detection judge and three holistic quality rubrics. 
+ +```python +from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, Rewrite + +anonymizer = Anonymizer() +cfg = AnonymizerConfig(rewrite=Rewrite()) +src = AnonymizerInput(source="data.csv", text_column="text") + +result = anonymizer.run(config=cfg, data=src) +evaluated = anonymizer.evaluate(result) +evaluated.display_record(0) +``` + +Both `run()` and `preview()` results can be saved and evaluated in a separate session: + +```python +import pickle + +preview = anonymizer.preview(config=cfg, data=src, num_records=15) + +with open("/tmp/preview.pkl", "wb") as f: + pickle.dump(preview, f) + +# … later … +with open("/tmp/preview.pkl", "rb") as f: + loaded = pickle.load(f) + +evaluated = anonymizer.evaluate(loaded) +``` + +--- + +### Entity Detection Judge + +Same judge as in replace mode — see [Entity Detection Judge](#entity-detection-judge) above. In rewrite mode, `detection_valid` is returned as a **0–1 fraction** (the share of detected entities that passed), rather than a boolean. A value of `1.0` means all detections are valid; lower values mean a larger share of detections was flagged as incorrect (for example, `0.75` means one in four detections was flagged). + +| Output column | Type | Description | +|---|---|---| +| `detection_valid` | `float \| None` | 1.0 if all detections pass; fraction of valid entities otherwise; `None` if the judge was unavailable. | +| `detection_invalid_entities` | `list` | Each flagged detection with value, label, and one-sentence reasoning. | + +--- + +### Rewrite Quality Judges + +Three rubrics evaluate the holistic quality of the rewritten text. All three run as a single LLM judge call and are stored together under `judge_evaluation`. + +#### Privacy + +> "Does the rewritten text adequately remove linkage risk to the original record?" + +Scores residual linkage risk after the rewrite — comparing rewritten values to originals, distinguishing direct identifiers from quasi-identifiers, and assessing whether remaining details narrow the candidate set of plausible matches. 
+ +| Score | Meaning | +|-------|---------| +| `high` | Original direct identifiers removed; remaining quasi-identifiers create low linkage risk. | +| `medium` | No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk. | +| `low` | Easily or near-certainly linkable — direct identifiers remain or enough detail survives that re-identification requires minimal effort. | + +#### Quality + +> "How well does the rewritten text preserve important meaning, facts, and structure?" + +Evaluates content preservation independent of privacy and style. Changes made for privacy reasons are not penalized when the core meaning is intact. + +| Score | Meaning | +|-------|---------| +| `high` | Important meaning, facts, and structure fully preserved. | +| `medium` | Most content preserved; minor details lost or slightly distorted. | +| `low` | Material loss of important information, contradictions, or distorted core meaning. | + +#### Style + +> "Does the rewritten text read as fluent, coherent, and human-written prose?" + +Evaluates readability, grammatical correctness, clarity, and phrasing — independent of content changes. + +| Score | Meaning | +|-------|---------| +| `high` | Fluent, coherent, human-written prose. | +| `medium` | Mostly readable; isolated awkward phrasing or stiff transitions. | +| `low` | Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel. 
| + +The three rubric scores are stored together under the `judge_evaluation` column as a dict: + +```python +# Example judge_evaluation value for a single record +{ + "privacy": {"score": "high", "reasoning": "All direct identifiers removed..."}, + "quality": {"score": "medium", "reasoning": "Key facts preserved but some details lost..."}, + "style": {"score": "high", "reasoning": "Reads naturally throughout..."}, +} +``` + +--- + +## Reading rewrite evaluation results + +`display_record()` renders a formatted per-record view that includes the detection validity fraction and all three judge rubrics alongside the rewritten text: + +```python +evaluated.display_record(0) +``` + +For a tabular overview across all records: + +```python +evaluated.dataframe[["detection_valid", "judge_evaluation"]] +``` + +Use `trace_dataframe` for the full internal trace including raw judge outputs. + +--- + +## Model roles (rewrite evaluation) + +The rewrite quality judge defaults to `nemotron-30b-thinking`. The detection validity judge shares the `detection_validity_judge` role used by replace evaluation. Defaults are defined in [`evaluate.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/evaluate.yaml). Override them via `model_configs`: + +```yaml +# my_models.yaml +selected_models: + evaluate: + detection_validity_judge: your-model-alias + rewrite_judge: your-model-alias +``` diff --git a/docs/concepts/rewrite.md b/docs/concepts/rewrite.md index 50a540ce..c07038aa 100644 --- a/docs/concepts/rewrite.md +++ b/docs/concepts/rewrite.md @@ -11,7 +11,7 @@ Instead of replacing individual entities, rewrite mode transforms the entire tex [Detection](detection.md) runs first (same as [Replace mode](replace.md), plus latent entity detection for context-inferable information). This includes identifying signals that may not be explicitly tagged but can be deduced from combinations of details (e.g., location inferred from contextual cues). 
The text is then classified by domain, and each entity or attribute is assigned a sensitivity disposition based on contextual risk, recognizing that quasi-identifiers can emerge from any aspect of the text. -The text is then rewritten to reduce identifiability, applying targeted transformations that disrupt inference (e.g., weakening or removing linking details) rather than simply rewording content. The rewritten output is evaluated for both quality and privacy leakage using adversarial testing. If thresholds are exceeded, the system automatically refines the rewrite. A final judge provides a qualitative assessment of the rewritten record. Any records that failed to meet standards are tagged for human review. +The text is then rewritten to reduce identifiability, applying targeted transformations that disrupt inference (e.g., weakening or removing linking details) rather than simply rewording content. The rewritten output is evaluated for both quality and privacy leakage using adversarial testing. If thresholds are exceeded, the system automatically refines the rewrite. Any records that failed to meet standards are tagged for human review. --- @@ -128,16 +128,19 @@ config = AnonymizerConfig( ## Output columns -| Column | Description | -|--------|-------------| -| `{text_col}_rewritten` | The privacy-safe rewritten text. | -| `utility_score` | Quality preservation (0.0--1.0). Higher is better. | -| `leakage_mass` | Weighted privacy leakage. Lower is better. | -| `weighted_leakage_rate` | Normalized leakage (0.0--1.0) relative to the maximum possible leakage mass. | -| `any_high_leaked` | Whether any high-sensitivity entity leaked through. | -| `needs_human_review` | Flag for records that may need manual review. | +| Column | When available | Description | +|--------|---------------|-------------| +| `{text_col}_rewritten` | Always | The privacy-safe rewritten text. | +| `utility_score` | Always | Quality preservation (0.0--1.0). Higher is better. 
| +| `leakage_mass` | Always | Weighted privacy leakage. Lower is better. | +| `weighted_leakage_rate` | Always | Normalized leakage (0.0--1.0) relative to the maximum possible leakage mass. | +| `any_high_leaked` | Always | Whether any high-sensitivity entity leaked through. | +| `needs_human_review` | Always | Flag for records that may need manual review. | +| `detection_valid` | After `evaluate()` | Fraction of detected entities that passed the detection judge (0.0--1.0); `None` if judge unavailable. | +| `detection_invalid_entities` | After `evaluate()` | Flagged detections with value, label, and one-sentence reasoning. | +| `judge_evaluation` | After `evaluate()` | Dict with `privacy`, `quality`, and `style` rubric scores and reasoning. | -Use `preview.trace_dataframe` for the full pipeline trace (domain, disposition, QA pairs, repair iterations, judge evaluation). +Use `preview.trace_dataframe` for the full pipeline trace (domain, disposition, QA pairs, repair iterations). !!! note "No entities? No rewrite." @@ -180,4 +183,9 @@ Rewrite uses multiple LLM roles. All default to models in the [default config](m | `rewriter` | `gpt-oss-120b` | Generates the rewritten text. | | `evaluator` | `nemotron-30b-thinking` | Evaluates quality and leakage. | | `repairer` | `gpt-oss-120b` | Repairs high-leakage rewrites. | -| `judge` | `nemotron-30b-thinking` | Final quality/privacy judge. | + +--- + +## Evaluating rewrite output + +After running rewrite, you can score detection quality and the holistic rewrite quality using LLM-as-judge evaluation. See [Evaluation](evaluation.md) for details on the detection judge and the three rewrite quality rubrics (privacy, quality, style), and how to call `Anonymizer.evaluate()`. 
diff --git a/docs/notebook_source/04_rewriting_biographies.py b/docs/notebook_source/04_rewriting_biographies.py index 39400ee7..38d782ce 100644 --- a/docs/notebook_source/04_rewriting_biographies.py +++ b/docs/notebook_source/04_rewriting_biographies.py @@ -24,7 +24,8 @@ # 2. Classifies the domain and assigns sensitivity dispositions # 3. Generates a rewritten version that obscures sensitive entities # 4. Evaluates quality (utility) and privacy (leakage) with an automated repair loop -# 5. Runs a final LLM judge for informational scores +# +# After `run()`, call `Anonymizer.evaluate()` for optional LLM-as-judge scoring. # # #### 📚 What you'll learn # @@ -32,6 +33,7 @@ # - Set evaluation criteria and risk tolerance for automated quality checks # - Preview rewritten text and inspect utility / leakage scores # - Triage flagged records with `needs_human_review` +# - Run `evaluate()` for detection validity and holistic judge scores (privacy, quality, style) # # > **Tip:** First time running notebooks? Start with # > [setup instructions](https://nvidia-nemo.github.io/Anonymizer/latest/tutorials/). @@ -153,11 +155,25 @@ print(f"{len(flagged)} of {len(df)} records flagged for human review") flagged.head() +# %% [markdown] +# ## 🔬 Evaluate (optional) +# +# Call `evaluate()` to run LLM-as-judge scoring on the rewrite result — detection validity and three quality rubrics (privacy, quality, style). +# See [Evaluation](../../concepts/evaluation/#rewrite-evaluation) for details. + +# %% +evaluated = anonymizer.evaluate(result) + +# %% +evaluated.display_record(0) + # %% [markdown] # ## ⏭️ Next steps # # - **[⚖️ Rewriting Legal Documents](../05_rewriting_legal_documents/)** -- # rewrite legal text with custom entity labels and domain-specific privacy goals. +# - **[📊 Evaluation](../../concepts/evaluation/#rewrite-evaluation)** -- +# learn about the detection validity and rewrite quality judges in detail. 
# - **[🎯 Choosing a Replacement Strategy](../03_choosing_a_replacement_strategy/)** -- # compare Redact, Annotate, Hash, and Substitute if you prefer token-level replacement. # - **[🔍 Inspecting Detected Entities](../02_inspecting_detected_entities/)** -- diff --git a/docs/notebook_source/05_rewriting_legal_documents.py b/docs/notebook_source/05_rewriting_legal_documents.py index 560032a0..d75d790a 100644 --- a/docs/notebook_source/05_rewriting_legal_documents.py +++ b/docs/notebook_source/05_rewriting_legal_documents.py @@ -179,9 +179,23 @@ print(f"{len(flagged)} of {len(df)} records flagged for human review") flagged.head() +# %% [markdown] +# ## 🔬 Evaluate (optional) +# +# Call `evaluate()` to run LLM-as-judge scoring on the rewrite result — detection validity and three quality rubrics (privacy, quality, style). +# See [Evaluation](../../concepts/evaluation/#rewrite-evaluation) for details. + +# %% +evaluated = anonymizer.evaluate(result) + +# %% +evaluated.display_record(0) + # %% [markdown] # ## ⏭️ Next steps # +# - **[📊 Evaluation](../../concepts/evaluation/#rewrite-evaluation)** -- +# learn about the detection validity and rewrite quality judges in detail. # - **[🔍 Inspecting Detected Entities](../02_inspecting_detected_entities/)** -- # debug what the detection pipeline found before rewriting. # - **Try it on your own data!** Swap in your CSV, define entity labels for your diff --git a/docs/notebooks/04_rewriting_biographies.ipynb b/docs/notebooks/04_rewriting_biographies.ipynb index 924b20f2..67fd1532 100644 --- a/docs/notebooks/04_rewriting_biographies.ipynb +++ b/docs/notebooks/04_rewriting_biographies.ipynb @@ -14,7 +14,8 @@ "2. Classifies the domain and assigns sensitivity dispositions\n", "3. Generates a rewritten version that obscures sensitive entities\n", "4. Evaluates quality (utility) and privacy (leakage) with an automated repair loop\n", - "5. 
Runs a final LLM judge for informational scores\n", + "\n", + "After `run()`, call `Anonymizer.evaluate()` for optional LLM-as-judge scoring.\n", "\n", "#### 📚 What you'll learn\n", "\n", @@ -22,6 +23,7 @@ "- Set evaluation criteria and risk tolerance for automated quality checks\n", "- Preview rewritten text and inspect utility / leakage scores\n", "- Triage flagged records with `needs_human_review`\n", + "- Run `evaluate()` for detection validity and holistic judge scores (privacy, quality, style)\n", "\n", "> **Tip:** First time running notebooks? Start with\n", "> [setup instructions](https://nvidia-nemo.github.io/Anonymizer/latest/tutorials/)." @@ -755,6 +757,37 @@ "flagged.head()" ] }, + { + "cell_type": "markdown", + "id": "e1ad0026", + "metadata": {}, + "source": [ + "## 🔬 Evaluate (optional)\n", + "\n", + "Call `evaluate()` to run LLM-as-judge scoring on the rewrite result — detection validity and three quality rubrics (privacy, quality, style).\n", + "See [Evaluation](../../concepts/evaluation/#rewrite-evaluation) for details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a36d3c26", + "metadata": {}, + "outputs": [], + "source": [ + "evaluated = anonymizer.evaluate(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13126cd1", + "metadata": {}, + "outputs": [], + "source": [ + "evaluated.display_record(0)" + ] + }, { "cell_type": "markdown", "id": "e601cc9d", @@ -764,6 +797,8 @@ "\n", "- **[⚖️ Rewriting Legal Documents](../05_rewriting_legal_documents/)** --\n", " rewrite legal text with custom entity labels and domain-specific privacy goals.\n", + "- **[📊 Evaluation](../../concepts/evaluation/#rewrite-evaluation)** --\n", + " learn about the detection validity and rewrite quality judges in detail.\n", "- **[🎯 Choosing a Replacement Strategy](../03_choosing_a_replacement_strategy/)** --\n", " compare Redact, Annotate, Hash, and Substitute if you prefer token-level replacement.\n", "- **[🔍 Inspecting Detected Entities](../02_inspecting_detected_entities/)** --\n", @@ -773,7 +808,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": ".venv (3.11.13)", "language": "python", "name": "python3" }, @@ -787,7 +822,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/docs/notebooks/05_rewriting_legal_documents.ipynb b/docs/notebooks/05_rewriting_legal_documents.ipynb index 9f930618..aa249767 100644 --- a/docs/notebooks/05_rewriting_legal_documents.ipynb +++ b/docs/notebooks/05_rewriting_legal_documents.ipynb @@ -1027,18 +1027,33 @@ "flagged.head()" ] }, + { + "cell_type": "markdown", + "id": "4d9eb0a5", + "source": "## 🔬 Evaluate (optional)\n\nCall `evaluate()` to run LLM-as-judge scoring on the rewrite result — detection validity and three quality rubrics (privacy, quality, style).\nSee [Evaluation](../../concepts/evaluation/#rewrite-evaluation) for details.", + "metadata": {} + }, + { + "cell_type": 
"code", + "id": "d5fd6424", + "source": "evaluated = anonymizer.evaluate(result)", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "id": "3626d34c", + "source": "evaluated.display_record(0)", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, { "cell_type": "markdown", "id": "31bd00f6", "metadata": {}, - "source": [ - "## ⏭️ Next steps\n", - "\n", - "- **[🔍 Inspecting Detected Entities](../02_inspecting_detected_entities/)** --\n", - " debug what the detection pipeline found before rewriting.\n", - "- **Try it on your own data!** Swap in your CSV, define entity labels for your\n", - " domain, and set a `PrivacyGoal` that fits -- you've got all the building blocks." - ] + "source": "## ⏭️ Next steps\n\n- **[📊 Evaluation](../../concepts/evaluation/#rewrite-evaluation)** --\n learn about the detection validity and rewrite quality judges in detail.\n- **[🔍 Inspecting Detected Entities](../02_inspecting_detected_entities/)** --\n debug what the detection pipeline found before rewriting.\n- **Try it on your own data!** Swap in your CSV, define entity labels for your\n domain, and set a `PrivacyGoal` that fits -- you've got all the building blocks." } ], "metadata": { @@ -1062,4 +1077,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md index cb3fc497..ec7a241f 100644 --- a/plans/rewrite-evaluation/plan.md +++ b/plans/rewrite-evaluation/plan.md @@ -303,6 +303,10 @@ score_strs = [f"{name}: {score}" for name, score in judge_scores] In rewrite mode the `COL_NEEDS_HUMAN_REVIEW` column must be displayed as **"Rewrite Need Review"** (not the generic "Needs Review" used in replace mode). Update the label resolution in `display.py` to emit the rewrite-specific label when rendering a rewrite result. 
+### Invalid-entities expandable table + +When `COL_DETECTION_VALID < 1.0` and `COL_DETECTION_INVALID_ENTITIES` contains entries, append a collapsed `
<details>` block below the scores `<div>
`. The summary line reads "Show N flagged detection(s)"; expanding it reveals a three-column table (Value / Label / Reason). This is conditional — records with `detection_valid == 1.0` or no invalid-entity entries show no extra element. The `_extract_judge_scores` fix (remove `int()` cast) is required for string scores to appear at all; without it, `ValueError` was silently swallowed for `"low"/"medium"/"high"` values, causing the judge section to never render. + --- ## Step 6 — Docs and skills diff --git a/src/anonymizer/interface/display.py b/src/anonymizer/interface/display.py index 296e3e9f..487444a4 100644 --- a/src/anonymizer/interface/display.py +++ b/src/anonymizer/interface/display.py @@ -414,8 +414,11 @@ def _normalize_replacement_map(raw: str | dict | object) -> list[dict[str, str]] def _render_scores_section(row: pd.Series) -> str: - """Render utility/leakage metrics and optional judge scores.""" - parts: list[str] = [] + """Render scores in up to three rows: objective metrics | detection validity | judge.""" + section_rows: list[str] = [] + + # --- Row 1: objective metrics + needs-review badge --- + metric_parts: list[str] = [] utility = row.get("utility_score") leakage = row.get("leakage_mass") @@ -423,30 +426,71 @@ def _render_scores_section(row: pd.Series) -> str: needs_review = row.get("needs_human_review") if utility is not None: - parts.append(f"Utility: {utility:.2f}") + metric_parts.append(f"Utility: {utility:.2f}") if leakage is not None: - parts.append(f"Leakage: {leakage:.2f}") + metric_parts.append(f"Leakage: {leakage:.2f}") if weighted_leakage_rate is not None: - parts.append( + metric_parts.append( "Weighted Leakage Rate: " f"{weighted_leakage_rate:.2f}" ) - detection_valid = row.get(COL_DETECTION_VALID) - if detection_valid is not None: - parts.append( - f"Detection Validity: {float(detection_valid):.2f}" - ) - if needs_review is not None: is_rewrite = "rewritten" in "".join(str(k) for k in row.index) label = "Rewrite Need Review" if 
is_rewrite else "Needs Review" badge_color = "#ef4444" if needs_review else "#22c55e" badge_text = "Yes" if needs_review else "No" - parts.append( + metric_parts.append( f"{label}: " f"{badge_text}" ) + if metric_parts: + section_rows.append("
" + "".join(metric_parts) + "
") + # --- Row 2: detection validity inline with flagged-entities dropdown --- + detection_valid = row.get(COL_DETECTION_VALID) + if detection_valid is not None: + det_span = ( + f"Detection Validity: {float(detection_valid):.2f}" + ) + details_html = "" + if float(detection_valid) < 1.0: + invalid_entries = _normalize_invalid_entities(row.get(COL_DETECTION_INVALID_ENTITIES)) + if invalid_entries: + rows_html: list[str] = [] + for entry in invalid_entries: + value = html.escape(str(entry.get("value", ""))) + label = html.escape(str(entry.get("label", ""))) + reasoning = html.escape(str(entry.get("reasoning", ""))) + _, border_color = _color_for_label(entry.get("label", "")) + rows_html.append( + "" + f"{value}" + f"" + f"{label}" + f"{reasoning}" + "" + ) + details_html = ( + "
" + f"Show {len(invalid_entries)} flagged " + "detection(s)" + "" + "" + "" + "" + "" + "" + f"{''.join(rows_html)}" + "
ValueLabelReason
" + "
" + ) + section_rows.append( + "
" + f"{det_span}{details_html}
" + ) + + # --- Row 3: judge scores with highlighted criterion names --- judge_raw = row.get(COL_JUDGE_EVALUATION) judge_scores = _extract_judge_scores(judge_raw) if isinstance(judge_raw, dict) and not judge_scores: @@ -454,48 +498,23 @@ def _render_scores_section(row: pd.Series) -> str: "Judge evaluation present but produced no scores (unexpected shape: %s)", type(judge_raw).__name__ ) if judge_scores: - score_strs = [f"{html.escape(str(name))}: {html.escape(str(score))}" for name, score in judge_scores] - parts.append(f"Judge: {', '.join(score_strs)}") + score_parts = [ + f"" + f"{html.escape(str(name))}: {html.escape(str(score))}" + for name, score in judge_scores + ] + section_rows.append( + "
" + "
Judge
" + "
" + "".join(score_parts) + "
" + "
" + ) - if not parts: + if not section_rows: return "

No scores available.

" - scores_html = "
" + "".join(parts) + "
" - - if detection_valid is not None and float(detection_valid) < 1.0: - invalid_entries = _normalize_invalid_entities(row.get(COL_DETECTION_INVALID_ENTITIES)) - if invalid_entries: - rows_html: list[str] = [] - for entry in invalid_entries: - value = html.escape(str(entry.get("value", ""))) - label = html.escape(str(entry.get("label", ""))) - reasoning = html.escape(str(entry.get("reasoning", ""))) - _, border_color = _color_for_label(entry.get("label", "")) - rows_html.append( - "" - f"{value}" - f"" - f"{label}" - f"{reasoning}" - "" - ) - scores_html += ( - "
" - f"Show {len(invalid_entries)} flagged " - "detection(s)" - "" - "" - "" - "" - "" - "" - f"{''.join(rows_html)}" - "
ValueLabelReason
" - "
" - ) - - return scores_html + return "
" + "".join(section_rows) + "
" def _extract_judge_scores(raw: object) -> list[tuple[str, int | str]]: diff --git a/tests/engine/test_final_judge.py b/tests/engine/test_final_judge.py index 344fa1e8..fd8c3ce0 100644 --- a/tests/engine/test_final_judge.py +++ b/tests/engine/test_final_judge.py @@ -117,3 +117,16 @@ def test_rubric_names_match_constants() -> None: assert PRIVACY_RUBRIC.name == "privacy" assert QUALITY_RUBRIC.name == "quality" assert STYLE_RUBRIC.name == "style" + + +def test_judge_prompt_references_style_not_naturalness() -> None: + prompt = _judge_prompt(_STUB_PRIVACY_GOAL) + assert "style" in prompt.lower() + assert "naturalness" not in prompt.lower() + + +def test_judge_prompt_references_categorical_scale() -> None: + prompt = _judge_prompt(_STUB_PRIVACY_GOAL) + assert "high" in prompt + assert "medium" in prompt + assert "low" in prompt diff --git a/tests/engine/test_rewrite_workflow.py b/tests/engine/test_rewrite_workflow.py index 3537d07c..716e0b07 100644 --- a/tests/engine/test_rewrite_workflow.py +++ b/tests/engine/test_rewrite_workflow.py @@ -13,6 +13,7 @@ from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal from anonymizer.engine.constants import ( COL_ANY_HIGH_LEAKED, + COL_DETECTION_VALID, COL_DOMAIN, COL_ENTITIES_BY_VALUE, COL_JUDGE_EVALUATION, @@ -866,3 +867,176 @@ def test_passthrough_rows_get_defaults( assert df[COL_UTILITY_SCORE].iloc[1] == 1.0 assert df[COL_LEAKAGE_MASS].iloc[1] == 0.0 assert not df[COL_NEEDS_HUMAN_REVIEW].iloc[1] + + +# --------------------------------------------------------------------------- +# Tests: evaluate() — happy path and column presence +# --------------------------------------------------------------------------- + + +def test_evaluate_produces_judge_evaluation_column( + stub_model_configs: list[ModelConfig], + stub_evaluate_model_selection, + stub_eval_df: pd.DataFrame, +) -> None: + adapter = Mock() + wf = RewriteWorkflow(adapter=adapter) + + det_df = stub_eval_df.copy() + det_df[COL_DETECTION_VALID] = True + 
wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + + judge_df = stub_eval_df.copy() + judge_df[COL_JUDGE_EVALUATION] = [ + {"privacy": {"score": "high"}, "quality": {"score": "high"}, "style": {"score": "medium"}} + ] + adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[]) + + result = wf.evaluate( + stub_eval_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) + + assert COL_JUDGE_EVALUATION in result.dataframe.columns + assert result.dataframe[COL_JUDGE_EVALUATION].iloc[0] is not None + + +def test_evaluate_produces_detection_valid_column( + stub_model_configs: list[ModelConfig], + stub_evaluate_model_selection, + stub_eval_df: pd.DataFrame, +) -> None: + adapter = Mock() + wf = RewriteWorkflow(adapter=adapter) + + det_df = stub_eval_df.copy() + det_df[COL_DETECTION_VALID] = True + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + + judge_df = stub_eval_df.copy() + judge_df[COL_JUDGE_EVALUATION] = [None] + adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[]) + + result = wf.evaluate( + stub_eval_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) + + assert COL_DETECTION_VALID in result.dataframe.columns + + +# --------------------------------------------------------------------------- +# Tests: evaluate() — passthrough rows +# --------------------------------------------------------------------------- + + +def test_evaluate_skips_passthrough_rows( + stub_model_configs: list[ModelConfig], + stub_evaluate_model_selection, + stub_eval_df: pd.DataFrame, +) -> None: + """evaluate() must only send entity rows to the judges, not passthrough rows.""" + passthrough_row = stub_eval_df.iloc[0].to_dict() + 
passthrough_row[COL_ENTITIES_BY_VALUE] = {"entities_by_value": []} + mixed_df = pd.concat([stub_eval_df, pd.DataFrame([passthrough_row])], ignore_index=True) + + adapter = Mock() + wf = RewriteWorkflow(adapter=adapter) + + det_df = stub_eval_df.copy() + det_df[COL_DETECTION_VALID] = True + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + + judge_df = stub_eval_df.copy() + judge_df[COL_JUDGE_EVALUATION] = [None] + adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[]) + + result = wf.evaluate( + mixed_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) + + detection_call_df = wf._detection_judge_wf.evaluate.call_args.args[0] + assert len(detection_call_df) == 1 + assert len(result.dataframe) == 2 + + +def test_evaluate_passthrough_rows_get_none_judge_defaults( + stub_model_configs: list[ModelConfig], + stub_evaluate_model_selection, + stub_eval_df: pd.DataFrame, +) -> None: + """Passthrough rows must have COL_JUDGE_EVALUATION=None and COL_DETECTION_VALID=None.""" + passthrough_row = stub_eval_df.iloc[0].to_dict() + passthrough_row[COL_ENTITIES_BY_VALUE] = {"entities_by_value": []} + mixed_df = pd.concat([stub_eval_df, pd.DataFrame([passthrough_row])], ignore_index=True) + + adapter = Mock() + wf = RewriteWorkflow(adapter=adapter) + + det_df = stub_eval_df.copy() + det_df[COL_DETECTION_VALID] = True + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + + judge_df = stub_eval_df.copy() + judge_df[COL_JUDGE_EVALUATION] = [None] + adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[]) + + result = wf.evaluate( + mixed_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) + + passthrough_result = 
result.dataframe[ + result.dataframe[COL_ENTITIES_BY_VALUE].apply(lambda x: len(x.get("entities_by_value", [])) == 0) + ] + assert passthrough_result[COL_JUDGE_EVALUATION].iloc[0] is None + assert pd.isna(passthrough_result[COL_DETECTION_VALID].iloc[0]) + + +# --------------------------------------------------------------------------- +# Tests: needs_human_review not overwritten by evaluate() +# --------------------------------------------------------------------------- + + +def test_run_needs_human_review_not_overwritten_by_evaluate( + stub_model_configs: list[ModelConfig], + stub_evaluate_model_selection, + stub_eval_df: pd.DataFrame, +) -> None: + """COL_NEEDS_HUMAN_REVIEW set during run() must not be modified by evaluate().""" + run_df = stub_eval_df.copy() + run_df[COL_NEEDS_HUMAN_REVIEW] = True + + adapter = Mock() + wf = RewriteWorkflow(adapter=adapter) + + det_df = run_df.copy() + det_df[COL_DETECTION_VALID] = True + wf._detection_judge_wf = Mock() + wf._detection_judge_wf.evaluate.return_value = Mock(dataframe=det_df, failed_records=[]) + + judge_df = run_df.copy() + judge_df[COL_JUDGE_EVALUATION] = [{"privacy": {"score": "high"}}] + adapter.run_workflow.return_value = WorkflowRunResult(dataframe=judge_df, failed_records=[]) + + result = wf.evaluate( + run_df, + model_configs=stub_model_configs, + selected_models=stub_evaluate_model_selection, + privacy_goal=_PRIVACY_GOAL, + ) + + assert bool(result.dataframe[COL_NEEDS_HUMAN_REVIEW].iloc[0]) is True diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py index f892285c..fc1ea879 100644 --- a/tests/interface/test_anonymizer_interface.py +++ b/tests/interface/test_anonymizer_interface.py @@ -16,7 +16,9 @@ from anonymizer.config.replace_strategies import Redact, Substitute from anonymizer.engine.constants import ( COL_DETECTED_ENTITIES, + COL_DETECTION_VALID, COL_FINAL_ENTITIES, + COL_JUDGE_EVALUATION, COL_REPLACED_TEXT, COL_REPLACEMENT_MAP, COL_REWRITTEN_TEXT, 
@@ -705,3 +707,86 @@ def test_evaluate_raises_value_error_on_legacy_result_without_replace_method() - with pytest.raises(ValueError, match="replace_method"): anonymizer.evaluate(legacy_result) # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# Tests: Anonymizer.evaluate() for rewrite results +# --------------------------------------------------------------------------- + + +def test_run_rewrite_does_not_include_judge_in_user_dataframe(stub_input: AnonymizerInput) -> None: + """run() output must not include COL_JUDGE_EVALUATION — it only appears after evaluate().""" + config = AnonymizerConfig(rewrite=Rewrite()) + anonymizer, _, _, _ = _make_anonymizer() + + result = anonymizer.run(config=config, data=stub_input) + + assert COL_JUDGE_EVALUATION not in result.dataframe.columns + + +def test_evaluate_rewrite_result_adds_judge_columns(stub_input: AnonymizerInput) -> None: + """anonymizer.evaluate() on a rewrite result must add COL_JUDGE_EVALUATION.""" + config = AnonymizerConfig(rewrite=Rewrite()) + anonymizer, _, _, rewrite_runner = _make_anonymizer() + + run_result = anonymizer.run(config=config, data=stub_input) + + eval_df = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_REWRITTEN_TEXT: ["Beth works at Globex"], + "utility_score": [0.85], + "leakage_mass": [0.3], + "weighted_leakage_rate": [0.23], + "any_high_leaked": [False], + "needs_human_review": [False], + COL_JUDGE_EVALUATION: [{"privacy": {"score": "high"}}], + COL_DETECTION_VALID: [1.0], + } + ) + rewrite_runner.evaluate.return_value = RewriteResult(dataframe=eval_df, failed_records=[]) + + evaluated = anonymizer.evaluate(run_result) + + assert COL_JUDGE_EVALUATION in evaluated.dataframe.columns + + +def test_evaluate_rewrite_result_adds_detection_valid(stub_input: AnonymizerInput) -> None: + """anonymizer.evaluate() on a rewrite result must add COL_DETECTION_VALID.""" + config = AnonymizerConfig(rewrite=Rewrite()) + anonymizer, _, _, 
rewrite_runner = _make_anonymizer() + + run_result = anonymizer.run(config=config, data=stub_input) + + eval_df = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_REWRITTEN_TEXT: ["Beth works at Globex"], + "utility_score": [0.85], + "leakage_mass": [0.3], + "weighted_leakage_rate": [0.23], + "any_high_leaked": [False], + "needs_human_review": [False], + COL_JUDGE_EVALUATION: [None], + COL_DETECTION_VALID: [0.9], + } + ) + rewrite_runner.evaluate.return_value = RewriteResult(dataframe=eval_df, failed_records=[]) + + evaluated = anonymizer.evaluate(run_result) + + assert COL_DETECTION_VALID in evaluated.dataframe.columns + + +def test_evaluate_rewrite_raises_without_rewrite_config() -> None: + """evaluate() must raise ValueError when result has no rewrite_config and no replace_method.""" + anonymizer, _, _, _ = _make_anonymizer() + bare_result = SimpleNamespace( + dataframe=pd.DataFrame(), + trace_dataframe=pd.DataFrame(), + resolved_text_column="text", + rewrite_config=None, + ) + + with pytest.raises(ValueError): + anonymizer.evaluate(bare_result) # type: ignore[arg-type] diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py index 199d1d86..c2cb1399 100644 --- a/tests/interface/test_display.py +++ b/tests/interface/test_display.py @@ -11,6 +11,7 @@ from anonymizer.engine.constants import ( COL_DETECTED_ENTITIES, + COL_DETECTION_VALID, COL_FINAL_ENTITIES, COL_JUDGE_EVALUATION, COL_REPLACEMENT_MAP, @@ -21,6 +22,7 @@ from anonymizer.engine.schemas.rewrite import EntityDispositionSchema, SensitivityDispositionSchema from anonymizer.interface.display import ( _build_replaced_entities, + _extract_judge_scores, _normalize_replacement_map, _render_highlighted_text, _verdict_badge, @@ -564,9 +566,9 @@ def test_render_record_html_rewrite_mode_with_judge_scores() -> None: } ) result = render_record_html(row, record_index=0) - assert f"{PRIVACY_RUBRIC.name}: high" in result - assert f"{QUALITY_RUBRIC.name}: high" in result - assert 
f"{STYLE_RUBRIC.name}: medium" in result + assert f"{PRIVACY_RUBRIC.name}: high" in result + assert f"{QUALITY_RUBRIC.name}: high" in result + assert f"{STYLE_RUBRIC.name}: medium" in result def test_render_record_html_rewrite_mode_nan_judge_column_does_not_warn( @@ -665,3 +667,64 @@ def test_render_record_html_replace_mode_unchanged_when_no_rewritten_column() -> assert "Replacement Map" in result assert "Rewritten" not in result assert "Scores" not in result + + +# --------------------------------------------------------------------------- +# Tests: _extract_judge_scores +# --------------------------------------------------------------------------- + + +def test_extract_judge_scores_returns_string_scores() -> None: + raw = { + "privacy": {"score": "high", "reasoning": "good"}, + "quality": {"score": "medium", "reasoning": "ok"}, + "style": {"score": "low", "reasoning": "rough"}, + } + result = _extract_judge_scores(raw) + assert result == [("privacy", "high"), ("quality", "medium"), ("style", "low")] + + +def test_extract_judge_scores_categorical_not_silently_empty() -> None: + """String scores must not be silently dropped (old int() cast raised ValueError).""" + raw = {"privacy": {"score": "high", "reasoning": "..."}} + result = _extract_judge_scores(raw) + assert len(result) == 1 + assert result[0] == ("privacy", "high") + + +# --------------------------------------------------------------------------- +# Tests: detection_valid and label rendering +# --------------------------------------------------------------------------- + + +def test_detection_valid_rendered_in_main_scores_section() -> None: + row = pd.Series( + { + "text": "Alice works at Acme", + "text_rewritten": "Beth works at Globex", + COL_DETECTED_ENTITIES: {"entities": []}, + "utility_score": 0.9, + "leakage_mass": 0.1, + "needs_human_review": False, + COL_DETECTION_VALID: 0.75, + } + ) + result = render_record_html(row, record_index=0) + assert "Detection Validity" in result + assert "0.75" in 
result + + +def test_rewrite_needs_human_review_label_is_rewrite_need_review() -> None: + row = pd.Series( + { + "text": "Alice works at Acme", + "text_rewritten": "Beth works at Globex", + COL_DETECTED_ENTITIES: {"entities": []}, + "utility_score": 0.9, + "leakage_mass": 0.1, + "needs_human_review": True, + } + ) + result = render_record_html(row, record_index=0) + assert "Rewrite Need Review" in result + assert "Needs Review:" not in result From 8a1a01397c41620bfa6be967ab9746e2345518c5 Mon Sep 17 00:00:00 2001 From: memadi Date: Fri, 12 Jun 2026 10:22:07 -0700 Subject: [PATCH 11/15] remove plan Signed-off-by: memadi --- plans/rewrite-evaluation/plan.md | 391 ------------------------------- 1 file changed, 391 deletions(-) delete mode 100644 plans/rewrite-evaluation/plan.md diff --git a/plans/rewrite-evaluation/plan.md b/plans/rewrite-evaluation/plan.md deleted file mode 100644 index ec7a241f..00000000 --- a/plans/rewrite-evaluation/plan.md +++ /dev/null @@ -1,391 +0,0 @@ -# Rewrite Evaluation Improvements — Implementation Plan - -## Problem - -The rewrite evaluation has four related issues: - -- **Judge scoring is baked into `run()` / `preview()`** — the final holistic judge (privacy / quality / style scores) runs unconditionally as part of the rewrite pipeline. Replace mode separates this into a dedicated `anonymizer.evaluate()` call. Rewrite has no equivalent, forcing users to pay the judge cost on every run even during fast iteration. -- **No detection validity score in rewrite mode** — `anonymizer.evaluate()` produces a `detection_valid` column for replace mode (via `DetectionJudgeWorkflow`). Rewrite mode runs the same GLiNER + LLM detection pipeline but never scores its accuracy. -- **Judge scores are 1–10 integers that saturate** — observed outputs cluster at the extremes (8–10 or 1–2), making the middle of the scale nearly unused. 
A 3-level categorical (`low` / `medium` / `high`) better matches the distribution, removes false precision, and makes rubric definitions more actionable. -- **"Naturalness" is an ambiguous name** — the dimension measures writing style and readability of the output text, not faithfulness to the original or any other concept the word might suggest. `style` is a clearer term for this and is unambiguous. - ---- - -## Design Decision: categorical scale - -**`low / medium / high`** is preferred over a boolean (`pass / fail`): - -- Boolean formally acknowledges the binary behaviour but loses the middle bucket entirely. A rewrite that is "mostly fluent but has one awkward sentence" must be called either passing or failing, which is worse for human review triage. -- `low / medium / high` gives reviewers actionable signal: `medium` means "worth a look but not a clear failure", which is exactly the category that surfaces regressions in prompt or model changes. -- The `Score` class already accepts string option keys, so no framework changes are needed. 
- -Rubric anchors for each dimension: - -| Score | Privacy | Quality | Style | -|---|---|---|---| -| `high` | Original direct identifiers removed; remaining quasi-identifiers create low linkage risk | Important meaning, facts, and structure fully preserved | Reads as natural, coherent, human-written prose | -| `medium` | No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk | Most content preserved; minor details lost or slightly distorted | Mostly readable; isolated awkward phrasing or stiff transitions | -| `low` | One or more original direct identifiers or near-equivalents remain, or the record is easily linkable | Material loss of important information, contradictions, or distorted core meaning | Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel | - ---- - -## Design Decision: `run()` vs `evaluate()` separation - -- `run()` outputs `utility_score`, `leakage_mass`, `weighted_leakage_rate`, `any_high_leaked`, and `needs_human_review` unchanged — the repair loop requires them and they are immediately useful after a run. -- `evaluate()` adds `detection_valid` (0–1) and the holistic judge scores (`privacy` / `quality` / `style`) on top of the existing `run()` output. - -This avoids re-running the repair loop in `evaluate()`: if `needs_human_review=True`, that is already the exhausted repair state and `evaluate()` simply reads the metrics already present. - ---- - -## Scope - -No new public API symbols beyond extending `EvaluateConfig` and `EvaluateModelSelection`. -All changes are backwards-compatible for replace-mode users. 
- ---- - -## Files Changed - -| File | Change | -|---|---| -| `src/anonymizer/engine/rewrite/final_judge.py` | Rename `NATURALNESS_RUBRIC` → `STYLE_RUBRIC`; change options to `low/medium/high`; update `_judge_prompt` scoring instructions; update `scores=` list; update `FinalJudgeWorkflow.columns()` signature to accept `EvaluateModelSelection` instead of `RewriteModelSelection`; remove `COL_NEEDS_HUMAN_REVIEW` from its column output (see Step 2) | -| `src/anonymizer/engine/rewrite/rewrite_workflow.py` | Remove `_run_final_judge` call from `run()`; add `evaluate()` method that runs detection judge + final judge | -| `src/anonymizer/interface/results.py` | Add `rewrite_config: PrivacyGoal \| None = None` field to `AnonymizerResult` and `PreviewResult`; set it during rewrite `run()` analogous to `replace_method` | -| `src/anonymizer/interface/anonymizer.py` | Extend `evaluate()` to dispatch on `rewrite_config`; add `COL_JUDGE_EVALUATION` + `COL_DETECTION_VALID` to the rewrite allowed-column set in `_build_user_dataframe` | -| `src/anonymizer/config/models.py` | Add `rewrite_judge` alias to `EvaluateModelSelection`; remove `judge` from `RewriteModelSelection` | -| `src/anonymizer/config/anonymizer_config.py` | `EvaluateConfig` is no longer a placeholder — add a docstring clarifying it covers both replace and rewrite evaluation | -| `src/anonymizer/config/default_model_configs/evaluate.yaml` | Add `rewrite_judge: nemotron-30b-thinking` — required to avoid a Pydantic startup crash when the new field lands in `EvaluateModelSelection` | -| `src/anonymizer/config/default_model_configs/rewrite.yaml` | Remove `judge` entry — it moves to `evaluate.yaml` | -| `src/anonymizer/engine/ndd/model_loader.py` | Update `validate_model_alias_references` to check `evaluate.rewrite_judge` when `check_evaluate=True` on a rewrite result | -| `src/anonymizer/interface/display.py` | Update `_render_scores_section` to not append `/10`; fix `_extract_judge_scores` to not cast `value["score"]` 
through `int()` — for string keys `"low"/"medium"/"high"` this raises `ValueError` which is silently swallowed, causing the judge section to never render; update return type to `list[tuple[str, int \| str]]` | -| `src/anonymizer/engine/schemas/rewrite.py` | Update any schema or docstring that references the 1–10 scale or "naturalness" | -| `docs/concepts/rewrite.md` | Update judge score documentation (rename naturalness → fluency, describe categorical scale, move judge to evaluate step) | -| `skills/anonymizer/SKILL.md` | Update evaluate workflow section with rewrite evaluate example | -| `tests/engine/rewrite/test_final_judge.py` | Update rubric tests; add tests for categorical output parsing | -| `tests/interface/test_anonymizer.py` | Add tests for `evaluate()` on a rewrite result | - -`src/anonymizer/engine/evaluation/detection_judge.py` — no changes needed; already works on any trace dataframe that has `COL_TEXT` and `COL_ENTITIES_BY_VALUE`. - ---- - -## Step 1 — Rename and recategorise the judge rubrics (`final_judge.py`) - -### Rename - -```python -# before -NATURALNESS_RUBRIC = Score(name="naturalness", ...) - -# after -STYLE_RUBRIC = Score(name="style", ...) 
-``` - -Update `scores=` list in `FinalJudgeWorkflow.columns()`: - -```python -scores=[PRIVACY_RUBRIC, QUALITY_RUBRIC, STYLE_RUBRIC], -``` - -### Change options to `low / medium / high` - -Replace the integer-keyed `options` dicts with string keys on all three rubrics: - -```python -PRIVACY_RUBRIC = Score( - name="privacy", - description="Privacy protection — how well the rewrite removes linkage risk to the original record.", - options={ - "high": "Original direct identifiers removed; remaining quasi-identifiers create low linkage risk.", - "medium": "No obvious direct identifiers remain, but a distinctive quasi-identifier bundle creates noticeable linkage risk.", - "low" : "The record is easily or near-certainly linkable back to the original: key direct identifiers remain, or enough identifying detail survives that re-identification requires minimal effort regardless of how many entities were successfully transformed.", - }, -) - -QUALITY_RUBRIC = Score( - name="quality", - description="Content quality — how well important meaning, facts, and structure are preserved.", - options={ - "high": "Important meaning, facts, and structure fully preserved.", - "medium": "Most content preserved; minor details lost or slightly distorted.", - "low": "Material loss of important information, contradictions, or distorted core meaning.", - }, -) - -STYLE_RUBRIC = Score( - name="style", - description="Writing style — does the rewritten text read as natural, grammatically correct, human-written prose?", - options={ - "high": "Reads as natural, coherent, human-written prose.", - "medium": "Mostly readable; isolated awkward phrasing or stiff transitions.", - "low": "Noticeably unnatural; broken grammar, placeholder-like language, or machine-generated feel.", - }, -) -``` - -### Update `_judge_prompt` - -Replace the three `<*_scoring_instructions>` blocks to match the new categorical rubric anchors. The core guidance (assess independently, don't penalise necessary changes, etc.) 
is preserved — only the scale reference changes: - -``` - - ...existing contextual guidance (linkage risk, quasi-identifiers, etc.) preserved verbatim... - - Score as: - - high — original direct identifiers removed; remaining details create low linkage risk - - medium — no obvious direct identifiers, but a distinctive quasi-identifier bundle creates - noticeable linkage risk - - low — one or more direct identifiers or near-equivalents remain, or easily linkable - - - - ...existing guidance preserved... - - Score as: - - high — important meaning, facts, and structure fully preserved - - medium — most content preserved; minor details lost or slightly distorted - - low — material loss of important information, contradictions, or distorted core meaning - - - - ...naturalness guidance renamed and preserved... - - Score as: - - high — natural, coherent, human-written prose - - medium — mostly readable; isolated awkward phrasing or stiff transitions - - low — noticeably unnatural; broken grammar, placeholder-like language, or machine feel - -``` - -The `` block changes "naturalness of writing" to "style of writing". - ---- - -## Step 2 — Move final judge out of `run()` (`rewrite_workflow.py`) - -Remove the `_run_final_judge` call from `RewriteWorkflow.run()` and the `COL_JUDGE_EVALUATION` default from `_PASSTHROUGH_DEFAULTS`. - -Add a standalone `evaluate()` method on `RewriteWorkflow`: - -```python -def evaluate( - self, - df: pd.DataFrame, - *, - model_configs: list[ModelConfig], - selected_models: EvaluateModelSelection, - privacy_goal: PrivacyGoal, - preview_num_records: int | None = None, -) -> RewriteResult: - """Run detection validity judge and final holistic judge on a completed rewrite result. - - Mirrors ReplacementWorkflow.evaluate(): takes the trace dataframe from a - prior run() / preview() and appends COL_DETECTION_VALID, - COL_DETECTION_INVALID_ENTITIES, and COL_JUDGE_EVALUATION. - """ -``` - -Inside `evaluate()`: - -1. 
**Split entity vs passthrough rows** using the same `split_rows` / `_has_entities` pattern as `run()`. Passthrough rows receive `COL_DETECTION_VALID = None` and `COL_JUDGE_EVALUATION = None` as defaults — running either judge on them produces vacuously correct or misleadingly high scores for records that were never anonymized. -2. Run `DetectionJudgeWorkflow` against entity rows only, using `COL_ENTITIES_BY_VALUE` + `COL_TEXT`. The score is surfaced as a **0–1 value** matching the scale of `utility_score` and `leakage_mass`. `COL_DETECTION_VALID` appears alongside the objective scores in the user dataframe and display, **not** grouped with the judge scores (privacy/quality/style). It does **not** influence `COL_NEEDS_HUMAN_REVIEW`. -3. Run `FinalJudgeWorkflow` against entity rows only, for privacy / quality / style scores. -4. Merge entity and passthrough rows and return a new `RewriteResult`. - -### `COL_NEEDS_HUMAN_REVIEW` must not be overwritten - -`COL_NEEDS_HUMAN_REVIEW` is set correctly during `run()` based on objective metrics (utility/leakage thresholds). `evaluate()` must never touch it — judge scores and detection validity do not influence the human review decision. Remove `COL_NEEDS_HUMAN_REVIEW` from `FinalJudgeWorkflow.columns()` and produce it instead at the end of `_run_evaluate_repair_loop()` in `rewrite_workflow.py`, where the threshold params are already available. - -### Telemetry — `rewrite.judge` reference - -`anonymizer.py` references `rewrite.judge` in two places that will break when the field is removed from `RewriteModelSelection`: - -- `_collect_step_models()` — `"judge": rewrite.judge if has_rewrite else NOT_APPLICABLE` -- `_build_telemetry_event()` — `judge_model=models["judge"]` - -Resolution: drop the `judge` key from the `_collect_step_models` rewrite block and from `_build_telemetry_event`. The judge is now an evaluate-time role, not a run-time role, so it doesn't belong in run telemetry. 
`anonymizer.py` is already in the Files Changed table for the `evaluate()` dispatch; this telemetry fix is part of the same change. - ---- - -## Step 3 — Wire `Anonymizer.evaluate()` for rewrite results (`anonymizer.py`) - -The existing `evaluate()` currently raises if `output.replace_method` is `None`: - -```python -# before -replace_method = getattr(output, "replace_method", None) -if replace_method is None: - raise ValueError(...) -``` - -Extend the dispatch: - -```python -rewrite_config = getattr(output, "rewrite_config", None) -replace_method = getattr(output, "replace_method", None) - -if rewrite_config is not None: - # Rewrite evaluate path - ...call self._rewrite_runner.evaluate(...) -elif replace_method is not None: - # Replace evaluate path (unchanged) - ... -else: - raise ValueError(...) -``` - -`AnonymizerResult` / `PreviewResult` in `results.py` need a `rewrite_config` field (carrying `PrivacyGoal`) set during `run()` in rewrite mode — analogous to how `replace_method` is set in replace mode. - -### Update `_build_user_dataframe` - -Add `COL_JUDGE_EVALUATION`, `COL_DETECTION_VALID`, and `COL_DETECTION_INVALID_ENTITIES` to the rewrite allowed set. The objective metrics (`utility_score`, `leakage_mass`, etc.) are already in the allowed set and remain there — they are present after `run()`. The new columns are only present after `evaluate()` and are silently omitted until then. 
- -```python -if f"{text_col}_rewritten" in t.columns: - allowed = { - text_col, - f"{text_col}_rewritten", - COL_UTILITY_SCORE, - COL_LEAKAGE_MASS, - COL_WEIGHTED_LEAKAGE_RATE, - COL_ANY_HIGH_LEAKED, - COL_NEEDS_HUMAN_REVIEW, - COL_DETECTION_VALID, # ← new, only present after evaluate() - COL_DETECTION_INVALID_ENTITIES, # ← new, only present after evaluate() - COL_JUDGE_EVALUATION, # ← new, only present after evaluate() - } -``` - ---- - -## Step 4 — Update model selection (`models.py`) - -Move the `judge` alias out of `RewriteModelSelection` and into `EvaluateModelSelection`: - -```python -class EvaluateModelSelection(BaseModel): - detection_validity_judge: str - replace_type_fidelity_judge: str - replace_relational_consistency_judge: str - replace_attribute_fidelity_judge: str - rewrite_judge: str # ← new: holistic privacy/quality/style judge for rewrite evaluate -``` - -`RewriteModelSelection.judge` is removed (or kept with a deprecation note if model YAML defaults need a phased migration). - -Update `engine/ndd/model_loader.py` validation to check `evaluate.rewrite_judge` when `check_evaluate=True` and the output is a rewrite result. - -Update `FinalJudgeWorkflow.columns()` to accept `EvaluateModelSelection` instead of `RewriteModelSelection`, and resolve the judge alias via `evaluate.rewrite_judge` instead of `rewrite.judge`. - ---- - -## Step 5 — Fix display rendering (`display.py`) - -Line 449 currently renders: - -```python -score_strs = [f"{name}: {score}/10" for name, score in judge_scores] -``` - -Change to: - -```python -score_strs = [f"{name}: {score}" for name, score in judge_scores] -``` - -`_extract_judge_scores` returns `list[tuple[str, int]]` — update the return type to `list[tuple[str, int | str]]` since scores are now strings. - -### Detection validity placement - -`COL_DETECTION_VALID` must be rendered in the **main scores section** (alongside `utility_score` and `leakage_mass`), not inside the judge scores block. 
Update `_render_scores_section` to include it there when present. The value is already 0–1 so no `/10` suffix is needed and no additional scaling is required. - -### "Rewrite Need Review" label - -In rewrite mode the `COL_NEEDS_HUMAN_REVIEW` column must be displayed as **"Rewrite Need Review"** (not the generic "Needs Review" used in replace mode). Update the label resolution in `display.py` to emit the rewrite-specific label when rendering a rewrite result. - -### Invalid-entities expandable table - -When `COL_DETECTION_VALID < 1.0` and `COL_DETECTION_INVALID_ENTITIES` contains entries, append a collapsed `
<details>` block below the scores `<div>
`. The summary line reads "Show N flagged detection(s)"; expanding it reveals a three-column table (Value / Label / Reason). This is conditional — records with `detection_valid == 1.0` or no invalid-entity entries show no extra element. The `_extract_judge_scores` fix (remove `int()` cast) is required for string scores to appear at all; without it, `ValueError` was silently swallowed for `"low"/"medium"/"high"` values, causing the judge section to never render. - ---- - -## Step 6 — Docs and skills - -### `docs/concepts/rewrite.md` - -- Output columns table: remove `judge evaluation` from the `run()` output section; add a new **Evaluation** subsection (parallel to the existing replace evaluate docs) showing the `evaluate()` call pattern and what columns it adds. -- Update the judge score description: rename "naturalness" → "style", describe `low/medium/high` scale; note detection validity appears in the main scores section (0–1) not judge scores; note `COL_NEEDS_HUMAN_REVIEW` is labelled "Rewrite Need Review" in the output column table. -- Model roles table: move `judge` from the rewrite pipeline roles to the evaluate roles. - -### `skills/anonymizer/SKILL.md` - -Add a rewrite evaluate workflow example alongside the existing replace evaluate example: - -```python -# after rewrite run / preview: -evaluated = anonymizer.evaluate(result) -evaluated.display_record(0) -# → adds detection_valid (0–1, main scores section), judge evaluation (privacy/quality/style: low/medium/high) -``` - ---- - -## Step 7 — Tests - -### Update existing tests - -- `tests/engine/rewrite/test_final_judge.py` — update rubric option assertions for `low/medium/high`; update any test that checks score parsing for integer values; rename all `naturalness` references to `style`. -- `tests/interface/test_anonymizer.py` — update assertions that check `COL_JUDGE_EVALUATION` is in the `run()` output (it now only appears after `evaluate()`). 
- -### New tests to add - -``` -# final_judge.py -test_style_rubric_has_low_medium_high_options -test_privacy_rubric_has_low_medium_high_options -test_quality_rubric_has_low_medium_high_options -test_judge_prompt_references_style_not_naturalness -test_judge_prompt_references_categorical_scale - -# rewrite_workflow.py -test_run_does_not_produce_judge_evaluation_column -test_evaluate_produces_judge_evaluation_column -test_evaluate_produces_detection_valid_column - -# anonymizer.py -test_evaluate_rewrite_result_adds_judge_columns -test_evaluate_rewrite_result_adds_detection_valid -test_evaluate_rewrite_raises_without_rewrite_config -test_run_rewrite_does_not_include_judge_in_user_dataframe - -# display.py -test_render_scores_section_categorical_no_slash_10 -test_extract_judge_scores_returns_string_scores -test_extract_judge_scores_categorical_not_silently_empty -test_detection_valid_rendered_in_main_scores_section -test_rewrite_needs_human_review_label_is_rewrite_need_review - -# rewrite_workflow.py — passthrough + needs_human_review -test_evaluate_skips_passthrough_rows -test_evaluate_passthrough_rows_get_none_judge_defaults -test_run_needs_human_review_not_overwritten_by_evaluate - -# anonymizer.py — telemetry -test_run_rewrite_telemetry_has_no_judge_field -``` - -All new tests construct result objects directly — no real pipeline or LLM calls. - ---- - -## Implementation Order - -1. Update rubrics and prompt in `final_judge.py` (rename naturalness → style, 1-10 → low/medium/high) -2. Move `_run_final_judge` out of `RewriteWorkflow.run()`; add `RewriteWorkflow.evaluate()` -3. Add `rewrite_config` field to `AnonymizerResult` / `PreviewResult`; wire `Anonymizer.evaluate()` for rewrite -4. Move `judge` alias from `RewriteModelSelection` to `EvaluateModelSelection` (as `rewrite_judge`); update model loader validation -5. Update `_build_user_dataframe` allowed columns for rewrite -6. Fix `display.py` score rendering -7. 
Update `docs/concepts/rewrite.md` and `skills/anonymizer/SKILL.md` -8. Update existing tests; add new tests -9. Run `make format && make typecheck && make test` From 52e7cbbcca322c3617ffd2be23966eb3c8c249ac Mon Sep 17 00:00:00 2001 From: memadi Date: Fri, 12 Jun 2026 11:28:54 -0700 Subject: [PATCH 12/15] make judge_evaluation a user facing value Signed-off-by: memadi --- docs/concepts/evaluation.md | 8 ++++---- docs/troubleshooting.md | 2 +- src/anonymizer/engine/constants.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/concepts/evaluation.md b/docs/concepts/evaluation.md index 25021b23..0316dfff 100644 --- a/docs/concepts/evaluation.md +++ b/docs/concepts/evaluation.md @@ -116,7 +116,7 @@ Relationships inspected include geographic pairings (city ↔ state, city ↔ po --- -## Reading replace evaluation results +### Reading replace evaluation results `display_record()` renders a formatted per-record view that includes all four judge verdicts alongside the replacement map: @@ -141,7 +141,7 @@ Use `trace_dataframe` for the full internal trace including raw judge outputs. --- -## Model roles +### Model roles All four judges default to `gpt-oss-120b`. Defaults are defined in [`evaluate.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/evaluate.yaml). Override them by passing a `model_configs` YAML to `Anonymizer(model_configs=...)` — see [Models](models.md) for the full override pattern. @@ -262,7 +262,7 @@ The three rubric scores are stored together under the `judge_evaluation` column --- -## Reading rewrite evaluation results +### Reading rewrite evaluation results `display_record()` renders a formatted per-record view that includes the detection validity fraction and all three judge rubrics alongside the rewritten text: @@ -280,7 +280,7 @@ Use `trace_dataframe` for the full internal trace including raw judge outputs. 
--- -## Model roles (rewrite evaluation) +### Model roles The rewrite quality judge defaults to `nemotron-30b-thinking`. The detection validity judge shares the `detection_validity_judge` role used by replace evaluation. Defaults are defined in [`evaluate.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/evaluate.yaml). Override them via `model_configs`: diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 068bc7bb..7bf655bb 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -76,7 +76,7 @@ Key columns: | `weighted_leakage_rate` | Leakage normalized by maximum possible leakage | | `any_high_leaked` | Whether any high-sensitivity entity leaked through | | `needs_human_review` | Crossed the configured threshold | -| `_judge_evaluation` | Final-judge qualitative comments | +| `judge_evaluation` | Final-judge qualitative comments | ### Re-run with `Annotate` to see detection output diff --git a/src/anonymizer/engine/constants.py b/src/anonymizer/engine/constants.py index fdfecadf..7f4f5bf3 100644 --- a/src/anonymizer/engine/constants.py +++ b/src/anonymizer/engine/constants.py @@ -107,7 +107,7 @@ COL_LEAKED_PRIVACY_ITEMS = "_leaked_privacy_items" COL_REWRITTEN_TEXT_NEXT = COL_REWRITTEN_TEXT + "__next" COL_REPAIR_ITERATIONS = "_repair_iterations" -COL_JUDGE_EVALUATION = "_judge_evaluation" +COL_JUDGE_EVALUATION = "judge_evaluation" # User-facing output columns COL_UTILITY_SCORE = "utility_score" From 7853fbd4d07e080e5b2b541ff4d2fcd5e0f20d2f Mon Sep 17 00:00:00 2001 From: memadi Date: Fri, 12 Jun 2026 12:52:52 -0700 Subject: [PATCH 13/15] nit Signed-off-by: memadi --- docs/concepts/evaluation.md | 2 +- docs/notebook_source/04_rewriting_biographies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/concepts/evaluation.md b/docs/concepts/evaluation.md index 0316dfff..a2adb301 100644 --- a/docs/concepts/evaluation.md +++ b/docs/concepts/evaluation.md @@ -200,7 
+200,7 @@ evaluated = anonymizer.evaluate(loaded) ### Entity Detection Judge -Same judge as in replace mode — see [Entity Detection Judge](#entity-detection-judge) above. In rewrite mode, `detection_valid` is returned as a **0–1 fraction** (the share of detected entities that passed), rather than a boolean. A value of `1.0` means all detections are valid; lower values indicate the fraction of entities the judge flagged as incorrect. +Same judge as in replace mode — see [Entity Detection Judge](#entity-detection-judge) above. In rewrite mode, `detection_valid` is returned as a **0–1 fraction** (the share of detected entities that passed), rather than a boolean. A value of `1.0` means all detections are valid; lower values mean more entities were flagged — the value itself is the fraction that passed. | Output column | Type | Description | |---|---|---| diff --git a/docs/notebook_source/04_rewriting_biographies.py b/docs/notebook_source/04_rewriting_biographies.py index 38d782ce..9a961a3d 100644 --- a/docs/notebook_source/04_rewriting_biographies.py +++ b/docs/notebook_source/04_rewriting_biographies.py @@ -24,8 +24,8 @@ # 2. Classifies the domain and assigns sensitivity dispositions # 3. Generates a rewritten version that obscures sensitive entities # 4. Evaluates quality (utility) and privacy (leakage) with an automated repair loop +# 5. Runs a final optional LLM judge for informational scores # -# After `run()`, call `Anonymizer.evaluate()` for optional LLM-as-judge scoring. 
# # #### 📚 What you'll learn # From e078d39c65d56c79c67672ed8fd208276205e44e Mon Sep 17 00:00:00 2001 From: memadi Date: Fri, 12 Jun 2026 14:01:10 -0700 Subject: [PATCH 14/15] fix some bugs-add some tests Signed-off-by: memadi --- .../engine/evaluation/detection_judge.py | 18 +++- .../engine/rewrite/rewrite_workflow.py | 19 ++++- src/anonymizer/interface/anonymizer.py | 2 +- src/anonymizer/interface/display.py | 13 ++- tests/engine/test_rewrite_workflow.py | 83 +++++++++++++++---- tests/interface/test_anonymizer_interface.py | 47 +++++++++++ tests/interface/test_display.py | 53 ++++++++++++ 7 files changed, 212 insertions(+), 23 deletions(-) diff --git a/src/anonymizer/engine/evaluation/detection_judge.py b/src/anonymizer/engine/evaluation/detection_judge.py index 5d0349fe..8f9495ec 100644 --- a/src/anonymizer/engine/evaluation/detection_judge.py +++ b/src/anonymizer/engine/evaluation/detection_judge.py @@ -249,7 +249,18 @@ def prepare( ``_entity_examples_for_detection_judge`` populated. 
""" working_df = dataframe.copy() - parsed = working_df[entities_column].apply(EntitiesByValueSchema.from_raw) + + def _safe_parse(raw: object) -> EntitiesByValueSchema: + try: + return EntitiesByValueSchema.from_raw(raw) + except Exception: + logger.warning( + "Could not parse entities_by_value for a row; treating as no entities.", + exc_info=True, + ) + return EntitiesByValueSchema(entities_by_value=[]) + + parsed = working_df[entities_column].apply(_safe_parse) working_df[_ENTITIES_FOR_JUDGE_COL] = parsed.apply(_entities_for_judge) working_df[_ENTITY_EXAMPLES_FOR_JUDGE_COL] = parsed.apply(_label_examples_for_judge) return working_df @@ -311,6 +322,11 @@ def evaluate( working_df = self.prepare(dataframe, entities_column=entities_column) entity_rows, passthrough_rows = split_rows(working_df, column=_ENTITIES_FOR_JUDGE_COL, predicate=bool) + if not passthrough_rows.empty: + logger.info( + "%d passthrough row(s) have no detected entities — detection_valid set to True (trivially valid).", + len(passthrough_rows), + ) passthrough_rows[COL_DETECTION_JUDGE] = [ {"all_valid": True, "invalid_entities": []} for _ in range(len(passthrough_rows)) ] diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index 8807bb96..6c2fe7e1 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -59,6 +59,8 @@ def _detection_valid_fraction(row: pd.Series) -> float | None: The detection judge stores a bool (all_valid) but rewrite evaluate surfaces a fraction so it sits on the same scale as utility_score and leakage_mass. + Returns None when the score cannot be computed (judge unavailable or entity + parsing failed). 
""" valid = row.get(COL_DETECTION_VALID) if valid is None: @@ -72,9 +74,15 @@ def _detection_valid_fraction(row: pd.Series) -> float | None: len(e.labels) for e in EntitiesByValueSchema.from_raw(row.get(COL_ENTITIES_BY_VALUE)).entities_by_value ) except Exception: - total = 0 + logger.warning( + "Could not parse entities_by_value to compute detection_valid fraction; defaulting to None.", + exc_info=True, + ) + return None if total == 0: - return 1.0 + # Reachable only when valid is False (True returns early above). + # Judge flagged invalid detections but no entities were found — score is uncomputable. + return None return max(0.0, (total - invalid_count) / total) @@ -416,7 +424,12 @@ def evaluate( entity_rows, passthrough_rows = split_rows(df, column=COL_ENTITIES_BY_VALUE, predicate=_has_entities) passthrough_rows = passthrough_rows.copy() - passthrough_rows[COL_DETECTION_VALID] = None + if not passthrough_rows.empty: + logger.info( + "%d passthrough row(s) have no detected entities — detection_valid set to 1.0 (trivially valid).", + len(passthrough_rows), + ) + passthrough_rows[COL_DETECTION_VALID] = 1.0 passthrough_rows[COL_DETECTION_INVALID_ENTITIES] = [[] for _ in range(len(passthrough_rows))] passthrough_rows[COL_JUDGE_EVALUATION] = None diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index 299d8b7b..098de896 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -290,7 +290,7 @@ def evaluate( validate_model_alias_references( self._model_configs, self._selected_models, - check_rewrite=True, + check_rewrite=False, check_evaluate=True, ) except ValueError as exc: diff --git a/src/anonymizer/interface/display.py b/src/anonymizer/interface/display.py index 487444a4..659f3453 100644 --- a/src/anonymizer/interface/display.py +++ b/src/anonymizer/interface/display.py @@ -139,7 +139,7 @@ def _render_rewrite_html(row: pd.Series, *, text_col: str, record_index: int | N entities = 
_resolve_display_entities(row) original_html = _render_highlighted_text(text, entities) rewritten_html = f"{html.escape(rewritten_text)}" - scores_html = _render_scores_section(row) + scores_html = _render_scores_section(row, is_rewrite=True) disposition_html = _render_disposition_table(row) index_label = f" (record {record_index})" if record_index is not None else "" @@ -413,7 +413,7 @@ def _normalize_replacement_map(raw: str | dict | object) -> list[dict[str, str]] return result -def _render_scores_section(row: pd.Series) -> str: +def _render_scores_section(row: pd.Series, *, is_rewrite: bool = False) -> str: """Render scores in up to three rows: objective metrics | detection validity | judge.""" section_rows: list[str] = [] @@ -435,7 +435,6 @@ def _render_scores_section(row: pd.Series) -> str: f"{weighted_leakage_rate:.2f}" ) if needs_review is not None: - is_rewrite = "rewritten" in "".join(str(k) for k in row.index) label = "Rewrite Need Review" if is_rewrite else "Needs Review" badge_color = "#ef4444" if needs_review else "#22c55e" badge_text = "Yes" if needs_review else "No" @@ -448,7 +447,13 @@ def _render_scores_section(row: pd.Series) -> str: # --- Row 2: detection validity inline with flagged-entities dropdown --- detection_valid = row.get(COL_DETECTION_VALID) - if detection_valid is not None: + if COL_DETECTION_VALID in row.index and (detection_valid is None or pd.isna(detection_valid)): + section_rows.append( + "
" + "Detection Validity: " + "Unavailable
" + ) + elif detection_valid is not None and not pd.isna(detection_valid): det_span = ( f"Detection Validity: {float(detection_valid):.2f}" ) diff --git a/tests/engine/test_rewrite_workflow.py b/tests/engine/test_rewrite_workflow.py index 716e0b07..c47af807 100644 --- a/tests/engine/test_rewrite_workflow.py +++ b/tests/engine/test_rewrite_workflow.py @@ -27,8 +27,9 @@ COL_UTILITY_SCORE, COL_WEIGHTED_LEAKAGE_RATE, ) +from anonymizer.engine.constants import COL_DETECTION_INVALID_ENTITIES from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, WorkflowRunResult -from anonymizer.engine.rewrite.rewrite_workflow import RewriteWorkflow +from anonymizer.engine.rewrite.rewrite_workflow import RewriteWorkflow, _detection_valid_fraction _REPLACE_PATCH = "anonymizer.engine.rewrite.rewrite_workflow.LlmReplaceWorkflow" @@ -137,22 +138,23 @@ def stub_eval_df(stub_pipeline_df: pd.DataFrame) -> pd.DataFrame: @pytest.fixture def stub_judge_df(stub_eval_df: pd.DataFrame) -> pd.DataFrame: + """Fixture used by evaluate() tests — only judge-produced columns here.""" df = stub_eval_df.copy() df[COL_JUDGE_EVALUATION] = None - df[COL_NEEDS_HUMAN_REVIEW] = False return df def _standard_side_effect( pipeline_df: pd.DataFrame, eval_df: pd.DataFrame, - judge_df: pd.DataFrame, ) -> list[WorkflowRunResult]: - """Happy-path adapter side_effect: pipeline, evaluate, judge.""" + """Happy-path adapter side_effect for run(): pipeline then evaluate. + + The final-judge no longer runs inside run() — it only runs via evaluate(). 
+ """ return [ WorkflowRunResult(dataframe=pipeline_df, failed_records=[]), WorkflowRunResult(dataframe=eval_df, failed_records=[]), - WorkflowRunResult(dataframe=judge_df, failed_records=[]), ] @@ -249,10 +251,9 @@ def test_calls_sub_workflows_in_order( stub_replace_df: pd.DataFrame, stub_pipeline_df: pd.DataFrame, stub_eval_df: pd.DataFrame, - stub_judge_df: pd.DataFrame, ) -> None: adapter = Mock() - adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df) + adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df) with patch(_REPLACE_PATCH) as mock_replace_cls: _mock_replace(mock_replace_cls, stub_replace_df) @@ -287,7 +288,6 @@ def test_failed_records_accumulated_across_steps( stub_replace_df: pd.DataFrame, stub_pipeline_df: pd.DataFrame, stub_eval_df: pd.DataFrame, - stub_judge_df: pd.DataFrame, ) -> None: failed_pipeline = FailedRecord(record_id="a", step="rewrite-pipeline", reason="timeout") failed_eval = FailedRecord(record_id="b", step="rewrite-evaluate-0", reason="timeout") @@ -410,10 +410,9 @@ def test_repair_loop_exits_early_when_no_rows_need_repair( stub_replace_df: pd.DataFrame, stub_pipeline_df: pd.DataFrame, stub_eval_df: pd.DataFrame, - stub_judge_df: pd.DataFrame, ) -> None: adapter = Mock() - adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df) + adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df) with patch(_REPLACE_PATCH) as mock_replace_cls: _mock_replace(mock_replace_cls, stub_replace_df) @@ -642,10 +641,9 @@ def test_zero_max_repair_iterations_still_evaluates( stub_replace_df: pd.DataFrame, stub_pipeline_df: pd.DataFrame, stub_eval_df: pd.DataFrame, - stub_judge_df: pd.DataFrame, ) -> None: adapter = Mock() - adapter.run_workflow.side_effect = _standard_side_effect(stub_pipeline_df, stub_eval_df, stub_judge_df) + adapter.run_workflow.side_effect = 
_standard_side_effect(stub_pipeline_df, stub_eval_df) with patch(_REPLACE_PATCH) as mock_replace_cls: _mock_replace(mock_replace_cls, stub_replace_df) @@ -975,7 +973,7 @@ def test_evaluate_passthrough_rows_get_none_judge_defaults( stub_evaluate_model_selection, stub_eval_df: pd.DataFrame, ) -> None: - """Passthrough rows must have COL_JUDGE_EVALUATION=None and COL_DETECTION_VALID=None.""" + """Passthrough rows must have COL_JUDGE_EVALUATION=None and COL_DETECTION_VALID=1.0 (trivially valid).""" passthrough_row = stub_eval_df.iloc[0].to_dict() passthrough_row[COL_ENTITIES_BY_VALUE] = {"entities_by_value": []} mixed_df = pd.concat([stub_eval_df, pd.DataFrame([passthrough_row])], ignore_index=True) @@ -1003,7 +1001,7 @@ def test_evaluate_passthrough_rows_get_none_judge_defaults( result.dataframe[COL_ENTITIES_BY_VALUE].apply(lambda x: len(x.get("entities_by_value", [])) == 0) ] assert passthrough_result[COL_JUDGE_EVALUATION].iloc[0] is None - assert pd.isna(passthrough_result[COL_DETECTION_VALID].iloc[0]) + assert passthrough_result[COL_DETECTION_VALID].iloc[0] == 1.0 # --------------------------------------------------------------------------- @@ -1011,6 +1009,63 @@ def test_evaluate_passthrough_rows_get_none_judge_defaults( # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# Tests: _detection_valid_fraction +# --------------------------------------------------------------------------- + + +def test_detection_valid_fraction_returns_none_when_valid_is_none() -> None: + row = pd.Series({COL_DETECTION_VALID: None, COL_DETECTION_INVALID_ENTITIES: [], COL_ENTITIES_BY_VALUE: {}}) + assert _detection_valid_fraction(row) is None + + +def test_detection_valid_fraction_returns_1_when_all_valid() -> None: + row = pd.Series( + { + COL_DETECTION_VALID: True, + COL_DETECTION_INVALID_ENTITIES: [], + COL_ENTITIES_BY_VALUE: {"entities_by_value": [{"value": "Alice", "labels": 
["first_name"]}]}, + } + ) + assert _detection_valid_fraction(row) == 1.0 + + +def test_detection_valid_fraction_computes_correct_fraction() -> None: + entities = {"entities_by_value": [{"value": "Alice", "labels": ["first_name", "full_name"]}]} + row = pd.Series( + { + COL_DETECTION_VALID: False, + COL_DETECTION_INVALID_ENTITIES: [{"value": "Alice", "label": "full_name", "reasoning": "wrong label"}], + COL_ENTITIES_BY_VALUE: entities, + } + ) + result = _detection_valid_fraction(row) + assert result == pytest.approx(0.5) + + +def test_detection_valid_fraction_returns_none_on_parse_failure() -> None: + row = pd.Series( + { + COL_DETECTION_VALID: False, + COL_DETECTION_INVALID_ENTITIES: [{"value": "x", "label": "y", "reasoning": "z"}], + COL_ENTITIES_BY_VALUE: "not a valid schema payload <<<", + } + ) + assert _detection_valid_fraction(row) is None + + +def test_detection_valid_fraction_returns_none_when_total_is_zero_and_valid_false() -> None: + """valid=False with an empty entity list — judge flagged invalid but no entities found.""" + row = pd.Series( + { + COL_DETECTION_VALID: False, + COL_DETECTION_INVALID_ENTITIES: [], + COL_ENTITIES_BY_VALUE: {"entities_by_value": []}, + } + ) + assert _detection_valid_fraction(row) is None + + def test_run_needs_human_review_not_overwritten_by_evaluate( stub_model_configs: list[ModelConfig], stub_evaluate_model_selection, diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py index fc1ea879..8f6fb85b 100644 --- a/tests/interface/test_anonymizer_interface.py +++ b/tests/interface/test_anonymizer_interface.py @@ -29,6 +29,7 @@ from anonymizer.engine.ndd.adapter import FailedRecord from anonymizer.engine.replace.replace_runner import ReplacementResult, ReplacementWorkflow from anonymizer.engine.rewrite.rewrite_workflow import RewriteResult, RewriteWorkflow +from anonymizer.engine.ndd.model_loader import validate_model_alias_references from anonymizer.interface.anonymizer import 
Anonymizer, _resolve_model_providers from anonymizer.interface.errors import InvalidConfigError, InvalidInputError @@ -790,3 +791,49 @@ def test_evaluate_rewrite_raises_without_rewrite_config() -> None: with pytest.raises(ValueError): anonymizer.evaluate(bare_result) # type: ignore[arg-type] + + +def test_evaluate_rewrite_calls_validate_with_check_rewrite_false(stub_input: AnonymizerInput) -> None: + """evaluate() on a rewrite result must NOT validate rewrite pipeline model aliases. + + Passing check_rewrite=True would require domain-classifier / rewrite-generator + aliases that are irrelevant for post-hoc evaluation. This test asserts the call + uses check_rewrite=False so users with evaluate-only configs are not blocked. + """ + from unittest.mock import patch as _patch + + config = AnonymizerConfig(rewrite=Rewrite()) + anonymizer, _, _, rewrite_runner = _make_anonymizer() + + run_result = anonymizer.run(config=config, data=stub_input) + + eval_df = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_REWRITTEN_TEXT: ["Beth works at Globex"], + "utility_score": [0.85], + "leakage_mass": [0.3], + "weighted_leakage_rate": [0.23], + "any_high_leaked": [False], + "needs_human_review": [False], + COL_JUDGE_EVALUATION: [None], + COL_DETECTION_VALID: [1.0], + } + ) + rewrite_runner.evaluate.return_value = RewriteResult(dataframe=eval_df, failed_records=[]) + + with _patch( + "anonymizer.interface.anonymizer.validate_model_alias_references", + wraps=validate_model_alias_references, + ) as mock_validate: + anonymizer.evaluate(run_result) + + rewrite_eval_calls = [ + call for call in mock_validate.call_args_list if call.kwargs.get("check_evaluate") is True + ] + assert rewrite_eval_calls, "validate_model_alias_references was not called with check_evaluate=True" + for call in rewrite_eval_calls: + assert call.kwargs.get("check_rewrite") is False, ( + "evaluate() on a rewrite result must pass check_rewrite=False to avoid " + "requiring rewrite pipeline model aliases 
that are unused during evaluation" + ) diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py index c2cb1399..6242fc0b 100644 --- a/tests/interface/test_display.py +++ b/tests/interface/test_display.py @@ -714,6 +714,59 @@ def test_detection_valid_rendered_in_main_scores_section() -> None: assert "0.75" in result +def test_render_record_html_rewrite_mode_detection_valid_none_shows_unavailable() -> None: + """When evaluate() ran but detection_valid is None, display renders 'Unavailable' not a score.""" + row = pd.Series( + { + "text": "Alice works at Acme", + "text_rewritten": "Beth works at Globex", + COL_DETECTED_ENTITIES: {"entities": []}, + "utility_score": 0.9, + "leakage_mass": 0.1, + "needs_human_review": False, + COL_DETECTION_VALID: None, + } + ) + result = render_record_html(row, record_index=0) + assert "Detection Validity" in result + assert "Unavailable" in result + assert "0." not in result.split("Detection Validity")[1].split("
")[0] + + +def test_render_record_html_rewrite_mode_detection_valid_nan_shows_unavailable() -> None: + """NaN in COL_DETECTION_VALID (pandas missing-value sentinel) renders 'Unavailable'.""" + row = pd.Series( + { + "text": "Alice works at Acme", + "text_rewritten": "Beth works at Globex", + COL_DETECTED_ENTITIES: {"entities": []}, + "utility_score": 0.9, + "leakage_mass": 0.1, + "needs_human_review": False, + COL_DETECTION_VALID: np.nan, + } + ) + result = render_record_html(row, record_index=0) + assert "Detection Validity" in result + assert "Unavailable" in result + + +def test_render_record_html_rewrite_mode_no_detection_valid_column_omits_section() -> None: + """When COL_DETECTION_VALID is absent (evaluate() never called), the row is omitted entirely.""" + row = pd.Series( + { + "text": "Alice works at Acme", + "text_rewritten": "Beth works at Globex", + COL_DETECTED_ENTITIES: {"entities": []}, + "utility_score": 0.9, + "leakage_mass": 0.1, + "needs_human_review": False, + } + ) + result = render_record_html(row, record_index=0) + assert "Detection Validity" not in result + + def test_rewrite_needs_human_review_label_is_rewrite_need_review() -> None: row = pd.Series( { From 63d84506361a1a37b74f0c399b50603cf571524e Mon Sep 17 00:00:00 2001 From: memadi Date: Fri, 12 Jun 2026 14:06:14 -0700 Subject: [PATCH 15/15] update docs according to the changes Signed-off-by: memadi --- docs/concepts/evaluation.md | 20 +++++++++++++++++++- tests/engine/test_rewrite_workflow.py | 2 +- tests/interface/test_anonymizer_interface.py | 6 ++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/docs/concepts/evaluation.md b/docs/concepts/evaluation.md index a2adb301..1c2e9ec1 100644 --- a/docs/concepts/evaluation.md +++ b/docs/concepts/evaluation.md @@ -68,6 +68,15 @@ This judge runs regardless of which replace mode was used. It looks at each dete | `detection_valid` | `bool \| None` | `True` if all detections pass; `None` if the judge was unavailable. 
| | `detection_invalid_entities` | `list` | Each flagged detection with value, label, and one-sentence reasoning. | +**Special values:** + +| Scenario | `detection_valid` | Display | Log | +|---|---|---|---| +| No entities detected in this record | `True` | Satisfied | `INFO`: "N passthrough row(s) have no detected entities — detection_valid set to True (trivially valid)" | +| Judge ran and all detections passed | `True` | Satisfied | — | +| Judge ran and flagged one or more detections | `False` | Not Satisfied / Partially Satisfied | — | +| Judge call failed or returned a malformed response | `None` | Unavailable | — | + --- ### Entity Replacement Judges @@ -204,9 +213,18 @@ Same judge as in replace mode — see [Entity Detection Judge](#entity-detection | Output column | Type | Description | |---|---|---| -| `detection_valid` | `float \| None` | 1.0 if all detections pass; fraction of valid entities otherwise; `None` if the judge was unavailable. | +| `detection_valid` | `float \| None` | 1.0 if all detections pass; fraction of valid entities otherwise; `None` if the score is unavailable. | | `detection_invalid_entities` | `list` | Each flagged detection with value, label, and one-sentence reasoning. 
| +**Special values:** + +| Scenario | `detection_valid` | Display | Log | +|---|---|---|---| +| No entities detected in this record | `1.0` | 1.00 | `INFO`: "N passthrough row(s) have no detected entities — detection_valid set to 1.0 (trivially valid)" | +| Judge ran and all detections passed | `1.0` | 1.00 | — | +| Judge ran and flagged one or more detections | 0–1 fraction | numeric score | — | +| Judge call failed or entity data unreadable | `None` | Unavailable | `WARNING`: "Could not parse entities_by_value to compute detection_valid fraction" | + --- ### Rewrite Quality Judges diff --git a/tests/engine/test_rewrite_workflow.py b/tests/engine/test_rewrite_workflow.py index c47af807..8b77db8d 100644 --- a/tests/engine/test_rewrite_workflow.py +++ b/tests/engine/test_rewrite_workflow.py @@ -13,6 +13,7 @@ from anonymizer.config.rewrite import EvaluationCriteria, PrivacyGoal from anonymizer.engine.constants import ( COL_ANY_HIGH_LEAKED, + COL_DETECTION_INVALID_ENTITIES, COL_DETECTION_VALID, COL_DOMAIN, COL_ENTITIES_BY_VALUE, @@ -27,7 +28,6 @@ COL_UTILITY_SCORE, COL_WEIGHTED_LEAKAGE_RATE, ) -from anonymizer.engine.constants import COL_DETECTION_INVALID_ENTITIES from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, WorkflowRunResult from anonymizer.engine.rewrite.rewrite_workflow import RewriteWorkflow, _detection_valid_fraction diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py index 8f6fb85b..b4fcf961 100644 --- a/tests/interface/test_anonymizer_interface.py +++ b/tests/interface/test_anonymizer_interface.py @@ -27,9 +27,9 @@ ) from anonymizer.engine.detection.detection_workflow import EntityDetectionResult, EntityDetectionWorkflow from anonymizer.engine.ndd.adapter import FailedRecord +from anonymizer.engine.ndd.model_loader import validate_model_alias_references from anonymizer.engine.replace.replace_runner import ReplacementResult, ReplacementWorkflow from 
anonymizer.engine.rewrite.rewrite_workflow import RewriteResult, RewriteWorkflow -from anonymizer.engine.ndd.model_loader import validate_model_alias_references from anonymizer.interface.anonymizer import Anonymizer, _resolve_model_providers from anonymizer.interface.errors import InvalidConfigError, InvalidInputError @@ -828,9 +828,7 @@ def test_evaluate_rewrite_calls_validate_with_check_rewrite_false(stub_input: An ) as mock_validate: anonymizer.evaluate(run_result) - rewrite_eval_calls = [ - call for call in mock_validate.call_args_list if call.kwargs.get("check_evaluate") is True - ] + rewrite_eval_calls = [call for call in mock_validate.call_args_list if call.kwargs.get("check_evaluate") is True] assert rewrite_eval_calls, "validate_model_alias_references was not called with check_evaluate=True" for call in rewrite_eval_calls: assert call.kwargs.get("check_rewrite") is False, (