diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 6e52e3f8..87fabaf9 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -15,19 +15,18 @@ multi-ecosystem-groups: python: schedule: interval: "weekly" + open-pull-requests-limit: 5 updates: - package-ecosystem: "pip" directory: "/" multi-ecosystem-group: python patterns: ["*"] - open-pull-requests-limit: 5 - package-ecosystem: "uv" directory: "/" multi-ecosystem-group: python patterns: ["*"] - open-pull-requests-limit: 5 - package-ecosystem: "github-actions" directory: "/" diff --git a/skills/anonymizer/SKILL.md b/skills/anonymizer/SKILL.md index ba40fe51..acc524fa 100644 --- a/skills/anonymizer/SKILL.md +++ b/skills/anonymizer/SKILL.md @@ -16,7 +16,7 @@ Anonymize a text dataset using NeMo Anonymizer in the way the user describes: $ARGUMENTS -The output is a single runnable Python script that builds an `AnonymizerConfig`, previews results on a few rows, inspects failures and quality metrics, and (on user approval) runs the full pipeline. The script is the durable artifact — the user keeps it for re-runs, version control, and production. +The output is a single runnable Python script that builds an `AnonymizerConfig`, previews results on a few rows, inspects failures and quality metrics, optionally scores Replace output with LLM-as-judge evaluation, and (on user approval) runs the full pipeline. The script is the durable artifact — the user keeps it for re-runs, version control, and production. # Workflow @@ -30,6 +30,7 @@ Read `workflows/interactive.md` and follow it. Anonymization is high-stakes — - **For cross-record consistency** (same value → same replacement everywhere), use `Hash`, not `Substitute`. `Substitute` is consistent within a row only. - **In Replace mode, default to `Substitute`** if the user hasn't specified a strategy. It's the most general-purpose choice and matches the bulk of production usage. - **`Annotate` is for inspection, not production.** Its output keeps the original entity text and is not privacy-safe. Use it during iteration to confirm detection is working, then switch. +- **Evaluation is opt-in and runs as a separate step** (Replace mode). After `preview()` / `run()`, call `anonymizer.evaluate(result)` to score the output with LLM-as-judge. `Substitute` gets four judges (detection validity, type fidelity, relational consistency, attribute fidelity); `Redact` / `Annotate` / `Hash` get the detection-validity judge only. Evaluation is diagnostic — it scores quality, it does not change the anonymized output. - **Always set `AnonymizerInput.data_summary`**, even briefly. It is the single cheapest quality lever and it improves both detection and rewrite. - **Never claim privacy guarantees.** Anonymizer is best-effort. Outputs may need human review depending on `risk_tolerance`. Tell the user this when you finalize. @@ -41,6 +42,9 @@ Read `workflows/interactive.md` and follow it. Anonymization is high-stakes — - **`risk_tolerance` only applies to Rewrite mode**, not Replace. - **`PrivacyGoal.protect` and `.preserve` must each be 10–1000 chars and at least 3 words.** Be specific (categories, named identifiers, structural facets); avoid generic phrasing like "preserve meaning". - **Validator pool is the only model role with built-in load-spreading.** Set `entity_validator: [a, b, c]` in `models.yaml` if rate limits drop rows. Other roles (rewriter, evaluator, etc.) are single-alias. +- **The evaluation judges use their own model roles** (`detection_validity_judge`, `replace_type_fidelity_judge`, `replace_relational_consistency_judge`, `replace_attribute_fidelity_judge`), configured in the `evaluate` section of `models.yaml`. They are **not** consumed by `preview()` / `run()`, so a config that anonymizes fine can still fail validation at `evaluate()` if those roles are unset. Defaults ship in `src/anonymizer/config/default_model_configs/evaluate.yaml`. +- **`*_valid` verdict columns are `True` / `False` / `None`.** `None` means the judge was unavailable (model/infra failure), **not** that the row passed — treat it as "unscored", never as a pass. Inspect verdicts per record with `evaluated.display_record(i)`. +- **`EvaluateConfig` is an empty placeholder today** — no knobs to set. `anonymizer.evaluate(result)` is the whole API; pass nothing else. # Reference Docs @@ -69,8 +73,10 @@ Write a Python script to the current directory. Name it after the dataset (e.g. Generated by the anonymizer agent skill. Usage: - python .py # preview on 5 rows (fast, cheap) - python .py --full # run on the full dataset + python .py # preview on 5 rows (fast, cheap) + python .py --full # run on the full dataset + python .py --evaluate # preview 5 rows, then LLM-judge-score those rows + python .py --full --evaluate # run full dataset, then score the full output """ from __future__ import annotations @@ -134,6 +140,11 @@ def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--full", action="store_true", help="Run on full dataset (default: preview 5 rows)") parser.add_argument("--num-records", type=int, default=5, help="Rows to preview (ignored with --full)") + parser.add_argument( + "--evaluate", + action="store_true", + help="LLM-judge-score the output produced this run (preview rows, or full output with --full)", + ) args = parser.parse_args() anonymizer = Anonymizer() @@ -164,6 +175,28 @@ def main() -> None: print("\nFix dropped rows before tweaking strategy. See docs/troubleshooting.md.") sys.exit(1) + # Optional LLM-as-judge evaluation (Replace mode). Opt-in, separate step: + # scores how well detection + replacement worked without changing the + # output. Substitute -> 4 judges (detection validity, type fidelity, + # relational consistency, attribute fidelity); Redact/Annotate/Hash -> + # detection-validity judge only. Needs the `evaluate` model roles in + # models.yaml (see src/anonymizer/config/default_model_configs/evaluate.yaml). + if args.evaluate and config.replace is not None: + result = anonymizer.evaluate(result) + df = result.dataframe + for col in ( + "detection_valid", + "type_fidelity_valid", + "relational_consistency_valid", + "attribute_fidelity_valid", + ): + if col in df.columns: + passed = int(df[col].eq(True).sum()) # None = unscored, never a pass + scored = int(df[col].notna().sum()) + print(f"{col}: {passed}/{scored} passed ({len(df) - scored} unscored)") + # In a notebook, inspect per-record verdicts visually: + # result.display_record(0) + # Rewrite-mode quality summary (skip for Replace mode). if config.rewrite is not None: df = result.dataframe diff --git a/src/anonymizer/engine/evaluation/detection_judge.py b/src/anonymizer/engine/evaluation/detection_judge.py index 5d0349fe..93bc029a 100644 --- a/src/anonymizer/engine/evaluation/detection_judge.py +++ b/src/anonymizer/engine/evaluation/detection_judge.py @@ -5,14 +5,11 @@ import json import logging -from dataclasses import dataclass +from typing import ClassVar import pandas as pd -from data_designer.config.column_configs import LLMStructuredColumnConfig -from data_designer.config.models import ModelConfig from pydantic import BaseModel, Field -from anonymizer.config.models import EvaluateModelSelection from anonymizer.engine.constants import ( COL_DETECTION_INVALID_ENTITIES, COL_DETECTION_JUDGE, @@ -22,10 +19,8 @@ ENTITY_LABEL_EXAMPLES, _jinja, ) -from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.evaluation.judge_base import _BaseJudgeWorkflow from anonymizer.engine.prompt_utils import substitute_placeholders -from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows from anonymizer.engine.schemas import EntitiesByValueSchema logger = logging.getLogger("anonymizer.evaluation.detection_judge") @@ -57,17 +52,6 @@ class DetectionJudgmentSchema(BaseModel): ) -# --------------------------------------------------------------------------- -# Result -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class DetectionJudgeResult: - dataframe: pd.DataFrame - failed_records: list[FailedRecord] - - # --------------------------------------------------------------------------- # Prompt # --------------------------------------------------------------------------- @@ -192,36 +176,12 @@ def _label_examples_for_judge(parsed: EntitiesByValueSchema) -> str: return json.dumps(examples, ensure_ascii=True) -def _flatten_judgment(raw: object) -> tuple[bool | None, list[dict[str, str]]]: - """Normalize an LLM judge output into (all_valid, invalid_entities). - - Returns ``(None, [])`` for any malformed or missing payload so downstream - display can render "judge unavailable" rather than fabricate a verdict. - """ - if raw is None: - return None, [] - if hasattr(raw, "model_dump"): - raw = raw.model_dump(mode="python") - if isinstance(raw, str): - try: - raw = json.loads(raw) - except (json.JSONDecodeError, ValueError): - return None, [] - if not isinstance(raw, dict): - return None, [] - try: - parsed = DetectionJudgmentSchema.model_validate(raw) - except Exception: - return None, [] - return parsed.all_valid, [entry.model_dump() for entry in parsed.invalid_entities] - - # --------------------------------------------------------------------------- # Workflow # --------------------------------------------------------------------------- -class DetectionJudgeWorkflow: +class DetectionJudgeWorkflow(_BaseJudgeWorkflow): """LLM-as-judge evaluator that flags invalid PII detections per record. Runs after replacement and validates the detection step that fed the @@ -229,113 +189,31 @@ class DetectionJudgeWorkflow: ``COL_DETECTION_INVALID_ENTITIES`` (list of {value, label, reasoning}). """ - def __init__(self, adapter: NddAdapter) -> None: - self._adapter = adapter - - # ------------------------------------------------------------------------ - # Decomposed pieces — the orchestrator in ReplacementWorkflow uses these - # to merge all 4 judges into a single adapter.run_workflow() call. - # ------------------------------------------------------------------------ - - def prepare( - self, - dataframe: pd.DataFrame, - *, - entities_column: str = COL_ENTITIES_BY_VALUE, - ) -> pd.DataFrame: - """Add the intermediate columns this judge's prompt template references. - - Returns a copy of ``dataframe`` with ``_entities_for_detection_judge`` and - ``_entity_examples_for_detection_judge`` populated. - """ + RAW_COL: ClassVar[str] = COL_DETECTION_JUDGE + VALID_COL: ClassVar[str] = COL_DETECTION_VALID + INVALID_COL: ClassVar[str] = COL_DETECTION_INVALID_ENTITIES + SCHEMA: ClassVar[type[BaseModel]] = DetectionJudgmentSchema + VERDICT_FIELD: ClassVar[str] = "all_valid" + DEFAULT_PAYLOAD: ClassVar[dict] = {"all_valid": True, "invalid_entities": []} + MODEL_ROLE: ClassVar[str] = "detection_validity_judge" + WORKFLOW_NAME: ClassVar[str] = "replace-detection-judge" + + def prepare(self, dataframe: pd.DataFrame) -> pd.DataFrame: working_df = dataframe.copy() - parsed = working_df[entities_column].apply(EntitiesByValueSchema.from_raw) + parsed = working_df[COL_ENTITIES_BY_VALUE].apply(EntitiesByValueSchema.from_raw) working_df[_ENTITIES_FOR_JUDGE_COL] = parsed.apply(_entities_for_judge) working_df[_ENTITY_EXAMPLES_FOR_JUDGE_COL] = parsed.apply(_label_examples_for_judge) return working_df - def column_config(self, selected_models: EvaluateModelSelection) -> LLMStructuredColumnConfig: - """The DD column config — name, prompt, model alias, structured-output schema.""" - return LLMStructuredColumnConfig( - name=COL_DETECTION_JUDGE, - prompt=_judge_prompt(), - model_alias=resolve_model_alias("detection_validity_judge", selected_models), - output_format=DetectionJudgmentSchema, - ) - - def postprocess(self, dataframe: pd.DataFrame) -> pd.DataFrame: - """Flatten the raw judge output into VALID / INVALID columns and apply - the passthrough default (rows with no detected entities trivially pass). - """ - out = dataframe.copy() - flattened = out[COL_DETECTION_JUDGE].apply(_flatten_judgment) if COL_DETECTION_JUDGE in out.columns else None + def _passthrough_mask(self, dataframe: pd.DataFrame) -> pd.Series: # `items` may be a numpy array after a parquet round-trip via DD, so use # `len()` rather than `bool()` (which is ambiguous on multi-element arrays). - passthrough_mask = out[_ENTITIES_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) - - valid: list[bool | None] = [] - invalid: list[list[dict[str, str]]] = [] - for idx in out.index: - if passthrough_mask.loc[idx]: - valid.append(True) - invalid.append([]) - elif flattened is not None: - v, inv = flattened.loc[idx] - valid.append(v) - invalid.append(inv) - else: - valid.append(None) - invalid.append([]) - out[COL_DETECTION_VALID] = valid - out[COL_DETECTION_INVALID_ENTITIES] = invalid - # Stamp passthrough rows with the default raw judge payload so display logic stays consistent. - if COL_DETECTION_JUDGE in out.columns: - out.loc[passthrough_mask, COL_DETECTION_JUDGE] = [{"all_valid": True, "invalid_entities": []}] * int( - passthrough_mask.sum() - ) - return out - - # ------------------------------------------------------------------------ - # Legacy single-judge entry point. Kept so existing callers/tests still work. - # ------------------------------------------------------------------------ - - def evaluate( - self, - dataframe: pd.DataFrame, - *, - model_configs: list[ModelConfig], - selected_models: EvaluateModelSelection, - entities_column: str = COL_ENTITIES_BY_VALUE, - preview_num_records: int | None = None, - ) -> DetectionJudgeResult: - working_df = self.prepare(dataframe, entities_column=entities_column) - - entity_rows, passthrough_rows = split_rows(working_df, column=_ENTITIES_FOR_JUDGE_COL, predicate=bool) - passthrough_rows[COL_DETECTION_JUDGE] = [ - {"all_valid": True, "invalid_entities": []} for _ in range(len(passthrough_rows)) - ] - passthrough_rows[COL_DETECTION_VALID] = True - passthrough_rows[COL_DETECTION_INVALID_ENTITIES] = [[] for _ in range(len(passthrough_rows))] - - if entity_rows.empty: - combined = merge_and_reorder(passthrough_rows) - return DetectionJudgeResult(dataframe=combined, failed_records=[]) - - effective_preview_num_records = ( - min(preview_num_records, len(entity_rows)) if preview_num_records is not None else None - ) - run_result = self._adapter.run_workflow( - entity_rows, - model_configs=model_configs, - columns=[self.column_config(selected_models)], - workflow_name="replace-detection-judge", - preview_num_records=effective_preview_num_records, - ) - - judged_df = run_result.dataframe.copy() - flattened = judged_df[COL_DETECTION_JUDGE].apply(_flatten_judgment) - judged_df[COL_DETECTION_VALID] = flattened.apply(lambda pair: pair[0]) - judged_df[COL_DETECTION_INVALID_ENTITIES] = flattened.apply(lambda pair: pair[1]) - - combined = merge_and_reorder(judged_df, passthrough_rows) - return DetectionJudgeResult(dataframe=combined, failed_records=run_result.failed_records) + return dataframe[_ENTITIES_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) + + @classmethod + def _build_prompt(cls) -> str: + return _judge_prompt() + + @classmethod + def _extract_invalid(cls, parsed: BaseModel) -> list[dict[str, object]]: + return [entry.model_dump() for entry in parsed.invalid_entities] diff --git a/src/anonymizer/engine/evaluation/judge_base.py b/src/anonymizer/engine/evaluation/judge_base.py new file mode 100644 index 00000000..8fa107cd --- /dev/null +++ b/src/anonymizer/engine/evaluation/judge_base.py @@ -0,0 +1,196 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Base class for LLM-as-judge workflows. + +Each judge follows the same shape: ``prepare`` adds intermediate columns the +prompt references, the LLM runs as one DataDesigner column, then +``postprocess`` flattens the raw payload into a boolean ``*_valid`` column +plus a list of invalid entries. The standalone ``evaluate`` entry point wraps +that pipeline for callers that want to run a single judge in isolation. + +Subclasses declare only what's actually unique to a judge — column names, +schema, model role, prompt builder, and the per-row passthrough rule. +""" + +from __future__ import annotations + +import json +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import ClassVar + +import pandas as pd +from data_designer.config.column_configs import LLMStructuredColumnConfig +from data_designer.config.models import ModelConfig +from pydantic import BaseModel + +from anonymizer.config.models import EvaluateModelSelection +from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter +from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.row_partitioning import ROW_ORDER_COL, merge_and_reorder + + +@dataclass(frozen=True) +class JudgeResult: + """Result of a standalone single-judge ``evaluate()`` call.""" + + dataframe: pd.DataFrame + failed_records: list[FailedRecord] + + +class _BaseJudgeWorkflow(ABC): + """Common scaffolding for the four LLM-as-judge workflows.""" + + # Column names this judge reads and writes. + RAW_COL: ClassVar[str] + VALID_COL: ClassVar[str] + INVALID_COL: ClassVar[str] + + # Structured-output schema and the verdict field name on it. + SCHEMA: ClassVar[type[BaseModel]] + VERDICT_FIELD: ClassVar[str] + + # Payload used to stamp passthrough rows so display logic stays uniform. + DEFAULT_PAYLOAD: ClassVar[dict] + + # Model alias role consulted on EvaluateModelSelection. + MODEL_ROLE: ClassVar[str] + + # Logical workflow name surfaced in logs and FailedRecord entries. + WORKFLOW_NAME: ClassVar[str] + + def __init__(self, adapter: NddAdapter) -> None: + self._adapter = adapter + + # ------------------------------------------------------------------ hooks + + @abstractmethod + def prepare(self, dataframe: pd.DataFrame) -> pd.DataFrame: + """Return a copy of ``dataframe`` with the intermediate columns this + judge's prompt template references.""" + + @abstractmethod + def _passthrough_mask(self, dataframe: pd.DataFrame) -> pd.Series: + """Boolean Series — True for rows that trivially pass (no checkable content).""" + + @classmethod + @abstractmethod + def _build_prompt(cls) -> str: + """Return the column prompt. Called per ``column_config()`` so dynamic + values (e.g. current year) are resolved at evaluate time.""" + + @classmethod + @abstractmethod + def _extract_invalid(cls, parsed: BaseModel) -> list[dict[str, object]]: + """Extract the invalid-entries list from a parsed schema instance.""" + + # ----------------------------------------------------------------- shared + + def column_config(self, selected_models: EvaluateModelSelection) -> LLMStructuredColumnConfig: + return LLMStructuredColumnConfig( + name=self.RAW_COL, + prompt=self._build_prompt(), + model_alias=resolve_model_alias(self.MODEL_ROLE, selected_models), + output_format=self.SCHEMA, + ) + + @classmethod + def _flatten_judgment(cls, raw: object) -> tuple[bool | None, list[dict[str, object]]]: + """Normalize an LLM judge output into ``(verdict, invalid_entries)``. + + Returns ``(None, [])`` for any malformed or missing payload so downstream + display renders "judge unavailable" rather than fabricating a verdict. + """ + if raw is None: + return None, [] + if hasattr(raw, "model_dump"): + raw = raw.model_dump(mode="python") + if isinstance(raw, str): + try: + raw = json.loads(raw) + except (json.JSONDecodeError, ValueError): + return None, [] + if not isinstance(raw, dict): + return None, [] + try: + parsed = cls.SCHEMA.model_validate(raw) + except Exception: + return None, [] + return getattr(parsed, cls.VERDICT_FIELD), cls._extract_invalid(parsed) + + def postprocess(self, dataframe: pd.DataFrame) -> pd.DataFrame: + """Flatten the raw judge output into VALID / INVALID columns and apply + the passthrough default (rows with no checkable content trivially pass). + """ + out = dataframe.copy() + flattened = out[self.RAW_COL].apply(self._flatten_judgment) if self.RAW_COL in out.columns else None + passthrough_mask = self._passthrough_mask(out) + + valid: list[bool | None] = [] + invalid: list[list[dict[str, object]]] = [] + for idx in out.index: + if passthrough_mask.loc[idx]: + valid.append(True) + invalid.append([]) + elif flattened is not None: + v, inv = flattened.loc[idx] + valid.append(v) + invalid.append(inv) + else: + valid.append(None) + invalid.append([]) + out[self.VALID_COL] = valid + out[self.INVALID_COL] = invalid + if self.RAW_COL in out.columns: + out.loc[passthrough_mask, self.RAW_COL] = [self.DEFAULT_PAYLOAD] * int(passthrough_mask.sum()) + return out + + def evaluate( + self, + dataframe: pd.DataFrame, + *, + model_configs: list[ModelConfig], + selected_models: EvaluateModelSelection, + preview_num_records: int | None = None, + ) -> JudgeResult: + """Standalone single-judge entry point. The orchestrator in + ``ReplacementWorkflow`` does not go through this; tests and callers + that want to run one judge in isolation do. + """ + working_df = self.prepare(dataframe) + # `prepare()` returns a fresh copy (per its contract), so we can stamp + # the row-order column directly. ROW_ORDER_COL lets merge_and_reorder + # restore input order after the passthrough and LLM-judged partitions + # are processed independently. + working_df[ROW_ORDER_COL] = range(len(working_df)) + passthrough_mask = self._passthrough_mask(working_df) + passthrough_rows = working_df[passthrough_mask].copy() + with_content = working_df[~passthrough_mask].copy() + + passthrough_rows[self.RAW_COL] = [self.DEFAULT_PAYLOAD for _ in range(len(passthrough_rows))] + passthrough_rows[self.VALID_COL] = True + passthrough_rows[self.INVALID_COL] = [[] for _ in range(len(passthrough_rows))] + + if with_content.empty: + combined = merge_and_reorder(passthrough_rows) + return JudgeResult(dataframe=combined, failed_records=[]) + + effective_preview_num_records = ( + min(preview_num_records, len(with_content)) if preview_num_records is not None else None + ) + run_result = self._adapter.run_workflow( + with_content, + model_configs=model_configs, + columns=[self.column_config(selected_models)], + workflow_name=self.WORKFLOW_NAME, + preview_num_records=effective_preview_num_records, + ) + + judged_df = run_result.dataframe.copy() + flattened = judged_df[self.RAW_COL].apply(self._flatten_judgment) + judged_df[self.VALID_COL] = flattened.apply(lambda pair: pair[0]) + judged_df[self.INVALID_COL] = flattened.apply(lambda pair: pair[1]) + + combined = merge_and_reorder(judged_df, passthrough_rows) + return JudgeResult(dataframe=combined, failed_records=run_result.failed_records) diff --git a/src/anonymizer/engine/evaluation/replace/attribute_fidelity_judge.py b/src/anonymizer/engine/evaluation/replace/attribute_fidelity_judge.py index b9c72079..3dd67692 100644 --- a/src/anonymizer/engine/evaluation/replace/attribute_fidelity_judge.py +++ b/src/anonymizer/engine/evaluation/replace/attribute_fidelity_judge.py @@ -5,25 +5,20 @@ import json import logging -from dataclasses import dataclass from datetime import datetime +from typing import ClassVar import pandas as pd -from data_designer.config.column_configs import LLMStructuredColumnConfig -from data_designer.config.models import ModelConfig from pydantic import BaseModel, Field -from anonymizer.config.models import EvaluateModelSelection from anonymizer.engine.constants import ( COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES, COL_ATTRIBUTE_FIDELITY_JUDGE, COL_ATTRIBUTE_FIDELITY_VALID, COL_REPLACEMENT_MAP, ) -from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.evaluation.judge_base import _BaseJudgeWorkflow from anonymizer.engine.prompt_utils import substitute_placeholders -from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows from anonymizer.engine.schemas import EntityReplacementMapSchema logger = logging.getLogger("anonymizer.evaluation.replace.attribute_fidelity_judge") @@ -68,17 +63,6 @@ class AttributeFidelityJudgmentSchema(BaseModel): ) -# --------------------------------------------------------------------------- -# Result -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class AttributeFidelityJudgeResult: - dataframe: pd.DataFrame - failed_records: list[FailedRecord] - - # --------------------------------------------------------------------------- # Prompt # --------------------------------------------------------------------------- @@ -216,37 +200,12 @@ def _replacements_for_judge(raw_map: object) -> list[dict[str, str]]: return [{"original": r.original, "label": r.label, "synthetic": r.synthetic} for r in parsed.replacements] -def _flatten_judgment(raw: object) -> tuple[bool | None, list[dict[str, object]]]: - """Normalize an LLM judge output into (all_valid, invalid_entities). - - Returns ``(None, [])`` for any malformed or missing payload so downstream - display renders "judge unavailable" rather than fabricating a verdict. - """ - if raw is None: - return None, [] - if hasattr(raw, "model_dump"): - raw = raw.model_dump(mode="python") - if isinstance(raw, str): - try: - raw = json.loads(raw) - except (json.JSONDecodeError, ValueError): - return None, [] - if not isinstance(raw, dict): - return None, [] - try: - parsed = AttributeFidelityJudgmentSchema.model_validate(raw) - except Exception: - return None, [] - invalid = [e.model_dump() for e in parsed.entities if not e.passes] - return parsed.all_valid, invalid - - # --------------------------------------------------------------------------- # Workflow # --------------------------------------------------------------------------- -class AttributeFidelityJudgeWorkflow: +class AttributeFidelityJudgeWorkflow(_BaseJudgeWorkflow): """LLM-as-judge evaluator that checks per-entity attribute preservation. Runs after Substitute generates the replacement map. Output columns: @@ -256,88 +215,27 @@ class AttributeFidelityJudgeWorkflow: can derive the success-rate denominator from the full entities list). """ - def __init__(self, adapter: NddAdapter) -> None: - self._adapter = adapter + RAW_COL: ClassVar[str] = COL_ATTRIBUTE_FIDELITY_JUDGE + VALID_COL: ClassVar[str] = COL_ATTRIBUTE_FIDELITY_VALID + INVALID_COL: ClassVar[str] = COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES + SCHEMA: ClassVar[type[BaseModel]] = AttributeFidelityJudgmentSchema + VERDICT_FIELD: ClassVar[str] = "all_valid" + DEFAULT_PAYLOAD: ClassVar[dict] = {"all_valid": True, "entities": []} + MODEL_ROLE: ClassVar[str] = "replace_attribute_fidelity_judge" + WORKFLOW_NAME: ClassVar[str] = "replace-attribute-fidelity-judge" def prepare(self, dataframe: pd.DataFrame) -> pd.DataFrame: working_df = dataframe.copy() working_df[_REPLACEMENTS_FOR_JUDGE_COL] = working_df[COL_REPLACEMENT_MAP].apply(_replacements_for_judge) return working_df - def column_config(self, selected_models: EvaluateModelSelection) -> LLMStructuredColumnConfig: - return LLMStructuredColumnConfig( - name=COL_ATTRIBUTE_FIDELITY_JUDGE, - prompt=_judge_prompt(), - model_alias=resolve_model_alias("replace_attribute_fidelity_judge", selected_models), - output_format=AttributeFidelityJudgmentSchema, - ) - - def postprocess(self, dataframe: pd.DataFrame) -> pd.DataFrame: - out = dataframe.copy() - flattened = ( - out[COL_ATTRIBUTE_FIDELITY_JUDGE].apply(_flatten_judgment) - if COL_ATTRIBUTE_FIDELITY_JUDGE in out.columns - else None - ) - passthrough_mask = out[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) - - valid: list[bool | None] = [] - invalid: list[list[dict[str, str]]] = [] - for idx in out.index: - if passthrough_mask.loc[idx]: - valid.append(True) - invalid.append([]) - elif flattened is not None: - v, inv = flattened.loc[idx] - valid.append(v) - invalid.append(inv) - else: - valid.append(None) - invalid.append([]) - out[COL_ATTRIBUTE_FIDELITY_VALID] = valid - out[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES] = invalid - if COL_ATTRIBUTE_FIDELITY_JUDGE in out.columns: - out.loc[passthrough_mask, COL_ATTRIBUTE_FIDELITY_JUDGE] = [{"all_valid": True, "entities": []}] * int( - passthrough_mask.sum() - ) - return out - - def evaluate( - self, - dataframe: pd.DataFrame, - *, - model_configs: list[ModelConfig], - selected_models: EvaluateModelSelection, - preview_num_records: int | None = None, - ) -> AttributeFidelityJudgeResult: - working_df = self.prepare(dataframe) - - with_replacements, passthrough_rows = split_rows(working_df, column=_REPLACEMENTS_FOR_JUDGE_COL, predicate=bool) - passthrough_rows[COL_ATTRIBUTE_FIDELITY_JUDGE] = [ - {"all_valid": True, "entities": []} for _ in range(len(passthrough_rows)) - ] - passthrough_rows[COL_ATTRIBUTE_FIDELITY_VALID] = True - passthrough_rows[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES] = [[] for _ in range(len(passthrough_rows))] - - if with_replacements.empty: - combined = merge_and_reorder(passthrough_rows) - return AttributeFidelityJudgeResult(dataframe=combined, failed_records=[]) - - effective_preview_num_records = ( - min(preview_num_records, len(with_replacements)) if preview_num_records is not None else None - ) - run_result = self._adapter.run_workflow( - with_replacements, - model_configs=model_configs, - columns=[self.column_config(selected_models)], - workflow_name="replace-attribute-fidelity-judge", - preview_num_records=effective_preview_num_records, - ) + def _passthrough_mask(self, dataframe: pd.DataFrame) -> pd.Series: + return dataframe[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) - judged_df = run_result.dataframe.copy() - flattened = judged_df[COL_ATTRIBUTE_FIDELITY_JUDGE].apply(_flatten_judgment) - judged_df[COL_ATTRIBUTE_FIDELITY_VALID] = flattened.apply(lambda pair: pair[0]) - judged_df[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES] = flattened.apply(lambda pair: pair[1]) + @classmethod + def _build_prompt(cls) -> str: + return _judge_prompt() - combined = merge_and_reorder(judged_df, passthrough_rows) - return AttributeFidelityJudgeResult(dataframe=combined, failed_records=run_result.failed_records) + @classmethod + def _extract_invalid(cls, parsed: BaseModel) -> list[dict[str, object]]: + return [e.model_dump() for e in parsed.entities if not e.passes] diff --git a/src/anonymizer/engine/evaluation/replace/relational_consistency_judge.py b/src/anonymizer/engine/evaluation/replace/relational_consistency_judge.py index 572f3542..ae190d3e 100644 --- a/src/anonymizer/engine/evaluation/replace/relational_consistency_judge.py +++ b/src/anonymizer/engine/evaluation/replace/relational_consistency_judge.py @@ -5,14 +5,11 @@ import json import logging -from dataclasses import dataclass +from typing import ClassVar import pandas as pd -from data_designer.config.column_configs import LLMStructuredColumnConfig -from data_designer.config.models import ModelConfig from pydantic import BaseModel, Field -from anonymizer.config.models import EvaluateModelSelection from anonymizer.engine.constants import ( COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS, COL_RELATIONAL_CONSISTENCY_JUDGE, @@ -21,10 +18,8 @@ COL_REPLACEMENT_MAP, _jinja, ) -from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.evaluation.judge_base import _BaseJudgeWorkflow from anonymizer.engine.prompt_utils import substitute_placeholders -from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows from anonymizer.engine.schemas import EntityReplacementMapSchema logger = logging.getLogger("anonymizer.evaluation.replace.relational_consistency_judge") @@ -69,17 +64,6 @@ class RelationalConsistencyJudgmentSchema(BaseModel): ) -# --------------------------------------------------------------------------- -# Result -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class RelationalConsistencyJudgeResult: - dataframe: pd.DataFrame - failed_records: list[FailedRecord] - - # --------------------------------------------------------------------------- # Prompt # --------------------------------------------------------------------------- @@ -254,11 +238,7 @@ def _judge_prompt() -> str: def _replacements_for_judge(raw_map: object) -> list[dict[str, str]]: - """Flatten COL_REPLACEMENT_MAP into Jinja-friendly dicts. - - Mirrors the type-fidelity helper but kept local so the two judges remain - independent — if one changes its prompt shape, the other is unaffected. - """ + """Flatten COL_REPLACEMENT_MAP into Jinja-friendly dicts.""" if raw_map is None: return [] if hasattr(raw_map, "model_dump"): @@ -277,37 +257,12 @@ def _replacements_for_judge(raw_map: object) -> list[dict[str, str]]: return [{"original": r.original, "label": r.label, "synthetic": r.synthetic} for r in parsed.replacements] -def _flatten_judgment(raw: object) -> tuple[bool | None, list[dict[str, object]]]: - """Normalize an LLM judge output into (all_consistent, invalid_relations). - - Returns ``(None, [])`` for any malformed or missing payload so downstream - display renders "judge unavailable" rather than fabricating a verdict. - """ - if raw is None: - return None, [] - if hasattr(raw, "model_dump"): - raw = raw.model_dump(mode="python") - if isinstance(raw, str): - try: - raw = json.loads(raw) - except (json.JSONDecodeError, ValueError): - return None, [] - if not isinstance(raw, dict): - return None, [] - try: - parsed = RelationalConsistencyJudgmentSchema.model_validate(raw) - except Exception: - return None, [] - invalid = [r.model_dump() for r in parsed.relations if not r.passes] - return parsed.all_consistent, invalid - - # --------------------------------------------------------------------------- # Workflow # --------------------------------------------------------------------------- -class RelationalConsistencyJudgeWorkflow: +class RelationalConsistencyJudgeWorkflow(_BaseJudgeWorkflow): """LLM-as-judge evaluator that checks cross-entity coherence within a record. Runs after Substitute generates the replacement map. Output columns: @@ -317,94 +272,28 @@ class RelationalConsistencyJudgeWorkflow: display can derive the success-rate denominator from the full relations list). """ - def __init__(self, adapter: NddAdapter) -> None: - self._adapter = adapter + RAW_COL: ClassVar[str] = COL_RELATIONAL_CONSISTENCY_JUDGE + VALID_COL: ClassVar[str] = COL_RELATIONAL_CONSISTENCY_VALID + INVALID_COL: ClassVar[str] = COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS + SCHEMA: ClassVar[type[BaseModel]] = RelationalConsistencyJudgmentSchema + VERDICT_FIELD: ClassVar[str] = "all_consistent" + DEFAULT_PAYLOAD: ClassVar[dict] = {"all_consistent": True, "relations": []} + MODEL_ROLE: ClassVar[str] = "replace_relational_consistency_judge" + WORKFLOW_NAME: ClassVar[str] = "replace-relational-consistency-judge" def prepare(self, dataframe: pd.DataFrame) -> pd.DataFrame: working_df = dataframe.copy() working_df[_REPLACEMENTS_FOR_JUDGE_COL] = working_df[COL_REPLACEMENT_MAP].apply(_replacements_for_judge) return working_df - def column_config(self, selected_models: EvaluateModelSelection) -> LLMStructuredColumnConfig: - return LLMStructuredColumnConfig( - name=COL_RELATIONAL_CONSISTENCY_JUDGE, - prompt=_judge_prompt(), - model_alias=resolve_model_alias("replace_relational_consistency_judge", selected_models), - output_format=RelationalConsistencyJudgmentSchema, - ) - - def postprocess(self, dataframe: pd.DataFrame) -> pd.DataFrame: - out = dataframe.copy() - flattened = ( - out[COL_RELATIONAL_CONSISTENCY_JUDGE].apply(_flatten_judgment) - if COL_RELATIONAL_CONSISTENCY_JUDGE in out.columns - else None - ) - # Passthrough: fewer than 2 replacements => no checkable relations. - # `items` may be a numpy array after a parquet round-trip via DD. - passthrough_mask = out[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) < 2) - - valid: list[bool | None] = [] - invalid: list[list[dict[str, str]]] = [] - for idx in out.index: - if passthrough_mask.loc[idx]: - valid.append(True) - invalid.append([]) - elif flattened is not None: - v, inv = flattened.loc[idx] - valid.append(v) - invalid.append(inv) - else: - valid.append(None) - invalid.append([]) - out[COL_RELATIONAL_CONSISTENCY_VALID] = valid - out[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS] = invalid - if COL_RELATIONAL_CONSISTENCY_JUDGE in out.columns: - out.loc[passthrough_mask, COL_RELATIONAL_CONSISTENCY_JUDGE] = [ - {"all_consistent": True, "relations": []} - ] * int(passthrough_mask.sum()) - return out - - def evaluate( - self, - dataframe: pd.DataFrame, - *, - model_configs: list[ModelConfig], - selected_models: EvaluateModelSelection, - preview_num_records: int | None = None, - ) -> RelationalConsistencyJudgeResult: - working_df = self.prepare(dataframe) - - with_relations, passthrough_rows = split_rows( - working_df, - column=_REPLACEMENTS_FOR_JUDGE_COL, - predicate=lambda items: bool(items) and len(items) >= 2, - ) - passthrough_rows[COL_RELATIONAL_CONSISTENCY_JUDGE] = [ - {"all_consistent": True, "relations": []} for _ in range(len(passthrough_rows)) - ] - passthrough_rows[COL_RELATIONAL_CONSISTENCY_VALID] = True - passthrough_rows[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS] = [[] for _ in range(len(passthrough_rows))] - - if with_relations.empty: - combined = merge_and_reorder(passthrough_rows) - return RelationalConsistencyJudgeResult(dataframe=combined, failed_records=[]) - - effective_preview_num_records = ( - min(preview_num_records, len(with_relations)) if preview_num_records is not None else None - ) - run_result = self._adapter.run_workflow( - with_relations, - model_configs=model_configs, - columns=[self.column_config(selected_models)], - workflow_name="replace-relational-consistency-judge", - preview_num_records=effective_preview_num_records, - ) + def _passthrough_mask(self, dataframe: pd.DataFrame) -> pd.Series: + # Fewer than 2 replacements => no checkable relations. + return dataframe[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) < 2) - judged_df = run_result.dataframe.copy() - flattened = judged_df[COL_RELATIONAL_CONSISTENCY_JUDGE].apply(_flatten_judgment) - judged_df[COL_RELATIONAL_CONSISTENCY_VALID] = flattened.apply(lambda pair: pair[0]) - judged_df[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS] = flattened.apply(lambda pair: pair[1]) + @classmethod + def _build_prompt(cls) -> str: + return _judge_prompt() - combined = merge_and_reorder(judged_df, passthrough_rows) - return RelationalConsistencyJudgeResult(dataframe=combined, failed_records=run_result.failed_records) + @classmethod + def _extract_invalid(cls, parsed: BaseModel) -> list[dict[str, object]]: + return [r.model_dump() for r in parsed.relations if not r.passes] diff --git a/src/anonymizer/engine/evaluation/replace/type_fidelity_judge.py b/src/anonymizer/engine/evaluation/replace/type_fidelity_judge.py index dda838b5..8fe2a113 100644 --- a/src/anonymizer/engine/evaluation/replace/type_fidelity_judge.py +++ b/src/anonymizer/engine/evaluation/replace/type_fidelity_judge.py @@ -5,14 +5,11 @@ import json import logging -from dataclasses import dataclass +from typing import ClassVar import pandas as pd -from data_designer.config.column_configs import LLMStructuredColumnConfig -from data_designer.config.models import ModelConfig from pydantic import BaseModel, Field -from anonymizer.config.models import EvaluateModelSelection from anonymizer.engine.constants import ( COL_REPLACEMENT_MAP, COL_TYPE_FIDELITY_INVALID_REPLACEMENTS, @@ -20,10 +17,8 @@ COL_TYPE_FIDELITY_VALID, ENTITY_LABEL_EXAMPLES, ) -from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import resolve_model_alias +from anonymizer.engine.evaluation.judge_base import _BaseJudgeWorkflow from anonymizer.engine.prompt_utils import substitute_placeholders -from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows from anonymizer.engine.schemas import EntityReplacementMapSchema logger = logging.getLogger("anonymizer.evaluation.replace.type_fidelity_judge") @@ -57,17 +52,6 @@ class TypeFidelityJudgmentSchema(BaseModel): ) -# --------------------------------------------------------------------------- -# Result -# --------------------------------------------------------------------------- - - -@dataclass(frozen=True) -class TypeFidelityJudgeResult: - dataframe: pd.DataFrame - failed_records: list[FailedRecord] - - # --------------------------------------------------------------------------- # Prompt # --------------------------------------------------------------------------- @@ -275,36 +259,12 @@ def _label_examples_for_judge(replacements: list[dict[str, str]]) -> str: return json.dumps(examples, ensure_ascii=True) -def _flatten_judgment(raw: object) -> tuple[bool | None, list[dict[str, str]]]: - """Normalize an LLM judge output into (all_valid, invalid_replacements). - - Returns ``(None, [])`` for any malformed or missing payload so downstream - display renders "judge unavailable" rather than fabricating a verdict. - """ - if raw is None: - return None, [] - if hasattr(raw, "model_dump"): - raw = raw.model_dump(mode="python") - if isinstance(raw, str): - try: - raw = json.loads(raw) - except (json.JSONDecodeError, ValueError): - return None, [] - if not isinstance(raw, dict): - return None, [] - try: - parsed = TypeFidelityJudgmentSchema.model_validate(raw) - except Exception: - return None, [] - return parsed.all_valid, [entry.model_dump() for entry in parsed.invalid_replacements] - - # --------------------------------------------------------------------------- # Workflow # --------------------------------------------------------------------------- -class TypeFidelityJudgeWorkflow: +class TypeFidelityJudgeWorkflow(_BaseJudgeWorkflow): """LLM-as-judge evaluator that flags replacements failing type fidelity. Runs after Substitute generates the replacement map. Output columns: @@ -313,89 +273,29 @@ class TypeFidelityJudgeWorkflow: {original, label, synthetic, reasoning}). """ - def __init__(self, adapter: NddAdapter) -> None: - self._adapter = adapter + RAW_COL: ClassVar[str] = COL_TYPE_FIDELITY_JUDGE + VALID_COL: ClassVar[str] = COL_TYPE_FIDELITY_VALID + INVALID_COL: ClassVar[str] = COL_TYPE_FIDELITY_INVALID_REPLACEMENTS + SCHEMA: ClassVar[type[BaseModel]] = TypeFidelityJudgmentSchema + VERDICT_FIELD: ClassVar[str] = "all_valid" + DEFAULT_PAYLOAD: ClassVar[dict] = {"all_valid": True, "invalid_replacements": []} + MODEL_ROLE: ClassVar[str] = "replace_type_fidelity_judge" + WORKFLOW_NAME: ClassVar[str] = "replace-type-fidelity-judge" def prepare(self, dataframe: pd.DataFrame) -> pd.DataFrame: - """Add intermediate columns this judge's prompt template references.""" working_df = dataframe.copy() replacements_per_row = working_df[COL_REPLACEMENT_MAP].apply(_replacements_for_judge) working_df[_REPLACEMENTS_FOR_JUDGE_COL] = replacements_per_row working_df[_EXAMPLES_FOR_JUDGE_COL] = replacements_per_row.apply(_label_examples_for_judge) return working_df - def column_config(self, selected_models: EvaluateModelSelection) -> LLMStructuredColumnConfig: - return LLMStructuredColumnConfig( - name=COL_TYPE_FIDELITY_JUDGE, - prompt=_judge_prompt(), - model_alias=resolve_model_alias("replace_type_fidelity_judge", selected_models), - output_format=TypeFidelityJudgmentSchema, - ) - - def postprocess(self, dataframe: pd.DataFrame) -> pd.DataFrame: - out = dataframe.copy() - flattened = ( - out[COL_TYPE_FIDELITY_JUDGE].apply(_flatten_judgment) if COL_TYPE_FIDELITY_JUDGE in out.columns else None - ) - passthrough_mask = out[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) - - valid: list[bool | None] = [] - invalid: list[list[dict[str, str]]] = [] - for idx in out.index: - if passthrough_mask.loc[idx]: - valid.append(True) - invalid.append([]) - elif flattened is not None: - v, inv = flattened.loc[idx] - valid.append(v) - invalid.append(inv) - else: - valid.append(None) - invalid.append([]) - out[COL_TYPE_FIDELITY_VALID] = valid - out[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS] = invalid - if COL_TYPE_FIDELITY_JUDGE in out.columns: - out.loc[passthrough_mask, COL_TYPE_FIDELITY_JUDGE] = [ - {"all_valid": True, "invalid_replacements": []} - ] * int(passthrough_mask.sum()) - return out - - def evaluate( - self, - dataframe: pd.DataFrame, - *, - model_configs: list[ModelConfig], - selected_models: EvaluateModelSelection, - preview_num_records: int | None = None, - ) -> TypeFidelityJudgeResult: - working_df = self.prepare(dataframe) - - with_replacements, passthrough_rows = split_rows(working_df, column=_REPLACEMENTS_FOR_JUDGE_COL, predicate=bool) - passthrough_rows[COL_TYPE_FIDELITY_JUDGE] = [ - {"all_valid": True, "invalid_replacements": []} for _ in range(len(passthrough_rows)) - ] - passthrough_rows[COL_TYPE_FIDELITY_VALID] = True - passthrough_rows[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS] = [[] for _ in range(len(passthrough_rows))] - - if with_replacements.empty: - combined = merge_and_reorder(passthrough_rows) - return TypeFidelityJudgeResult(dataframe=combined, failed_records=[]) - - effective_preview_num_records = ( - min(preview_num_records, len(with_replacements)) if preview_num_records is not None else None - ) - run_result = self._adapter.run_workflow( - with_replacements, - model_configs=model_configs, - columns=[self.column_config(selected_models)], - workflow_name="replace-type-fidelity-judge", - preview_num_records=effective_preview_num_records, - ) + def _passthrough_mask(self, dataframe: pd.DataFrame) -> pd.Series: + return dataframe[_REPLACEMENTS_FOR_JUDGE_COL].apply(lambda items: items is None or len(items) == 0) - judged_df = run_result.dataframe.copy() - flattened = judged_df[COL_TYPE_FIDELITY_JUDGE].apply(_flatten_judgment) - judged_df[COL_TYPE_FIDELITY_VALID] = flattened.apply(lambda pair: pair[0]) - judged_df[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS] = flattened.apply(lambda pair: pair[1]) + @classmethod + def _build_prompt(cls) -> str: + return _judge_prompt() - combined = merge_and_reorder(judged_df, passthrough_rows) - return TypeFidelityJudgeResult(dataframe=combined, failed_records=run_result.failed_records) + @classmethod + def _extract_invalid(cls, parsed: BaseModel) -> list[dict[str, object]]: + return [entry.model_dump() for entry in parsed.invalid_replacements] diff --git a/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py b/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py index 78902f9d..03c22738 100644 --- a/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py +++ b/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py @@ -16,11 +16,12 @@ from anonymizer.engine.evaluation.replace.attribute_fidelity_judge import ( AttributeFidelityJudgeWorkflow, AttributeFidelityJudgmentSchema, - _flatten_judgment, _judge_prompt, _replacements_for_judge, ) +_flatten_judgment = AttributeFidelityJudgeWorkflow._flatten_judgment + # --------------------------------------------------------------------------- # Tests: _judge_prompt # --------------------------------------------------------------------------- diff --git a/tests/engine/evaluation/replace/test_relational_consistency_judge.py b/tests/engine/evaluation/replace/test_relational_consistency_judge.py index d4208181..44a5a7b0 100644 --- a/tests/engine/evaluation/replace/test_relational_consistency_judge.py +++ b/tests/engine/evaluation/replace/test_relational_consistency_judge.py @@ -17,11 +17,12 @@ from anonymizer.engine.evaluation.replace.relational_consistency_judge import ( RelationalConsistencyJudgeWorkflow, RelationalConsistencyJudgmentSchema, - _flatten_judgment, _judge_prompt, _replacements_for_judge, ) +_flatten_judgment = RelationalConsistencyJudgeWorkflow._flatten_judgment + # --------------------------------------------------------------------------- # Tests: _judge_prompt # --------------------------------------------------------------------------- diff --git a/tests/engine/evaluation/replace/test_type_fidelity_judge.py b/tests/engine/evaluation/replace/test_type_fidelity_judge.py index 45523ae0..c17767cd 100644 --- a/tests/engine/evaluation/replace/test_type_fidelity_judge.py +++ b/tests/engine/evaluation/replace/test_type_fidelity_judge.py @@ -16,12 +16,13 @@ from anonymizer.engine.evaluation.replace.type_fidelity_judge import ( TypeFidelityJudgeWorkflow, TypeFidelityJudgmentSchema, - _flatten_judgment, _judge_prompt, _label_examples_for_judge, _replacements_for_judge, ) +_flatten_judgment = TypeFidelityJudgeWorkflow._flatten_judgment + # --------------------------------------------------------------------------- # Tests: _judge_prompt # --------------------------------------------------------------------------- diff --git a/tests/engine/evaluation/test_detection_judge.py b/tests/engine/evaluation/test_detection_judge.py index 1d92eca2..2b807b1b 100644 --- a/tests/engine/evaluation/test_detection_judge.py +++ b/tests/engine/evaluation/test_detection_judge.py @@ -18,12 +18,13 @@ DetectionJudgeWorkflow, DetectionJudgmentSchema, _entities_for_judge, - _flatten_judgment, _judge_prompt, _label_examples_for_judge, ) from anonymizer.engine.schemas import EntitiesByValueSchema +_flatten_judgment = DetectionJudgeWorkflow._flatten_judgment + # --------------------------------------------------------------------------- # Tests: _judge_prompt # ---------------------------------------------------------------------------