From 14bf2460401c86572789d95816d2550e034d75b4 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Wed, 10 Jun 2026 16:42:15 -0700 Subject: [PATCH 1/3] feat: load bundled model providers by default (#180) Plain Anonymizer() now uses bundled providers.yaml instead of DataDesigner's machine-local defaults, with validation for explicit model provider fields and cross-reference checks between model configs and provider definitions. --- README.md | 2 +- docs/concepts/models.md | 6 +- skills/anonymizer/SKILL.md | 2 +- skills/anonymizer/workflows/interactive.md | 5 +- .../config/default_model_configs/README.md | 1 + .../default_model_configs/providers.yaml | 8 ++ src/anonymizer/engine/ndd/model_loader.py | 65 +++++++++++++++- src/anonymizer/interface/anonymizer.py | 36 +++++---- src/anonymizer/interface/errors.py | 2 +- tests/conftest.py | 6 +- tests/engine/test_model_loader.py | 70 +++++++++++++++-- tests/engine/test_ndd_adapter.py | 2 +- tests/interface/cli/test_cli_model_configs.py | 15 +++- tests/interface/test_anonymizer_interface.py | 77 ++++++++++++++++++- tests/interface/test_anonymizer_telemetry.py | 4 +- 15 files changed, 260 insertions(+), 41 deletions(-) create mode 100644 src/anonymizer/config/default_model_configs/providers.yaml diff --git a/README.md b/README.md index 523af650..a59d1ec7 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Run `anonymizer --help` or `anonymizer --help` for all options. from anonymizer import Anonymizer, AnonymizerConfig, AnonymizerInput, Redact DATA_URL = "https://raw.githubusercontent.com/NVIDIA-NeMo/Anonymizer/refs/heads/main/docs/data/NVIDIA_synthetic_biographies.csv" -# Uses default model providers (build.nvidia.com) via NVIDIA_API_KEY env var +# Uses Anonymizer's bundled model providers (see src/anonymizer/config/default_model_configs/providers.yaml) anonymizer = Anonymizer() config = AnonymizerConfig(replace=Redact()) diff --git a/docs/concepts/models.md b/docs/concepts/models.md index 26ccaed4..1579530e 100644 --- a/docs/concepts/models.md +++ b/docs/concepts/models.md @@ -9,7 +9,9 @@ Anonymizer uses LLMs for entity detection, replacement, and rewriting. Models ar ## Defaults -Set your API key for Anonymizer to use models hosted on [build.nvidia.com](https://build.nvidia.com). +Plain `Anonymizer()` uses Anonymizer's bundled provider and model configs — not DataDesigner's machine-local defaults from `~/.data-designer/model_providers.yaml`. Bundled providers live at [`providers.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/providers.yaml); bundled models at [`models.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/models.yaml). + +Set your API key for Anonymizer to use models hosted on [build.nvidia.com](https://build.nvidia.com): ```bash export NVIDIA_API_KEY="your-nvidia-api-key" @@ -31,7 +33,7 @@ Each pipeline stage has a **role** mapped to one of these aliases. See the full ## Custom providers -Use `model_providers` to define named API endpoints for hosted models such as OpenAI or OpenRouter. +Pass `model_providers` when you need a non-default endpoint — for example OpenAI, OpenRouter, a local GLiNER server, or an internal inference deployment. Plain `Anonymizer()` already uses bundled [build.nvidia.com](https://build.nvidia.com) settings; override only when your models point at a different provider name or URL. Set your API keys first: diff --git a/skills/anonymizer/SKILL.md b/skills/anonymizer/SKILL.md index 6051f63e..34bee9b4 100644 --- a/skills/anonymizer/SKILL.md +++ b/skills/anonymizer/SKILL.md @@ -58,7 +58,7 @@ The agent should consult these as it goes — *do not* try to enumerate field re Environment-level issues only. Quality and pipeline issues are in `docs/troubleshooting.md`. - **`anonymizer` not installed:** Tell the user `nemo-anonymizer` is not in this Python environment (requires Python ≥ 3.11). Ask if they want you to install it (`pip install nemo-anonymizer`) or do it themselves. Do not install without permission. -- **Model aliases not configured:** Anonymizer can't run without `model_configs` and `model_providers` (YAML files or Python objects). Tell the user to set these up — see `docs/concepts/models.md`. If they don't have a config yet, point them at `src/anonymizer/config/default_model_configs/` for the shipped defaults. +- **Model/provider setup:** Plain `Anonymizer()` ships with bundled `models.yaml` and `providers.yaml` (see `src/anonymizer/config/default_model_configs/`). For the default path, confirm `NVIDIA_API_KEY` is set. Pass custom `model_configs` and/or `model_providers` only when targeting non-default endpoints or model pools — see `docs/concepts/models.md`. - **LLM calls failing at preview:** Usually an auth issue (missing or invalid API key), a network problem, or a wrong endpoint URL. See `docs/troubleshooting.md` "Validation passed but `preview` errors at LLM call". - **Local / on-prem GLiNER:** Clone or download `tools/serve_gliner.py` from the Anonymizer repo, start the server, add a provider with `endpoint: http://localhost:8001/v1`, and point `gliner-pii-detector` at `provider: local-gliner` with `skip_health_check: true`. Preflight errors about missing aliases usually mean `model_configs` only listed the detector — include the full default pool. Wrong `endpoint` or a down server surfaces as detection failures at preview — see [`docs/concepts/self-hosting-gliner.md`](../../docs/concepts/self-hosting-gliner.md). diff --git a/skills/anonymizer/workflows/interactive.md b/skills/anonymizer/workflows/interactive.md index 53b62ba5..01cf9820 100644 --- a/skills/anonymizer/workflows/interactive.md +++ b/skills/anonymizer/workflows/interactive.md @@ -7,10 +7,7 @@ Iterative design with the user. Do not disengage from the loop until the user sa 1. **Verify environment** - **Install**: run `python -c "import anonymizer; print(anonymizer.__version__)"`. If the import fails, STOP and follow the Troubleshooting section in `SKILL.md`. - - **Model providers**: before going further, confirm an LLM provider is configured. Anonymizer cannot run without one. Check that: - - An API key is set in the environment (`NVIDIA_API_KEY` for the shipped default, or the equivalent for the user's provider) - - A `providers.yaml` exists (defaults ship at `src/anonymizer/config/default_model_configs/providers.yaml`) - - If either is missing, STOP and walk the user through [`docs/concepts/models.md`](../../../docs/concepts/models.md) setup. Do not proceed to data inspection until the user confirms providers are ready. + - **Model providers**: plain `Anonymizer()` loads bundled providers from `src/anonymizer/config/default_model_configs/providers.yaml`. Before going further, confirm the API key for those defaults is set (`NVIDIA_API_KEY` for build.nvidia.com). Only ask for a custom `providers.yaml` when the user targets a non-default endpoint. If the key is missing, STOP and walk the user through [`docs/concepts/models.md`](../../../docs/concepts/models.md) setup. 2. **Inspect the data** — Read the first few rows of the source file with pandas. You need to know: - Path, format, encoding. diff --git a/src/anonymizer/config/default_model_configs/README.md b/src/anonymizer/config/default_model_configs/README.md index 689417af..4dab31fd 100644 --- a/src/anonymizer/config/default_model_configs/README.md +++ b/src/anonymizer/config/default_model_configs/README.md @@ -5,6 +5,7 @@ This directory contains the default model configurations used by the Anonymizer ## Files - **`models.yaml`** — Defines the pool of available models (alias, provider, inference parameters). Each entry becomes a `ModelConfig` that NeMo Data Designer can route requests to. +- **`providers.yaml`** — Defines named API endpoints (provider name, endpoint, API key env var). Loaded automatically when `Anonymizer(model_providers=None)`. - **`detection.yaml`** — Maps detection workflow roles (e.g. `entity_detector`, `entity_validator`) to model aliases from `models.yaml`. - **`replace.yaml`** — Maps replacement workflow roles (e.g. `replacement_generator`) to model aliases from `models.yaml`. - **`rewrite.yaml`** — Maps rewrite workflow roles (`domain_classifier`, `disposition_analyzer`, `meaning_extractor`, `qa_generator`, `rewriter`, `evaluator`, `repairer`, `judge`) to model aliases from `models.yaml`. diff --git a/src/anonymizer/config/default_model_configs/providers.yaml b/src/anonymizer/config/default_model_configs/providers.yaml new file mode 100644 index 00000000..8799886f --- /dev/null +++ b/src/anonymizer/config/default_model_configs/providers.yaml @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +providers: + - name: nvidia + endpoint: https://integrate.api.nvidia.com/v1 + provider_type: openai + api_key: NVIDIA_API_KEY diff --git a/src/anonymizer/engine/ndd/model_loader.py b/src/anonymizer/engine/ndd/model_loader.py index 01262687..0d5d1c56 100644 --- a/src/anonymizer/engine/ndd/model_loader.py +++ b/src/anonymizer/engine/ndd/model_loader.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from data_designer.config.models import ModelConfig, load_model_configs +from data_designer.config.models import ModelConfig, ModelProvider, load_model_configs from data_designer.config.utils.io_helpers import load_config_file from pydantic import BaseModel @@ -62,8 +62,10 @@ def parse_model_configs(raw: str | Path | None) -> ParsedModelConfigs: parsed = _parse_yaml_string(raw) user_selections = parsed.pop("selected_models", None) + _validate_raw_model_configs_have_provider(parsed) + model_configs = load_model_configs(parsed) return ParsedModelConfigs( - model_configs=load_model_configs(parsed), + model_configs=model_configs, selected_models=_merge_selections(user_selections), ) @@ -82,6 +84,16 @@ def load_default_model_selection(config_dir: Path | None = None) -> ModelSelecti ) +def load_default_model_providers(config_dir: Path | None = None) -> list[ModelProvider]: + """Load bundled provider definitions from ``providers.yaml``.""" + resolved_dir = config_dir or DEFAULT_CONFIG_DIR + config_dict = _load_yaml_dict(resolved_dir / "providers.yaml") + raw_providers = config_dict.get("providers") + if not isinstance(raw_providers, list): + raise ValueError("Bundled providers YAML must contain a top-level 'providers' list.") + return [ModelProvider.model_validate(provider) for provider in raw_providers] + + def load_models_config(config_dir: Path | None = None) -> dict[str, Any]: """Load raw model definitions from models.yaml. @@ -222,6 +234,25 @@ def _merge(section: BaseModel, overrides: dict[str, Any]) -> BaseModel: ) +def validate_model_configs_reference_providers( + model_configs: list[ModelConfig], + providers: list[ModelProvider], +) -> None: + """Validate that every model config ``provider`` name exists in ``providers``.""" + known_providers = {provider.name for provider in providers} + unknown_by_alias = { + model_config.alias: model_config.provider + for model_config in model_configs + if model_config.provider is not None and model_config.provider not in known_providers + } + if unknown_by_alias: + details = ", ".join(f"{alias}={provider!r}" for alias, provider in sorted(unknown_by_alias.items())) + raise ValueError( + f"Model config provider names not found in model_providers: {details}. " + f"Known providers: {sorted(known_providers)}" + ) + + def validate_model_alias_references( model_configs: list[ModelConfig], selected_models: ModelSelection, @@ -303,6 +334,36 @@ def _validate_alias_references( ) +def _provider_field_missing(entry: dict[str, Any]) -> bool: + provider = entry.get("provider") + if provider is None: + return True + if isinstance(provider, str): + return not provider.strip() + return False + + +def _validate_raw_model_configs_have_provider(parsed: dict[str, Any]) -> None: + """Require an explicit ``provider`` on every user-supplied model config entry.""" + raw_configs = parsed.get("model_configs") + if raw_configs is None: + return + if not isinstance(raw_configs, list): + raise ValueError("model_configs must be a list.") + missing: list[str] = [] + for idx, entry in enumerate(raw_configs): + if not isinstance(entry, dict): + raise ValueError(f"model_configs[{idx}] must be a mapping.") + if _provider_field_missing(entry): + missing.append(str(entry.get("alias", f""))) + if missing: + aliases = ", ".join(repr(alias) for alias in missing) + raise ValueError( + f"Model config entries missing required field 'provider': {aliases}. " + "Each entry in model_configs must specify provider= explicitly." + ) + + def _parse_yaml_string(raw: str) -> dict[str, Any]: import yaml diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index ec08164a..0b8df502 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -51,7 +51,12 @@ from anonymizer.engine.evaluation.replace.type_fidelity_judge import TypeFidelityJudgeWorkflow from anonymizer.engine.io.reader import read_input from anonymizer.engine.ndd.adapter import FailedRecord, NddAdapter -from anonymizer.engine.ndd.model_loader import parse_model_configs, validate_model_alias_references +from anonymizer.engine.ndd.model_loader import ( + load_default_model_providers, + parse_model_configs, + validate_model_alias_references, + validate_model_configs_reference_providers, +) from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceWorkflow from anonymizer.engine.replace.replace_runner import ReplacementWorkflow from anonymizer.engine.resolved_input import ResolvedInput @@ -108,7 +113,8 @@ def __init__( pool and optional ``selected_models`` overrides. ``None`` uses bundled defaults. See ``default_model_configs/README.md``. model_providers: Provider definitions (list, YAML string, or file path). - Each provider maps a name to an endpoint and API key. + Each provider maps a name to an endpoint and API key. ``None`` uses + bundled defaults from ``default_model_configs/providers.yaml``. artifact_path: Directory for intermediate artifacts. Defaults to ``.anonymizer-artifacts``. data_designer: Pre-configured DataDesigner instance (advanced usage). @@ -122,10 +128,14 @@ def __init__( os.environ.setdefault("NEMO_SESSION_PREFIX", "anonymizer-") os.environ.setdefault("NEMO_DEPLOYMENT_TYPE", "sdk") resolved_artifact_path = Path(artifact_path or ".anonymizer-artifacts") - parsed = parse_model_configs(model_configs) - self._model_configs = parsed.model_configs - self._selected_models = parsed.selected_models - self._resolved_providers: list[ModelProvider] | None = _resolve_model_providers(model_providers) + try: + parsed = parse_model_configs(model_configs) + self._model_configs = parsed.model_configs + self._selected_models = parsed.selected_models + self._resolved_providers = _resolve_model_providers(model_providers) + validate_model_configs_reference_providers(self._model_configs, self._resolved_providers) + except ValueError as exc: + raise InvalidConfigError(str(exc)) from exc logger.info("🔧 Anonymizer initialized with %d model configs", len(self._model_configs)) det = self._selected_models.detection logger.info(LOG_INDENT + "🔎 detector: %s", det.entity_detector) @@ -631,10 +641,12 @@ def _count_entities(df: pd.DataFrame) -> int: def _resolve_model_providers( model_providers: list[ModelProvider] | str | Path | None, -) -> list[ModelProvider] | None: +) -> list[ModelProvider]: if model_providers is None: - return None + return load_default_model_providers() if isinstance(model_providers, list): + if not model_providers: + raise ValueError("model_providers must contain at least one provider.") return model_providers if isinstance(model_providers, str) and "\n" not in model_providers: candidate = Path(model_providers.strip()).expanduser() @@ -858,13 +870,9 @@ def _repair_iterations_triggered(failed: list[FailedRecord], is_rewrite: bool) - def _resolve_model_hosts(providers: list[ModelProvider] | None) -> list[str]: - """Sorted, deduplicated list of provider host classifications. - - Returns ``["nvidia-build"]`` when no custom providers are configured — - anonymizer's defaults route through build.nvidia.com. - """ + """Sorted, deduplicated list of provider host classifications.""" if not providers: from anonymizer.telemetry import ModelHostEnum as _MH - return [_MH.NVIDIA_BUILD.value] + return [_MH.OTHER.value] return collect_model_hosts([classify_model_host(p) for p in providers]) diff --git a/src/anonymizer/interface/errors.py b/src/anonymizer/interface/errors.py index de8583ac..29a39672 100644 --- a/src/anonymizer/interface/errors.py +++ b/src/anonymizer/interface/errors.py @@ -13,7 +13,7 @@ class InvalidInputError(AnonymizerError): class InvalidConfigError(AnonymizerError): - """Raised when model aliases or semantic configuration are invalid.""" + """Raised when model, provider, alias, or semantic configuration is invalid.""" class AnonymizerIOError(AnonymizerError): diff --git a/tests/conftest.py b/tests/conftest.py index 8d374c95..1da4cd45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,19 +50,19 @@ def _isolate_telemetry_env(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture def stub_detector_model_configs() -> list[ModelConfig]: """Model configs with the GLiNER PII detector alias.""" - return [ModelConfig(alias="gliner-pii-detector", model="nvidia/nemotron-pii")] + return [ModelConfig(alias="gliner-pii-detector", model="nvidia/nemotron-pii", provider="stub")] @pytest.fixture def stub_model_configs() -> list[ModelConfig]: """Generic model configs for workflows that don't care about the alias.""" - return [ModelConfig(alias="stub-model", model="stub-model")] + return [ModelConfig(alias="stub-model", model="stub-model", provider="stub")] @pytest.fixture def stub_known_model_configs() -> list[ModelConfig]: """Minimal model pool for alias validation tests.""" - return [ModelConfig(alias="known", model="some/model")] + return [ModelConfig(alias="known", model="some/model", provider="stub")] @pytest.fixture diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py index 3b754dd4..165b29c5 100644 --- a/tests/engine/test_model_loader.py +++ b/tests/engine/test_model_loader.py @@ -4,7 +4,7 @@ from __future__ import annotations import pytest -from data_designer.config.models import ModelConfig +from data_designer.config.models import ModelConfig, ModelProvider from anonymizer.config.models import ( DetectionModelSelection, @@ -15,12 +15,14 @@ DEFAULT_CONFIG_DIR, WorkflowName, get_model_alias, + load_default_model_providers, load_default_model_selection, load_models_config, load_workflow_config, load_workflow_selections, parse_model_configs, validate_model_alias_references, + validate_model_configs_reference_providers, ) @@ -124,7 +126,7 @@ def test_get_model_alias_rejects_list_valued_role(tmp_path) -> None: get_model_alias(WorkflowName.detection, "entity_validator", config_dir) -WORKFLOW_YAMLS = [p.stem for p in DEFAULT_CONFIG_DIR.glob("*.yaml") if p.stem != "models"] +WORKFLOW_YAMLS = [p.stem for p in DEFAULT_CONFIG_DIR.glob("*.yaml") if p.stem not in {"models", "providers"}] @pytest.mark.parametrize("workflow_name", WORKFLOW_YAMLS) @@ -165,9 +167,56 @@ def test_load_default_model_selection_populates_all_workflows() -> None: assert selection.rewrite.judge +def test_parse_model_configs_rejects_missing_provider() -> None: + yaml_str = """ +model_configs: + - alias: custom-detector + model: test/model +""" + with pytest.raises(ValueError, match="missing required field 'provider'"): + parse_model_configs(yaml_str) + + +def test_parse_model_configs_rejects_empty_provider_string() -> None: + yaml_str = """ +model_configs: + - alias: custom-detector + model: test/model + provider: " " +""" + with pytest.raises(ValueError, match="missing required field 'provider'"): + parse_model_configs(yaml_str) + + +def test_parse_model_configs_rejects_non_dict_model_config_entry() -> None: + yaml_str = """ +model_configs: + - not-a-mapping +""" + with pytest.raises(ValueError, match="model_configs\\[0\\] must be a mapping"): + parse_model_configs(yaml_str) + + +def test_bundled_model_providers_cover_bundled_model_configs() -> None: + """Every provider name in bundled models.yaml must exist in providers.yaml.""" + models_config = load_models_config() + provider_names = {provider.name for provider in load_default_model_providers()} + referenced = {entry["provider"] for entry in models_config.get("model_configs", []) if entry.get("provider")} + unknown = referenced - provider_names + assert not unknown, f"Bundled models.yaml references unknown providers: {unknown}" + + +def test_validate_model_configs_reference_providers_rejects_unknown_provider() -> None: + configs = [ModelConfig(alias="detector", model="test/detector", provider="missing-provider")] + providers = [ModelProvider(name="nvidia", endpoint="https://example.com/v1")] + with pytest.raises(ValueError, match="missing-provider"): + validate_model_configs_reference_providers(configs, providers) + + def test_parse_model_configs_none_uses_defaults() -> None: result = parse_model_configs(None) assert len(result.model_configs) > 0 + assert all(model_config.provider is not None for model_config in result.model_configs) assert result.selected_models.detection.entity_detector == "gliner-pii-detector" @@ -179,8 +228,10 @@ def test_parse_model_configs_yaml_string_extracts_selections() -> None: model_configs: - alias: custom-detector model: test/model + provider: stub - alias: gpt-oss-120b model: test/gpt + provider: stub """ result = parse_model_configs(yaml_str) assert result.selected_models.detection.entity_detector == "custom-detector" @@ -193,6 +244,7 @@ def test_parse_model_configs_yaml_without_selections_uses_defaults() -> None: model_configs: - alias: stub-model model: test/stub + provider: stub """ result = parse_model_configs(yaml_str) assert len(result.model_configs) == 1 @@ -214,6 +266,7 @@ def test_parse_model_configs_rejects_empty_entity_validator_override() -> None: model_configs: - alias: gliner-pii-detector model: test/gliner + provider: stub """ with pytest.raises(ValueError, match="at least one model alias"): parse_model_configs(yaml_str) @@ -229,12 +282,16 @@ def test_parse_model_configs_dedupes_duplicate_override_aliases_with_warning( model_configs: - alias: gliner-pii-detector model: test/gliner + provider: stub - alias: v1 model: test/v1 + provider: stub - alias: v2 model: test/v2 + provider: stub - alias: v3 model: test/v3 + provider: stub """ with caplog.at_level("WARNING", logger="anonymizer.config.models"): result = parse_model_configs(yaml_str) @@ -250,10 +307,13 @@ def test_parse_model_configs_strips_whitespace_only_override_entries() -> None: model_configs: - alias: gliner-pii-detector model: test/gliner + provider: stub - alias: v1 model: test/v1 + provider: stub - alias: v2 model: test/v2 + provider: stub """ result = parse_model_configs(yaml_str) assert result.selected_models.detection.entity_validator == ["v1", "v2"] @@ -519,9 +579,9 @@ def test_accepts_all_pool_aliases_present( ) -> None: """Pool of aliases all present in the model pool — passes.""" configs = [ - ModelConfig(alias="v1", model="test/v1"), - ModelConfig(alias="v2", model="test/v2"), - ModelConfig(alias="known", model="some/model"), + ModelConfig(alias="v1", model="test/v1", provider="stub"), + ModelConfig(alias="v2", model="test/v2", provider="stub"), + ModelConfig(alias="known", model="some/model", provider="stub"), ] selected_models = stub_slim_model_selection.model_copy( update={ diff --git a/tests/engine/test_ndd_adapter.py b/tests/engine/test_ndd_adapter.py index ea0a01a8..f605b763 100644 --- a/tests/engine/test_ndd_adapter.py +++ b/tests/engine/test_ndd_adapter.py @@ -47,7 +47,7 @@ def _unique_records( def _make_model_config(alias: str = "test-model-alias") -> ModelConfig: - return ModelConfig(alias=alias, model="dummy-model-id") + return ModelConfig(alias=alias, model="dummy-model-id", provider="stub") def _make_columns() -> list[ColumnConfigT]: diff --git a/tests/interface/cli/test_cli_model_configs.py b/tests/interface/cli/test_cli_model_configs.py index ef3a1a25..a0cee911 100644 --- a/tests/interface/cli/test_cli_model_configs.py +++ b/tests/interface/cli/test_cli_model_configs.py @@ -10,7 +10,7 @@ import pandas as pd import pytest -from anonymizer.engine.ndd.model_loader import parse_model_configs +from anonymizer.engine.ndd.model_loader import load_default_model_providers, parse_model_configs from anonymizer.interface.anonymizer import _resolve_model_providers from anonymizer.interface.cli.main import app @@ -204,9 +204,11 @@ def test_model_providers_missing_yaml_message_names_the_file() -> None: _resolve_model_providers(bad_path) -def test_model_providers_none_returns_none() -> None: - """None returns None — no providers registered.""" - assert _resolve_model_providers(None) is None +def test_model_providers_none_returns_bundled_defaults() -> None: + """None loads Anonymizer's bundled providers — not DataDesigner's global defaults.""" + result = _resolve_model_providers(None) + bundled = load_default_model_providers() + assert {p.name for p in result} == {p.name for p in bundled} def test_model_providers_list_passthrough() -> None: @@ -218,6 +220,11 @@ def test_model_providers_list_passthrough() -> None: assert result is providers +def test_model_providers_empty_list_raises() -> None: + with pytest.raises(ValueError, match="at least one provider"): + _resolve_model_providers([]) + + # --------------------------------------------------------------------------- # CLI integration: _cli_error_handler catches FileNotFoundError # --------------------------------------------------------------------------- diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py index f892285c..77617f49 100644 --- a/tests/interface/test_anonymizer_interface.py +++ b/tests/interface/test_anonymizer_interface.py @@ -5,7 +5,7 @@ from pathlib import Path from types import SimpleNamespace -from unittest.mock import Mock +from unittest.mock import Mock, patch import pandas as pd import pytest @@ -25,6 +25,7 @@ ) from anonymizer.engine.detection.detection_workflow import EntityDetectionResult, EntityDetectionWorkflow from anonymizer.engine.ndd.adapter import FailedRecord +from anonymizer.engine.ndd.model_loader import load_default_model_providers from anonymizer.engine.replace.replace_runner import ReplacementResult, ReplacementWorkflow from anonymizer.engine.rewrite.rewrite_workflow import RewriteResult, RewriteWorkflow from anonymizer.interface.anonymizer import Anonymizer, _resolve_model_providers @@ -137,6 +138,80 @@ def test_resolve_model_providers_raises_on_invalid_yaml(tmp_path: Path) -> None: _resolve_model_providers(yaml_path) +def test_anonymizer_default_passes_bundled_providers_to_data_designer() -> None: + bundled = load_default_model_providers() + with patch("anonymizer.interface.anonymizer.DataDesigner") as mock_data_designer: + Anonymizer( + detection_workflow=Mock(), + replace_runner=Mock(), + rewrite_runner=Mock(), + ) + mock_data_designer.assert_called_once() + passed_providers = mock_data_designer.call_args.kwargs["model_providers"] + assert {provider.name for provider in passed_providers} == {provider.name for provider in bundled} + + +def test_anonymizer_custom_model_providers_override_bundled_defaults() -> None: + from anonymizer import ModelProvider + + # Bundled model configs reference provider name "nvidia"; override the endpoint, not the name. + custom_providers = [ModelProvider(name="nvidia", endpoint="https://example.com/v1")] + with patch("anonymizer.interface.anonymizer.DataDesigner") as mock_data_designer: + Anonymizer( + model_providers=custom_providers, + detection_workflow=Mock(), + replace_runner=Mock(), + rewrite_runner=Mock(), + ) + passed_providers = mock_data_designer.call_args.kwargs["model_providers"] + assert passed_providers is custom_providers + + +def test_anonymizer_rejects_missing_provider_as_invalid_config_error() -> None: + yaml_str = """ +model_configs: + - alias: custom-detector + model: test/model +""" + with pytest.raises(InvalidConfigError, match="missing required field 'provider'"): + Anonymizer( + model_configs=yaml_str, + detection_workflow=Mock(), + replace_runner=Mock(), + rewrite_runner=Mock(), + ) + + +def test_anonymizer_rejects_unknown_model_provider_as_invalid_config_error() -> None: + from anonymizer import ModelProvider + + yaml_str = """ +model_configs: + - alias: custom-detector + model: test/model + provider: unknown-provider +""" + providers = [ModelProvider(name="nvidia", endpoint="https://example.com/v1")] + with pytest.raises(InvalidConfigError, match="unknown-provider"): + Anonymizer( + model_configs=yaml_str, + model_providers=providers, + detection_workflow=Mock(), + replace_runner=Mock(), + rewrite_runner=Mock(), + ) + + +def test_anonymizer_rejects_empty_model_providers_list() -> None: + with pytest.raises(InvalidConfigError, match="at least one provider"): + Anonymizer( + model_providers=[], + detection_workflow=Mock(), + replace_runner=Mock(), + rewrite_runner=Mock(), + ) + + def test_run_exposes_trace_dataframe_and_filters_internal_columns( stub_anonymizer_config: AnonymizerConfig, stub_input: AnonymizerInput, diff --git a/tests/interface/test_anonymizer_telemetry.py b/tests/interface/test_anonymizer_telemetry.py index 9e4c2b05..8d7e1124 100644 --- a/tests/interface/test_anonymizer_telemetry.py +++ b/tests/interface/test_anonymizer_telemetry.py @@ -315,12 +315,12 @@ def test_transformation_type_matches_schema_enum( assert captured_events[0].transformation_type == expected_value - def test_default_model_hosts_is_nvidia_build( + def test_default_model_hosts_reflect_bundled_providers( self, captured_events: list[AnonymizerEvent], stub_input: AnonymizerInput, ) -> None: - """When no providers are configured, hosts list contains only 'nvidia-build'.""" + """Plain Anonymizer() classifies hosts from bundled providers.yaml.""" anonymizer, *_ = _make_anonymizer() anonymizer.run(config=AnonymizerConfig(replace=Redact()), data=stub_input) From 7f22a3da87329be4b06fb1f7b63ff68c33f25a61 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Fri, 12 Jun 2026 15:59:41 -0700 Subject: [PATCH 2/3] fix: tighten provider resolution edge cases (#180) Reject empty providers lists from YAML files and remove the unreachable _resolve_model_hosts fallback now that bundled providers are always resolved. --- src/anonymizer/interface/anonymizer.py | 8 +++----- tests/interface/cli/test_cli_model_configs.py | 7 +++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index 0b8df502..51ecf06f 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -658,6 +658,8 @@ def _resolve_model_providers( raw_providers = config_dict.get("providers") if not isinstance(raw_providers, list): raise ValueError("model_providers YAML must contain a top-level 'providers' list.") + if not raw_providers: + raise ValueError("model_providers must contain at least one provider.") return [ModelProvider.model_validate(provider) for provider in raw_providers] @@ -869,10 +871,6 @@ def _repair_iterations_triggered(failed: list[FailedRecord], is_rewrite: bool) - return len(iterations) -def _resolve_model_hosts(providers: list[ModelProvider] | None) -> list[str]: +def _resolve_model_hosts(providers: list[ModelProvider]) -> list[str]: """Sorted, deduplicated list of provider host classifications.""" - if not providers: - from anonymizer.telemetry import ModelHostEnum as _MH - - return [_MH.OTHER.value] return collect_model_hosts([classify_model_host(p) for p in providers]) diff --git a/tests/interface/cli/test_cli_model_configs.py b/tests/interface/cli/test_cli_model_configs.py index a0cee911..93082751 100644 --- a/tests/interface/cli/test_cli_model_configs.py +++ b/tests/interface/cli/test_cli_model_configs.py @@ -225,6 +225,13 @@ def test_model_providers_empty_list_raises() -> None: _resolve_model_providers([]) +def test_model_providers_empty_yaml_list_raises(tmp_path: Path) -> None: + path = tmp_path / "providers.yaml" + path.write_text("providers: []\n") + with pytest.raises(ValueError, match="at least one provider"): + _resolve_model_providers(str(path)) + + # --------------------------------------------------------------------------- # CLI integration: _cli_error_handler catches FileNotFoundError # --------------------------------------------------------------------------- From eb9effacfd076ac46821fd4a51ee5badbb79aae1 Mon Sep 17 00:00:00 2001 From: lipikaramaswamy Date: Fri, 12 Jun 2026 16:03:04 -0700 Subject: [PATCH 3/3] fix: resolve CI failures after merging main (#180) Thread validation_single_chunk_full_text through _build_detection_spec, add provider fields to benchmark preflight fixtures, and fix measurement test ModelConfig deprecation warnings. --- src/anonymizer/engine/detection/detection_workflow.py | 2 ++ tests/test_measurement.py | 6 +++--- tests/tools/test_measurement_tools.py | 6 ++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index d59246b7..b08fc4af 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -117,6 +117,7 @@ def detect_and_validate_entities( gliner_detection_threshold=gliner_detection_threshold, validation_max_entities_per_call=validation_max_entities_per_call, validation_excerpt_window_chars=validation_excerpt_window_chars, + validation_single_chunk_full_text=validation_single_chunk_full_text, entity_labels=entity_labels, data_summary=data_summary, ) @@ -138,6 +139,7 @@ def _build_detection_spec( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, data_summary: str | None = None, ) -> tuple[list[ModelConfig], list[ColumnConfigT]]: diff --git a/tests/test_measurement.py b/tests/test_measurement.py index 5be9e361..a77b19f7 100644 --- a/tests/test_measurement.py +++ b/tests/test_measurement.py @@ -113,7 +113,7 @@ def test_ndd_adapter_records_workflow_measurement_without_raw_text() -> None: with measurement_session(collector): result = adapter.run_workflow( input_df, - model_configs=[ModelConfig(alias="detector", model="dummy")], + model_configs=[ModelConfig(alias="detector", model="dummy", provider="stub")], columns=[ LLMTextColumnConfig( name="raw_detected", @@ -185,7 +185,7 @@ def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespa with measurement_session(collector): adapter.run_workflow( input_df, - model_configs=[ModelConfig(alias="detector", model="dummy")], + model_configs=[ModelConfig(alias="detector", model="dummy", provider="stub")], columns=[ LLMTextColumnConfig( name="raw_detected", @@ -280,7 +280,7 @@ def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespa with measurement_session(collector): adapter.run_workflow( input_df, - model_configs=[ModelConfig(alias="validator", model="shared-model")], + model_configs=[ModelConfig(alias="validator", model="shared-model", provider="stub")], columns=[ LLMTextColumnConfig( name="raw_detected", diff --git a/tests/tools/test_measurement_tools.py b/tests/tools/test_measurement_tools.py index 3aa4a009..20d3d44b 100644 --- a/tests/tools/test_measurement_tools.py +++ b/tests/tools/test_measurement_tools.py @@ -696,10 +696,13 @@ def test_benchmark_preflight_rejects_bad_model_alias_references(tmp_path: Path) model_configs: - alias: detector model: test/detector + provider: stub - alias: validator model: test/validator + provider: stub - alias: augmenter model: test/augmenter + provider: stub workloads: - id: biography source: input.csv @@ -736,10 +739,13 @@ def test_benchmark_preflight_rejects_missing_evaluate_model_alias(tmp_path: Path model_configs: - alias: detector model: test/detector + provider: stub - alias: validator model: test/validator + provider: stub - alias: augmenter model: test/augmenter + provider: stub workloads: - id: biography source: input.csv