diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml new file mode 100644 index 00000000..687690ca --- /dev/null +++ b/.github/workflows/benchmark-ci.yml @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Benchmark CI + +on: + workflow_dispatch: + inputs: + ref: + description: "Commit SHA, branch, or tag to benchmark" + required: true + default: "main" + suite: + description: "Benchmark suite YAML path" + required: true + default: "tools/measurement/examples/repo-data-smoke.yaml" + output_dir: + description: "Output directory for benchmark artifacts" + required: true + default: "benchmark-results" + dd_trace: + description: "Capture DataDesigner message traces" + required: true + type: choice + options: + - "none" + - "last_message" + - "all_messages" + default: "none" + dd_task_trace: + description: "Capture sanitized DataDesigner scheduler task traces" + required: true + type: choice + options: + - "false" + - "true" + default: "false" + fail_fast: + description: "Stop at the first failed benchmark case" + required: true + type: choice + options: + - "false" + - "true" + default: "false" + +permissions: + contents: read + +env: + NEMO_TELEMETRY_ENABLED: "false" + BENCHMARK_REF: ${{ inputs.ref }} + BENCHMARK_SUITE: ${{ inputs.suite }} + BENCHMARK_OUTPUT_DIR: ${{ inputs.output_dir }} + BENCHMARK_DD_TRACE: ${{ inputs.dd_trace }} + BENCHMARK_DD_TASK_TRACE: ${{ inputs.dd_task_trace }} + BENCHMARK_FAIL_FAST: ${{ inputs.fail_fast }} + +jobs: + benchmark: + name: Benchmark + runs-on: [self-hosted, anonymizer-evals] + timeout-minutes: 120 + + steps: + - name: Checkout benchmark target + uses: actions/checkout@v4 + with: + ref: ${{ env.BENCHMARK_REF }} + fetch-depth: "0" + + - name: Resolve benchmark target commit + id: target + run: echo "commit=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + + - name: Install uv + uses: 
astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.11" + + - name: Install dependencies + run: uv sync --group dev + + - name: Check NVIDIA API key + env: + NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + run: | + if [ -z "${NVIDIA_API_KEY:-}" ]; then + echo "::error::NVIDIA_API_KEY secret is required for benchmark CI" + exit 1 + fi + + - name: Run benchmark suite + env: + NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + run: | + TRACE_ARGS=(--dd-trace "$BENCHMARK_DD_TRACE") + if [ "$BENCHMARK_DD_TRACE" != "none" ]; then + TRACE_ARGS+=(--trace-dir "$BENCHMARK_OUTPUT_DIR/traces") + fi + + TASK_TRACE_ARGS=() + if [ "$BENCHMARK_DD_TASK_TRACE" = "true" ]; then + TASK_TRACE_ARGS+=(--dd-task-trace --task-trace-dir "$BENCHMARK_OUTPUT_DIR/task-traces") + fi + + FAIL_FAST_ARGS=() + if [ "$BENCHMARK_FAIL_FAST" = "true" ]; then + FAIL_FAST_ARGS+=(--fail-fast) + fi + + uv run python tools/measurement/run_benchmarks.py \ + "$BENCHMARK_SUITE" \ + --output "$BENCHMARK_OUTPUT_DIR" \ + --overwrite \ + "${TRACE_ARGS[@]}" \ + "${TASK_TRACE_ARGS[@]}" \ + "${FAIL_FAST_ARGS[@]}" + + - name: Add benchmark summary + if: always() + env: + BENCHMARK_COMMIT: ${{ steps.target.outputs.commit }} + run: | + python - <<'PY' + import json + import os + from pathlib import Path + + output_dir = Path(os.environ["BENCHMARK_OUTPUT_DIR"]) + summary_path = output_dir / "summary.json" + step_summary = Path(os.environ["GITHUB_STEP_SUMMARY"]) + + with step_summary.open("a", encoding="utf-8") as handle: + handle.write("# Anonymizer Benchmark\n\n") + handle.write(f"- Ref: `{os.environ['BENCHMARK_REF']}`\n") + handle.write(f"- Commit: `{os.environ.get('BENCHMARK_COMMIT', 'unknown')}`\n") + handle.write(f"- Suite: `{os.environ['BENCHMARK_SUITE']}`\n") + handle.write(f"- Output: `{output_dir}`\n") + handle.write(f"- DD traces: `{os.environ['BENCHMARK_DD_TRACE']}`\n") + handle.write(f"- DD task traces: 
`{os.environ['BENCHMARK_DD_TASK_TRACE']}`\n\n") + + if not summary_path.exists(): + handle.write("`summary.json` was not produced. Check job logs for setup or preflight failures.\n") + raise SystemExit(0) + + summary = json.loads(summary_path.read_text(encoding="utf-8")) + cases = summary.get("cases", []) + completed = sum(1 for case in cases if case.get("status") == "completed") + errors = sum(1 for case in cases if case.get("status") == "error") + handle.write(f"Ran {completed}/{len(cases)} case(s); errors={errors}.\n\n") + handle.write("| Case | Status | Elapsed | Attempts |\n") + handle.write("| --- | --- | ---: | ---: |\n") + for case in cases: + elapsed = case.get("elapsed_sec") + elapsed_text = "" if elapsed is None else f"{elapsed:.2f}s" + handle.write( + f"| `{case.get('case_id')}` | {case.get('status')} | {elapsed_text} | " + f"{case.get('attempt_count', 0)} |\n" + ) + PY + + - name: Upload benchmark artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: anonymizer-benchmark-${{ steps.target.outputs.commit }} + path: ${{ env.BENCHMARK_OUTPUT_DIR }}/ + if-no-files-found: warn diff --git a/.gitignore b/.gitignore index 6548d6ec..4382af19 100644 --- a/.gitignore +++ b/.gitignore @@ -108,6 +108,7 @@ ai/tmp/ # Anonymizer execution artifacts .anonymizer-artifacts/ +benchmark-results/ docs/notebook_source/data/synth_bios_sample10_anonymized.csv # TLS certs and keys (if any) diff --git a/docs/development/observability.md b/docs/development/observability.md new file mode 100644 index 00000000..be4f2ba5 --- /dev/null +++ b/docs/development/observability.md @@ -0,0 +1,183 @@ + + + +# Observability + +Anonymizer keeps local run measurement in the `anonymizer.measurement` package. +Measurement hooks record timings, counts, model-call summaries, and safety +metrics without changing anonymization behavior. Benchmark tools convert those +records into tables for latency, reliability, model usage, and quality analysis. 
+ +Measurement is separate from anonymous NVIDIA telemetry. Telemetry can report +one product event per run or preview. Users can opt out as described in +[Telemetry and Privacy](../index.md#telemetry-and-privacy). Measurement records +are local artifacts. They are written only when developer tooling or caller code +activates a measurement session. + +## Model + +Instrumentation is passive unless a `MeasurementCollector` is active in the +current context: + +```python +from anonymizer.measurement import MeasurementConfig, configured_measurement_session + +measurement = MeasurementConfig(output_path="benchmark-runs/case/measurements.jsonl") + +with configured_measurement_session(measurement): + result = anonymizer.run(config=config, data=data) +``` + +Instrumentation uses these entry points: + +- `stage_timer(...)` wraps pipeline phases and records elapsed time. +- `record_run_metadata(...)` records config, input, model, and runtime metadata + once per run, without raw source values. +- `record_record_metrics(...)` records per-row counts and safety metrics from + the trace DataFrame. +- `record_ndd_workflow(...)` records DataDesigner workflow summaries at the + `NddAdapter` boundary. +- `record_model_workflow(...)` records benchmark-only direct model calls that do + not use DataDesigner. + +The public API and CLI do not read measurement environment variables by default. +Benchmark and developer tools opt into measurement explicitly. + +## Record Types + +Measurement output is JSONL by default. Each row has a `record_type` and shared +run metadata. + +| Record type | Meaning | +| --- | --- | +| `run` | One anonymization call: mode, strategy, input shape, config metadata, model aliases, runtime metadata. | +| `stage` | Pipeline phase timing, status, row counts, and row throughput. | +| `record` | Per-input-row counts, text-length buckets, entity counts, ground-truth comparison metrics when present, replacement coverage, leakage flags, and estimated LLM calls. 
| +| `ndd_workflow` | DataDesigner workflow summary: workflow name, model aliases, row counts, failures, elapsed time, usage summary, and throughput. | +| `model_workflow` | Direct model workflow summary for benchmark-only paths outside DataDesigner. | +| `dd_trace_coverage` | Trace coverage summary for DataDesigner columns when message tracing is enabled. | + +Use `tools/measurement/export_measurements.py` to convert raw measurement JSONL +into Parquet, CSV, or JSONL tables. + +## Output and Sinks + +`MeasurementConfig` controls output: + +| Field | Purpose | +| --- | --- | +| `output_path` | Destination for measurement records. | +| `output_format` | `jsonl` or `json`; defaults to `jsonl`. | +| `record_level` | Include per-row `record` entries; defaults to `True`. | +| `streaming` | Write JSONL records as they are emitted instead of collecting them in memory. | +| `keep_records` | Keep emitted records in memory for caller access. | +| `run_id` | Optional stable run ID. | +| `run_tags` | Caller-supplied tags copied to every record. | +| `fail_on_write_error` | Raise output write/close failures when the run body succeeded. | + +Streaming mode supports JSONL only. Use it for long benchmark suites where +holding all measurement records in memory is unnecessary. + +`MeasurementConfig.from_env()` can read `ANONYMIZER_MEASUREMENT_*` settings for +developer tooling. Product entry points do not call it automatically. 
+ +| Environment variable | Field | +| --- | --- | +| `ANONYMIZER_MEASUREMENT_OUTPUT_PATH` | `output_path` | +| `ANONYMIZER_MEASUREMENT_OUTPUT_FORMAT` | `output_format` | +| `ANONYMIZER_MEASUREMENT_RECORD_LEVEL` | `record_level` | +| `ANONYMIZER_MEASUREMENT_STREAMING` | `streaming` | +| `ANONYMIZER_MEASUREMENT_KEEP_RECORDS` | `keep_records` | +| `ANONYMIZER_MEASUREMENT_DD_TRACE` | `dd_trace` | +| `ANONYMIZER_MEASUREMENT_DD_TRACE_PATH` | `dd_trace_path` | +| `ANONYMIZER_MEASUREMENT_DD_TASK_TRACE_PATH` | `dd_task_trace_path` | +| `ANONYMIZER_MEASUREMENT_FAIL_ON_WRITE_ERROR` | `fail_on_write_error` | +| `ANONYMIZER_MEASUREMENT_RUN_ID` | `run_id` | +| `ANONYMIZER_MEASUREMENT_RUN_TAGS` | `run_tags` | + +## DataDesigner Message Traces + +DataDesigner message traces are optional sidecar artifacts for model-call +debugging: + +```python +measurement = MeasurementConfig( + output_path="benchmark-runs/case/measurements.jsonl", + dd_trace="last_message", + dd_trace_path="benchmark-runs/case/traces.jsonl", +) +``` + +`last_message` stores the final prompt message for each traced DataDesigner +model call. `all_messages` stores the full message list. + +Message traces are separate from measurement records. They may contain raw input +text, prompts, generated output, entity values, replacement values, secrets, and +PII. Do not share them unless they have been reviewed or redacted. + +Anonymizer requests standard LLM-column traces through DataDesigner native LLM +column trace side effects. That covers `LLMTextColumnConfig` and +`LLMStructuredColumnConfig`. + +Model-backed `CustomColumnConfig` generator functions use a temporary +Anonymizer shim that instruments the per-run DataDesigner model registry and +returned model facades. This captures model calls that DataDesigner does not yet +expose through a public trace sink. Treat this as a brittle bridge over private +DataDesigner internals, not as a stable integration point. 
+ +When tracing is enabled, the measurement stream records a `dd_trace_coverage` +row with native, private-facade, and unsupported column counts so benchmark +analysis can see which trace path covered each workflow. + +## DataDesigner Task Traces + +Scheduler task traces are a separate sidecar: + +```python +measurement = MeasurementConfig( + output_path="benchmark-runs/case/measurements.jsonl", + dd_task_trace_path="benchmark-runs/case/task-traces.jsonl", +) +``` + +Task traces capture DataDesigner scheduler timing metadata: workflow, column, +row group, row index, task type, status, relative dispatch/slot-acquired/ +completion offsets, queue wait time, execution time, total time, and whether an +error was present. They do not store raw DataDesigner error strings because +those strings can contain prompts, outputs, or source values. + +Offsets are relative to the earliest positive `dispatched_at` timestamp in the +task-trace batch for that workflow. They make task overlap easier to inspect +without persisting host-specific wall-clock timestamps. + +## Safety Rules + +Measurement records must not contain raw text, entity values, prompts, generated +outputs, replacement maps, provider secrets, or API keys. + +Use counts, labels, lengths, buckets, model aliases, status flags, elapsed time, +token counts, request counts, and run-scoped HMACs instead. The collector hashes +record identity with a per-run key. Record hashes can join artifacts from one +run, but they are not stable identifiers across unrelated runs unless the caller +supplies the same hash key deliberately. + +When adding instrumentation: + +- Put timing around stable phase boundaries, not every helper call. +- Record metadata at the boundary where the information is known. +- Keep raw debug payloads in explicit sidecars, never in measurement records. +- Prefer `run_tags` for external run context such as source refs, CI IDs, + topology labels, or experimental strategy. 
The benchmark runner owns + `suite_id`, `case_id`, `workload_id`, `config_id`, and `repetition`. +- Keep benchmark-only strategy switches in `tools/measurement`, not product + defaults. + +## Key Files + +| File | Purpose | +| --- | --- | +| `src/anonymizer/measurement/` | Collector, config, context managers, safe record builders, and trace sidecar hooks. | +| `src/anonymizer/interface/anonymizer.py` | Run-level and per-record measurement integration. | +| `src/anonymizer/engine/ndd/adapter.py` | DataDesigner workflow measurement, native message trace capture, and scheduler task trace capture. | +| `tools/measurement/run_benchmarks.py` | Benchmark suite runner that activates measurement sessions and writes per-case artifacts. | +| `tools/measurement/README.md` | Detailed benchmark and analysis command reference. | diff --git a/mkdocs.yml b/mkdocs.yml index 29d11c01..45673b1d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -167,6 +167,8 @@ nav: - Choosing a Replacement Strategy: notebooks/03_choosing_a_replacement_strategy.ipynb - Rewriting Biographies: notebooks/04_rewriting_biographies.ipynb - Rewriting Legal Documents: notebooks/05_rewriting_legal_documents.ipynb + - Development: + - Observability: development/observability.md - API Reference: reference/ - Developer Notes: - devnotes/index.md diff --git a/pyproject.toml b/pyproject.toml index 29865798..972fbb12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ license = "Apache-2.0" dependencies = [ "data-designer==0.6.0", "pydantic>=2.9,<3", + "pydantic-settings>=2.12,<3", "cyclopts>=3", "pygments>=2.20.0", "cryptography>=46.0.6", diff --git a/src/anonymizer/engine/constants.py b/src/anonymizer/engine/constants.py index fdfecadf..d9e1f079 100644 --- a/src/anonymizer/engine/constants.py +++ b/src/anonymizer/engine/constants.py @@ -45,6 +45,7 @@ COL_ENTITIES_BY_VALUE = "_entities_by_value" COL_REPLACED_TEXT = "__nemo_anonymizer_text_output__" COL_REPLACEMENT_MAP = "_replacement_map" 
+COL_REPLACEMENT_MAP_SOURCE = "_replacement_map_source" # LlmReplaceWorkflow internal prompt-construction columns. Created by # `LlmReplaceWorkflow.generate_map_only` for the replacement-generator prompt diff --git a/src/anonymizer/engine/detection/chunked_validation.py b/src/anonymizer/engine/detection/chunked_validation.py index 50601cad..b870fab9 100644 --- a/src/anonymizer/engine/detection/chunked_validation.py +++ b/src/anonymizer/engine/detection/chunked_validation.py @@ -102,6 +102,11 @@ class ChunkedValidationParams(BaseModel): max_entities_per_call: Upper bound on candidates per chunk. excerpt_window_chars: Chars of surrounding raw text included in each chunk's excerpt on either side of the chunk span. + single_chunk_full_text: If True, a row with one validation chunk sees + the full tagged document. If False, even a single chunk uses the + excerpt window. The default preserves production parity with the + pre-chunking validation path; benchmarks may disable it to probe + compact validation prompts. prompt_template: Jinja2 source for the validation prompt (with ``_seed_tagged_text``, ``_validation_skeleton``, ``_tag_notation`` placeholders). Typically produced by ``_get_validation_prompt``. @@ -119,6 +124,7 @@ class ChunkedValidationParams(BaseModel): pool: list[str] = Field(min_length=1) max_entities_per_call: int = Field(gt=0) excerpt_window_chars: int = Field(gt=0) + single_chunk_full_text: bool = True prompt_template: str = Field(repr=False) system_prompt: str | None = Field(default=None, repr=False) @@ -449,7 +455,11 @@ def chunked_validate_row( # only making one call there's no cost reason to clip, and clipping # would silently narrow the context the validator sees. Computed once # here because ``len(chunks) == 1`` is loop-invariant. 
- single_chunk_tagged_text = build_tagged_text(text, all_spans, notation=notation) if len(chunks) == 1 else None + single_chunk_tagged_text = ( + build_tagged_text(text, all_spans, notation=notation) + if len(chunks) == 1 and params.single_chunk_full_text + else None + ) dispatch_kwargs_per_chunk: list[dict[str, Any]] = [] for chunk_index, chunk in enumerate(chunks): diff --git a/src/anonymizer/engine/detection/detection_workflow.py b/src/anonymizer/engine/detection/detection_workflow.py index 87eb644b..c0a34f83 100644 --- a/src/anonymizer/engine/detection/detection_workflow.py +++ b/src/anonymizer/engine/detection/detection_workflow.py @@ -59,6 +59,7 @@ EntitiesSchema, LatentEntitiesSchema, ) +from anonymizer.measurement import stage_timer logger = logging.getLogger("anonymizer.detection") @@ -94,6 +95,7 @@ def detect_and_validate_entities( gliner_detection_threshold: float, validation_max_entities_per_call: int = _DEFAULT_VALIDATION_MAX_ENTITIES_PER_CALL, validation_excerpt_window_chars: int = _DEFAULT_VALIDATION_EXCERPT_WINDOW_CHARS, + validation_single_chunk_full_text: bool = True, entity_labels: list[str] | None = None, data_summary: str | None = None, preview_num_records: int | None = None, @@ -143,6 +145,7 @@ def detect_and_validate_entities( pool=list(validator_aliases), max_entities_per_call=validation_max_entities_per_call, excerpt_window_chars=validation_excerpt_window_chars, + single_chunk_full_text=validation_single_chunk_full_text, prompt_template=_get_validation_prompt(data_summary=data_summary, labels=labels), ) @@ -266,54 +269,64 @@ def run( ``identify_latent_entities`` if ``tag_latent_entities`` is True (rewrite mode). Merges failures from both stages. 
""" - if tag_latent_entities and privacy_goal is None: - raise ValueError("privacy_goal is required when tag_latent_entities=True (rewrite mode)") - - compute_grouped = True if compute_grouped_entities is None else compute_grouped_entities - detected_result = self.detect_and_validate_entities( - dataframe, - model_configs=model_configs, - selected_models=selected_models, - gliner_detection_threshold=gliner_detection_threshold, - validation_max_entities_per_call=validation_max_entities_per_call, - validation_excerpt_window_chars=validation_excerpt_window_chars, - entity_labels=entity_labels, - data_summary=data_summary, - preview_num_records=preview_num_records, - ) - - if tag_latent_entities: - latent_result = self.identify_latent_entities( - detected_result.dataframe, + with stage_timer( + "EntityDetectionWorkflow.run", + input_row_count=len(dataframe), + tag_latent_entities=tag_latent_entities, + ) as measurement: + if tag_latent_entities and privacy_goal is None: + raise ValueError("privacy_goal is required when tag_latent_entities=True (rewrite mode)") + + compute_grouped = True if compute_grouped_entities is None else compute_grouped_entities + detected_result = self.detect_and_validate_entities( + dataframe, model_configs=model_configs, selected_models=selected_models, gliner_detection_threshold=gliner_detection_threshold, + validation_max_entities_per_call=validation_max_entities_per_call, + validation_excerpt_window_chars=validation_excerpt_window_chars, entity_labels=entity_labels, - privacy_goal=privacy_goal, data_summary=data_summary, preview_num_records=preview_num_records, ) - final_df = latent_result.dataframe.copy() - final_failures = [*detected_result.failed_records, *latent_result.failed_records] - else: - final_df = detected_result.dataframe.copy() - final_failures = detected_result.failed_records - - # When entity_labels is explicitly provided (even if it matches DEFAULT_ENTITY_LABELS), - # the augmenter is strict and out-of-scope labels are 
filtered. - # entity_labels=None is the only way to get permissive augmentation. - # TODO(docs): document this None-vs-explicit contract in user-facing docs. - if COL_DETECTED_ENTITIES in final_df.columns: - allowed = set(entity_labels) if entity_labels is not None else None - final_df[COL_FINAL_ENTITIES] = final_df[COL_DETECTED_ENTITIES].apply( - lambda raw: _materialize_final_entities(raw, allowed_labels=allowed) + + if tag_latent_entities: + latent_result = self.identify_latent_entities( + detected_result.dataframe, + model_configs=model_configs, + selected_models=selected_models, + gliner_detection_threshold=gliner_detection_threshold, + entity_labels=entity_labels, + privacy_goal=privacy_goal, + data_summary=data_summary, + preview_num_records=preview_num_records, + ) + final_df = latent_result.dataframe.copy() + final_failures = [*detected_result.failed_records, *latent_result.failed_records] + else: + final_df = detected_result.dataframe.copy() + final_failures = detected_result.failed_records + + # When entity_labels is explicitly provided (even if it matches DEFAULT_ENTITY_LABELS), + # the augmenter is strict and out-of-scope labels are filtered. + # entity_labels=None is the only way to get permissive augmentation. + # TODO(docs): document this None-vs-explicit contract in user-facing docs. 
+ if COL_DETECTED_ENTITIES in final_df.columns: + allowed = set(entity_labels) if entity_labels is not None else None + final_df[COL_FINAL_ENTITIES] = final_df[COL_DETECTED_ENTITIES].apply( + lambda raw: _materialize_final_entities(raw, allowed_labels=allowed) + ) + if compute_grouped: + final_df[COL_ENTITIES_BY_VALUE] = final_df[COL_FINAL_ENTITIES].apply(_build_entities_by_value) + result = EntityDetectionResult( + dataframe=final_df, + failed_records=final_failures, ) - if compute_grouped: - final_df[COL_ENTITIES_BY_VALUE] = final_df[COL_FINAL_ENTITIES].apply(_build_entities_by_value) - return EntityDetectionResult( - dataframe=final_df, - failed_records=final_failures, - ) + measurement.update( + output_row_count=len(result.dataframe), + failed_record_count=len(result.failed_records), + ) + return result def _inject_detector_params( self, diff --git a/src/anonymizer/engine/ndd/adapter.py b/src/anonymizer/engine/ndd/adapter.py index 8aa9b920..a7d9015b 100644 --- a/src/anonymizer/engine/ndd/adapter.py +++ b/src/anonymizer/engine/ndd/adapter.py @@ -3,21 +3,34 @@ from __future__ import annotations +import importlib import json import logging +import re import tempfile +import time import uuid +from collections.abc import Iterator, Mapping +from contextlib import contextmanager +from contextvars import ContextVar from dataclasses import dataclass +from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING +from threading import RLock +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypedDict, TypeGuard, cast +from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig, LLMTextColumnConfig from data_designer.config.column_types import ColumnConfigT from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.models import ModelConfig +from data_designer.config.run_config import RunConfig from data_designer.config.seed import SamplingStrategy from 
data_designer.config.seed_source import LocalFileSeedSource +from data_designer.config.utils.constants import TRACE_COLUMN_POSTFIX +from data_designer.config.utils.trace_type import TraceType from anonymizer.interface.errors import AnonymizerWorkflowError +from anonymizer.measurement import current_collector, record_ndd_workflow if TYPE_CHECKING: import pandas as pd @@ -26,6 +39,9 @@ logger = logging.getLogger("anonymizer.ndd") RECORD_ID_COLUMN = "_anonymizer_record_id" +_TRACEABLE_LLM_COLUMN_TYPES = (LLMTextColumnConfig, LLMStructuredColumnConfig) +_MODEL_TRACE_COLUMN: ContextVar[str | None] = ContextVar("anonymizer_dd_model_trace_column", default=None) +_MODEL_TRACE_PURPOSE: ContextVar[str | None] = ContextVar("anonymizer_dd_model_trace_purpose", default=None) @dataclass(frozen=True) @@ -45,11 +61,215 @@ class WorkflowRunResult: failed_records: list[FailedRecord] +@dataclass(frozen=True) +class _NativeTraceColumn: + column_name: str + trace_column_name: str + model_alias: str | None + model_name: str | None + model_provider_name: str | None + + +@dataclass(frozen=True) +class _PrivateFacadeTraceColumn: + column_name: str + + +@dataclass(frozen=True) +class _DDMessageTracePlan: + columns: list[ColumnConfigT] + native_columns: list[_NativeTraceColumn] + private_columns: list[_PrivateFacadeTraceColumn] + unsupported_columns: list[ColumnConfigT] + + @classmethod + def from_columns( + cls, + *, + columns: list[ColumnConfigT], + model_configs: list[ModelConfig], + collector: Any | None, + ) -> _DDMessageTracePlan: + if collector is None or not collector.dd_trace_enabled: + return cls(columns=columns, native_columns=[], private_columns=[], unsupported_columns=[]) + + model_configs_by_alias = {model_config.alias: model_config for model_config in model_configs} + native_columns: list[_NativeTraceColumn] = [] + private_columns: list[_PrivateFacadeTraceColumn] = [] + unsupported_columns: list[ColumnConfigT] = [] + configured_columns: list[ColumnConfigT] = [] + + for column 
in columns: + if isinstance(column, _TRACEABLE_LLM_COLUMN_TYPES): + configured_columns.append( + cast(ColumnConfigT, column.model_copy(update={"with_trace": cls.trace_type()})) + ) + model_config = model_configs_by_alias.get(column.model_alias) + native_columns.append( + _NativeTraceColumn( + column_name=column.name, + trace_column_name=f"{column.name}{TRACE_COLUMN_POSTFIX}", + model_alias=column.model_alias, + model_name=getattr(model_config, "model", None), + model_provider_name=getattr(model_config, "provider", None), + ) + ) + continue + + if _column_has_private_facade_model_calls(column): + configured_columns.append(_custom_column_with_trace_context(column)) + private_columns.append(_PrivateFacadeTraceColumn(column_name=column.name)) + continue + + unsupported_columns.append(column) + configured_columns.append(column) + + return cls( + columns=configured_columns, + native_columns=native_columns, + private_columns=private_columns, + unsupported_columns=unsupported_columns, + ) + + @staticmethod + def trace_type() -> TraceType: + # Preserve Anonymizer's existing dd_trace=last_message semantics: the trace + # sink records the final prompt message and response separately, while DD's + # native LAST_MESSAGE side effect only keeps the final assistant message. 
+ return TraceType.ALL_MESSAGES + + def record_coverage(self, *, workflow_name: str, collector: Any | None) -> None: + if collector is None or not collector.dd_trace_enabled: + return + + traced_column_names = [column.column_name for column in self.native_columns] + [ + column.column_name for column in self.private_columns + ] + collector.record( + "dd_trace_coverage", + workflow_name=workflow_name, + trace_backend=self.backend, + trace_mode=collector.dd_trace_mode, + native_trace_type=self.trace_type().value, + traced_column_count=len(traced_column_names), + traced_column_names=traced_column_names, + native_trace_column_count=len(self.native_columns), + native_trace_column_names=[column.column_name for column in self.native_columns], + private_trace_column_count=len(self.private_columns), + private_trace_column_names=[column.column_name for column in self.private_columns], + private_trace_backend="anonymizer_private_model_facade" if self.private_columns else None, + private_trace_note=( + "temporary private DataDesigner model registry/facade instrumentation" if self.private_columns else None + ), + unsupported_column_count=len(self.unsupported_columns), + unsupported_column_names=[column.name for column in self.unsupported_columns], + unsupported_column_types=[_column_type_name(column) for column in self.unsupported_columns], + ) + + @property + def backend(self) -> str: + if self.native_columns and self.private_columns: + return "mixed" + if self.private_columns: + return "anonymizer_private_model_facade" + return "data_designer_column" + + def record_and_strip_native_traces( + self, + *, + output_df: pd.DataFrame, + workflow_name: str, + collector: Any | None, + ) -> pd.DataFrame: + if not self.native_columns: + return output_df + + trace_column_names = [column.trace_column_name for column in self.native_columns] + if collector is not None and collector.dd_trace_enabled: + for _, row in output_df.iterrows(): + for trace_column in self.native_columns: + if 
trace_column.trace_column_name not in output_df.columns: + continue + self._record_native_trace( + trace_column=trace_column, + trace_value=row.get(trace_column.trace_column_name), + workflow_name=workflow_name, + collector=collector, + ) + + existing_trace_columns = [column_name for column_name in trace_column_names if column_name in output_df.columns] + if not existing_trace_columns: + return output_df + return output_df.drop(columns=existing_trace_columns) + + @staticmethod + def _record_native_trace( + *, + trace_column: _NativeTraceColumn, + trace_value: Any, + workflow_name: str, + collector: Any, + ) -> None: + trace_messages = _native_trace_messages(trace_value) + if not trace_messages: + return + collector.record_dd_message_trace( + workflow_name=workflow_name, + trace_source="data_designer_column", + column_name=trace_column.column_name, + trace_column_name=trace_column.trace_column_name, + model_alias=trace_column.model_alias, + model_name=trace_column.model_name, + model_provider_name=trace_column.model_provider_name, + modality="chat", + is_async=None, + status="completed", + error_type=None, + elapsed_sec=None, + messages=_select_native_trace_messages(trace_messages, mode=collector.dd_trace_mode), + response=_native_trace_response(trace_messages), + usage=None, + ) + + +class _TaskTraceLike(Protocol): + column: Any + row_group: Any + row_index: Any + task_type: Any + status: Any + error: Any + dispatched_at: Any + slot_acquired_at: Any + completed_at: Any + + +_TaskTrace = Mapping[str, Any] | _TaskTraceLike + + +class _DDTaskTraceFields(TypedDict): + workflow_name: str + trace_source: Literal["data_designer_scheduler"] + column: Any + row_group: Any + row_index: Any + task_type: Any + status: Any + error_present: bool + dispatched_offset_sec: float | None + slot_acquired_offset_sec: float | None + completed_offset_sec: float | None + queue_wait_sec: float | None + execution_sec: float | None + total_sec: float | None + + class NddAdapter: """Adapter 
for running NDD workflows with uniform I/O and record tracking.""" def __init__(self, data_designer: DataDesigner) -> None: self._data_designer = data_designer + self._run_lock = RLock() logger.debug("NDD adapter: artifact_path=%s", getattr(data_designer, "_artifact_path", "unknown")) def run_workflow( @@ -86,7 +306,29 @@ def run_workflow( logger.debug("NDD workflow '%s' starting with %d records", workflow_name, len(workflow_input_df)) col_names = [c.name for c in columns] logger.debug("NDD workflow '%s': %d columns %s", workflow_name, len(col_names), col_names) - model_aliases = [m.alias for m in model_configs] + available_model_aliases = [m.alias for m in model_configs] + model_aliases = _extract_workflow_model_aliases(columns) or available_model_aliases + record_count = ( + min(preview_num_records, len(workflow_input_df)) + if preview_num_records is not None + else len(workflow_input_df) + ) + started = time.perf_counter() + collector = current_collector() + trace_plan = _DDMessageTracePlan.from_columns( + columns=columns, + model_configs=model_configs, + collector=collector, + ) + columns = trace_plan.columns + usage_probe = _DataDesignerUsageProbe( + self._data_designer, + enabled=collector is not None, + collector=collector, + workflow_name=workflow_name, + private_trace_columns=trace_plan.private_columns, + ) + trace_plan.record_coverage(workflow_name=workflow_name, collector=collector) with tempfile.TemporaryDirectory(prefix=f"anonymizer_{workflow_name}_") as tmp_dir: seed_path = str(Path(tmp_dir) / "seed.parquet") @@ -97,33 +339,32 @@ def run_workflow( for column in columns: config_builder.add_column(column) - record_count = ( - min(preview_num_records, len(workflow_input_df)) - if preview_num_records is not None - else len(workflow_input_df) - ) + task_traces: list[_TaskTrace] = [] try: - if preview_num_records is None: - run_results = self._data_designer.create( - config_builder, - num_records=len(workflow_input_df), - dataset_name=workflow_name, - ) - 
output_df = run_results.load_dataset() - else: - preview_results = self._data_designer.preview( - config_builder, - num_records=record_count, - ) - if preview_results.dataset is None: - output_df = workflow_input_df.iloc[0:0].copy() + with self._run_lock, usage_probe, _temporary_dd_task_trace(self._data_designer, collector=collector): + if preview_num_records is None: + run_results = self._data_designer.create( + config_builder, + num_records=len(workflow_input_df), + dataset_name=workflow_name, + ) + task_traces = _task_traces_from_result(run_results) + output_df = run_results.load_dataset() else: - output_df = preview_results.dataset + preview_results = self._data_designer.preview( + config_builder, + num_records=record_count, + ) + task_traces = _task_traces_from_result(preview_results) + if preview_results.dataset is None: + output_df = workflow_input_df.iloc[0:0].copy() + else: + output_df = preview_results.dataset except Exception as exc: logger.warning( "Workflow failed for %d input record(s) on model(s) %s: %s", record_count, - model_aliases, + available_model_aliases, exc, ) logger.debug( @@ -131,8 +372,38 @@ def run_workflow( workflow_name, col_names, ) + try: + usage_probe.flush_private_trace_records() + except Exception: + logger.warning("Failed to write DataDesigner private message trace records after workflow failure") + record_ndd_workflow( + workflow_name=workflow_name, + model_aliases=model_aliases, + input_row_count=record_count, + seed_row_count=len(workflow_input_df), + output_row_count=None, + failed_record_count=None, + elapsed_sec=time.perf_counter() - started, + status="error", + preview_num_records=preview_num_records, + column_count=len(col_names), + column_names=col_names, + model_usage=usage_probe.model_usage(), + ) raise AnonymizerWorkflowError(f"Workflow failed: {exc}") from exc + output_df = trace_plan.record_and_strip_native_traces( + output_df=output_df, + workflow_name=workflow_name, + collector=collector, + ) + 
_record_dd_task_traces( + workflow_name=workflow_name, + collector=collector, + task_traces=task_traces, + ) + usage_probe.flush_private_trace_records() + logger.debug("NDD workflow '%s' returned %d records", workflow_name, len(output_df)) failed_records = self._detect_missing_records( workflow_name=workflow_name, @@ -143,6 +414,19 @@ def run_workflow( ), output_df=output_df, ) + record_ndd_workflow( + workflow_name=workflow_name, + model_aliases=model_aliases, + input_row_count=record_count, + seed_row_count=len(workflow_input_df), + output_row_count=len(output_df), + failed_record_count=len(failed_records), + elapsed_sec=time.perf_counter() - started, + preview_num_records=preview_num_records, + column_count=len(col_names), + column_names=col_names, + model_usage=usage_probe.model_usage(), + ) return WorkflowRunResult(dataframe=output_df, failed_records=failed_records) def _attach_record_ids(self, df: pd.DataFrame) -> pd.DataFrame: @@ -225,3 +509,576 @@ def _detect_missing_records( ) for record_id in missing_ids ] + + +def _extract_workflow_model_aliases(columns: list[ColumnConfigT]) -> list[str]: + aliases: list[str] = [] + for column in columns: + aliases.extend(_as_alias_list(getattr(column, "model_alias", None))) + generator = getattr(column, "generator_function", None) + metadata = getattr(generator, "custom_column_metadata", None) + if isinstance(metadata, dict): + aliases.extend(_as_alias_list(metadata.get("model_aliases"))) + return list(dict.fromkeys(alias for alias in aliases if alias)) + + +def _as_alias_list(raw: Any) -> list[str]: + if raw is None: + return [] + if isinstance(raw, str): + return [raw] + if isinstance(raw, (list, tuple, set)): + return [str(item) for item in raw if item is not None and str(item)] + return [str(raw)] + + +class _DataDesignerUsageProbe: + """Capture DataDesigner model usage from the per-run private ResourceProvider.""" + + def __init__( + self, + data_designer: DataDesigner, + *, + enabled: bool, + collector: Any | None 
= None, + workflow_name: str | None = None, + private_trace_columns: list[_PrivateFacadeTraceColumn] | None = None, + ) -> None: + self._data_designer = data_designer + self._enabled = enabled + self._collector = collector + self._workflow_name = workflow_name + self._private_trace_column_names = {column.column_name for column in private_trace_columns or []} + self._original_create_resource_provider: Any | None = None + self._resource_providers: list[Any] = [] + self._model_registry_patches: list[tuple[Any, Any]] = [] + self._facade_patches: dict[int, tuple[Any, dict[str, Any]]] = {} + self._private_trace_records: list[dict[str, Any]] = [] + + def __enter__(self) -> _DataDesignerUsageProbe: + if not self._enabled: + return self + + original = getattr(self._data_designer, "_create_resource_provider", None) + if not callable(original): + return self + + self._original_create_resource_provider = original + + def wrapper(*args: Any, **kwargs: Any) -> Any: + resource_provider = original(*args, **kwargs) + self._resource_providers.append(resource_provider) + self._install_private_model_trace(resource_provider) + return resource_provider + + setattr(self._data_designer, "_create_resource_provider", wrapper) + return self + + def __exit__(self, exc_type: object, exc: object, traceback: object) -> None: + self._restore_private_trace_patches() + if self._original_create_resource_provider is not None: + setattr(self._data_designer, "_create_resource_provider", self._original_create_resource_provider) + + def model_usage(self) -> dict[str, Any] | None: + usage: dict[str, Any] = {} + for resource_provider in self._resource_providers: + model_registry = getattr(resource_provider, "model_registry", None) + snapshot = _get_model_usage_snapshot(model_registry) + if not snapshot: + continue + for model_name, stats in snapshot.items(): + usage[str(model_name)] = _model_usage_as_json(stats) + return usage or None + + def flush_private_trace_records(self) -> None: + collector = 
self._collector + if collector is None: + self._private_trace_records.clear() + return + while self._private_trace_records: + collector.record_dd_message_trace(**self._private_trace_records.pop(0)) + + def _private_trace_enabled(self) -> bool: + return bool( + self._collector is not None + and self._collector.dd_trace_enabled + and self._workflow_name + and self._private_trace_column_names + ) + + def _install_private_model_trace(self, resource_provider: Any) -> None: + if not self._private_trace_enabled(): + return + model_registry = getattr(resource_provider, "model_registry", None) + get_model = getattr(model_registry, "get_model", None) + if not callable(get_model): + return + + def wrapped_get_model(*args: Any, **kwargs: Any) -> Any: + facade = get_model(*args, **kwargs) + self._patch_model_facade(facade) + return facade + + # Temporary private DataDesigner shim: CustomColumnConfig receives + # ModelFacade objects directly and DD does not yet expose a public + # model-call event sink for those calls. 
+ setattr(model_registry, "get_model", wrapped_get_model) + self._model_registry_patches.append((model_registry, get_model)) + + def _patch_model_facade(self, facade: Any) -> None: + facade_id = id(facade) + if facade_id in self._facade_patches: + return + + originals: dict[str, Any] = {} + for method_name in ("completion", "acompletion", "generate", "agenerate"): + method = getattr(facade, method_name, None) + if not callable(method): + continue + originals[method_name] = method + setattr(facade, method_name, self._wrap_facade_method(facade, method_name, method)) + + if originals: + self._facade_patches[facade_id] = (facade, originals) + + def _wrap_facade_method(self, facade: Any, method_name: str, method: Any) -> Any: + if method_name == "acompletion": + return self._wrap_async_completion(facade, method) + if method_name == "completion": + return self._wrap_completion(facade, method) + if method_name == "agenerate": + return self._wrap_async_generate(method) + return self._wrap_generate(method) + + def _wrap_generate(self, method: Any) -> Any: + def wrapper(*args: Any, **kwargs: Any) -> Any: + token = _MODEL_TRACE_PURPOSE.set(_purpose_from_kwargs(kwargs)) + try: + return method(*args, **kwargs) + finally: + _MODEL_TRACE_PURPOSE.reset(token) + + return wrapper + + def _wrap_async_generate(self, method: Any) -> Any: + async def wrapper(*args: Any, **kwargs: Any) -> Any: + token = _MODEL_TRACE_PURPOSE.set(_purpose_from_kwargs(kwargs)) + try: + return await method(*args, **kwargs) + finally: + _MODEL_TRACE_PURPOSE.reset(token) + + return wrapper + + def _wrap_completion(self, facade: Any, method: Any) -> Any: + def wrapper(*args: Any, **kwargs: Any) -> Any: + started = time.perf_counter() + error: Exception | None = None + response: Any = None + try: + response = method(*args, **kwargs) + return response + except Exception as exc: + error = exc + raise + finally: + self._record_private_completion_trace(facade, args, kwargs, started, response, error, is_async=False) 
+ + return wrapper + + def _wrap_async_completion(self, facade: Any, method: Any) -> Any: + async def wrapper(*args: Any, **kwargs: Any) -> Any: + started = time.perf_counter() + error: Exception | None = None + response: Any = None + try: + response = await method(*args, **kwargs) + return response + except Exception as exc: + error = exc + raise + finally: + self._record_private_completion_trace(facade, args, kwargs, started, response, error, is_async=True) + + return wrapper + + def _record_private_completion_trace( + self, + facade: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + started: float, + response: Any, + error: Exception | None, + *, + is_async: bool, + ) -> None: + if not self._private_trace_enabled(): + return + column_name = _private_trace_column_name( + column_names=self._private_trace_column_names, + purpose=_purpose_from_kwargs(kwargs) or _MODEL_TRACE_PURPOSE.get(), + ) + if column_name is None: + return + collector = self._collector + if collector is None: + return + self._private_trace_records.append( + _private_completion_trace_fields( + workflow_name=self._workflow_name, + column_name=column_name, + facade=facade, + args=args, + kwargs=kwargs, + response=response, + error=error, + elapsed_sec=time.perf_counter() - started, + is_async=is_async, + trace_mode=collector.dd_trace_mode, + ) + ) + + def _restore_private_trace_patches(self) -> None: + for facade, originals in reversed(list(self._facade_patches.values())): + for method_name, original in originals.items(): + setattr(facade, method_name, original) + self._facade_patches.clear() + + for model_registry, get_model in reversed(self._model_registry_patches): + setattr(model_registry, "get_model", get_model) + self._model_registry_patches.clear() + + +def _get_model_usage_snapshot(model_registry: object) -> Mapping[str, object] | None: + alias_snapshot = _get_model_usage_snapshot_by_alias(model_registry) + if alias_snapshot: + return alias_snapshot + + get_snapshot = 
getattr(model_registry, "get_model_usage_snapshot", None) + if not callable(get_snapshot): + return None + snapshot = get_snapshot() + if isinstance(snapshot, Mapping): + return snapshot + return None + + +def _get_model_usage_snapshot_by_alias(model_registry: object) -> Mapping[str, object] | None: + models = getattr(model_registry, "_models", None) + if not isinstance(models, Mapping): + return None + + snapshot: dict[str, object] = {} + for model_alias, model_facade in models.items(): + stats = getattr(model_facade, "usage_stats", None) + if stats is None or not getattr(stats, "has_usage", False): + continue + payload = _model_usage_as_json(stats) + if isinstance(payload, Mapping): + payload = { + **payload, + "model_alias": getattr(model_facade, "model_alias", str(model_alias)), + "model_name": getattr(model_facade, "model_name", None), + "model_provider_name": getattr(model_facade, "model_provider_name", None), + } + snapshot[str(model_alias)] = payload + return snapshot or None + + +def _model_usage_as_json(stats: object) -> Any: + model_dump = getattr(stats, "model_dump", None) + if callable(model_dump): + return model_dump(mode="json") + return stats + + +def _purpose_from_kwargs(kwargs: Mapping[str, Any]) -> str | None: + purpose = kwargs.get("purpose") + return purpose if isinstance(purpose, str) and purpose else None + + +def _private_trace_column_name(*, column_names: set[str], purpose: str | None) -> str | None: + context_column = _MODEL_TRACE_COLUMN.get() + if context_column in column_names: + return context_column + + task_column = _runtime_correlation_task_column() + if task_column in column_names: + return task_column + + purpose_column = _column_name_from_purpose(purpose) + if purpose_column in column_names: + return purpose_column + + if len(column_names) == 1: + return next(iter(column_names)) + return None + + +def _runtime_correlation_task_column() -> str | None: + try: + observability = 
importlib.import_module("data_designer.engine.observability") + except Exception: + return None + + runtime_correlation_provider = getattr(observability, "runtime_correlation_provider", None) + current = getattr(runtime_correlation_provider, "current", None) + if not callable(current): + return None + correlation = current() + task_column = getattr(correlation, "task_column", None) + return task_column if isinstance(task_column, str) and task_column else None + + +def _column_name_from_purpose(purpose: str | None) -> str | None: + if not purpose: + return None + match = re.search(r"column '([^']+)'", purpose) + if match: + return match.group(1) + return None + + +def _model_provider_endpoint(facade: Any) -> str | None: + provider = getattr(facade, "model_provider", None) + endpoint = getattr(provider, "endpoint", None) + return endpoint if isinstance(endpoint, str) and endpoint else None + + +def _private_trace_messages(*, args: tuple[Any, ...], kwargs: Mapping[str, Any]) -> list[dict[str, Any]]: + messages = args[0] if args else kwargs.get("messages") + if isinstance(messages, list): + return [_trace_message(message) for message in messages] + return [] + + +def _private_completion_trace_fields( + *, + workflow_name: str | None, + column_name: str, + facade: Any, + args: tuple[Any, ...], + kwargs: Mapping[str, Any], + response: Any, + error: Exception | None, + elapsed_sec: float, + is_async: bool, + trace_mode: str, +) -> dict[str, Any]: + return { + "workflow_name": workflow_name, + "trace_source": "anonymizer_private_model_facade", + "column_name": column_name, + "trace_column_name": None, + "model_alias": getattr(facade, "model_alias", None), + "model_name": getattr(facade, "model_name", None), + "model_provider_name": getattr(facade, "model_provider_name", None), + "model_provider_endpoint": _model_provider_endpoint(facade), + "modality": "chat", + "is_async": is_async, + "status": "error" if error is not None else "completed", + "error_type": 
type(error).__name__ if error is not None else None, + "elapsed_sec": elapsed_sec, + "messages": _select_native_trace_messages(_private_trace_messages(args=args, kwargs=kwargs), mode=trace_mode), + "response": _model_trace_response(response), + "usage": _model_trace_usage(response), + } + + +def _model_trace_response(response: Any) -> dict[str, Any] | None: + message = getattr(response, "message", None) + if message is None: + return None + return { + "content": getattr(message, "content", None), + "reasoning_content": getattr(message, "reasoning_content", None), + "tool_calls": _trace_tool_calls(getattr(message, "tool_calls", [])), + } + + +def _model_trace_usage(response: Any) -> Any: + usage = getattr(response, "usage", None) + if usage is None: + return None + model_dump = getattr(usage, "model_dump", None) + if callable(model_dump): + return model_dump(mode="json") + if isinstance(usage, Mapping): + return dict(usage) + fields = ("input_tokens", "output_tokens", "total_tokens", "reasoning_tokens") + payload = {field: getattr(usage, field) for field in fields if getattr(usage, field, None) is not None} + return payload or None + + +@contextmanager +def _temporary_dd_task_trace(data_designer: DataDesigner, *, collector: Any | None) -> Iterator[None]: + if collector is None or not collector.dd_task_trace_enabled: + yield + return + + original_run_config = getattr(data_designer, "run_config", None) + set_run_config = getattr(data_designer, "set_run_config", None) + if original_run_config is None or not callable(set_run_config): + yield + return + + traced_run_config = _run_config_with_async_trace(original_run_config) + set_run_config(traced_run_config) + try: + yield + finally: + set_run_config(original_run_config) + + +def _run_config_with_async_trace(run_config: Any) -> Any: + model_copy = getattr(run_config, "model_copy", None) + if callable(model_copy): + return model_copy(update={"async_trace": True}) + if isinstance(run_config, RunConfig): + return 
run_config.model_copy(update={"async_trace": True}) + return run_config + + +def _task_traces_from_result(result: Any) -> list[_TaskTrace]: + raw_traces = getattr(result, "task_traces", None) + if raw_traces is None: + return [] + if isinstance(raw_traces, list): + return cast(list[_TaskTrace], raw_traces) + try: + return cast(list[_TaskTrace], list(raw_traces)) + except TypeError: + return [] + + +def _custom_column_with_trace_context(column: CustomColumnConfig) -> ColumnConfigT: + generator = column.generator_function + + @wraps(generator) + def traced_generator(*args: Any, **kwargs: Any) -> Any: + token = _MODEL_TRACE_COLUMN.set(column.name) + try: + return generator(*args, **kwargs) + finally: + _MODEL_TRACE_COLUMN.reset(token) + + traced_generator.custom_column_metadata = getattr(generator, "custom_column_metadata", {}) # type: ignore[attr-defined] + return cast(ColumnConfigT, column.model_copy(update={"generator_function": traced_generator})) + + +def _column_has_private_facade_model_calls(column: ColumnConfigT) -> TypeGuard[CustomColumnConfig]: + return isinstance(column, CustomColumnConfig) and bool(_extract_workflow_model_aliases([column])) + + +def _column_type_name(column: ColumnConfigT) -> str: + column_type = getattr(column, "column_type", None) + return str(column_type) if column_type is not None else type(column).__name__ + + +def _native_trace_messages(value: Any) -> list[dict[str, Any]]: + if value is None or isinstance(value, (str, bytes, Mapping)): + return [] + try: + messages = list(value) + except TypeError: + return [] + return [_trace_message(message) for message in messages] + + +def _select_native_trace_messages(messages: list[dict[str, Any]], *, mode: str) -> list[dict[str, Any]]: + if mode == "all_messages": + return messages + last_prompt = next((message for message in reversed(messages) if message.get("role") != "assistant"), None) + return [last_prompt] if last_prompt is not None else [] + + +def _native_trace_response(messages: 
list[dict[str, Any]]) -> dict[str, Any] | None: + assistant_message = next((message for message in reversed(messages) if message.get("role") == "assistant"), None) + if assistant_message is None: + return None + return { + "content": assistant_message.get("content"), + "reasoning_content": assistant_message.get("reasoning_content"), + "tool_calls": _trace_tool_calls(assistant_message.get("tool_calls", [])), + } + + +def _trace_message(message: Any) -> dict[str, Any]: + to_dict = getattr(message, "to_dict", None) + if callable(to_dict): + return cast(dict[str, Any], to_dict()) + if isinstance(message, Mapping): + return dict(message) + return {"role": getattr(message, "role", None), "content": getattr(message, "content", None)} + + +def _trace_tool_calls(tool_calls: Any) -> list[Any]: + if isinstance(tool_calls, list): + return [getattr(tool_call, "__dict__", tool_call) for tool_call in tool_calls] + return [] + + +def _record_dd_task_traces(*, workflow_name: str, collector: Any | None, task_traces: list[_TaskTrace]) -> None: + if collector is None or not collector.dd_task_trace_enabled: + return + trace_origin = _task_trace_origin(task_traces) + for task_trace in task_traces: + collector.record_dd_task_trace(**_dd_task_trace_fields(workflow_name, task_trace, trace_origin)) + + +def _dd_task_trace_fields( + workflow_name: str, + task_trace: _TaskTrace, + trace_origin: float | None, +) -> _DDTaskTraceFields: + dispatched_at = _trace_attr(task_trace, "dispatched_at") + slot_acquired_at = _trace_attr(task_trace, "slot_acquired_at") + completed_at = _trace_attr(task_trace, "completed_at") + return { + "workflow_name": workflow_name, + "trace_source": "data_designer_scheduler", + "column": _trace_attr(task_trace, "column"), + "row_group": _trace_attr(task_trace, "row_group"), + "row_index": _trace_attr(task_trace, "row_index"), + "task_type": _trace_attr(task_trace, "task_type"), + "status": _trace_attr(task_trace, "status"), + "error_present": 
bool(_trace_attr(task_trace, "error")), + "dispatched_offset_sec": _trace_offset(trace_origin, dispatched_at), + "slot_acquired_offset_sec": _trace_offset(trace_origin, slot_acquired_at), + "completed_offset_sec": _trace_offset(trace_origin, completed_at), + "queue_wait_sec": _trace_duration(dispatched_at, slot_acquired_at), + "execution_sec": _trace_duration(slot_acquired_at, completed_at), + "total_sec": _trace_duration(dispatched_at, completed_at), + } + + +def _task_trace_origin(task_traces: list[_TaskTrace]) -> float | None: + dispatch_times: list[float] = [] + for task_trace in task_traces: + dispatched_at = _trace_attr(task_trace, "dispatched_at") + if isinstance(dispatched_at, (int, float)) and dispatched_at > 0: + dispatch_times.append(float(dispatched_at)) + return min(dispatch_times) if dispatch_times else None + + +def _trace_attr(task_trace: _TaskTrace, name: str) -> Any: + if isinstance(task_trace, Mapping): + return cast(Mapping[str, Any], task_trace).get(name) + return getattr(task_trace, name, None) + + +def _trace_offset(origin: float | None, timestamp: Any) -> float | None: + if origin is None or not isinstance(timestamp, (int, float)): + return None + if timestamp <= 0 or timestamp < origin: + return None + return float(timestamp - origin) + + +def _trace_duration(start: Any, end: Any) -> float | None: + if not isinstance(start, (int, float)) or not isinstance(end, (int, float)): + return None + if start <= 0 or end <= 0 or end < start: + return None + return float(end - start) diff --git a/src/anonymizer/engine/replace/llm_replace_workflow.py b/src/anonymizer/engine/replace/llm_replace_workflow.py index ccd5cb1d..531b6827 100644 --- a/src/anonymizer/engine/replace/llm_replace_workflow.py +++ b/src/anonymizer/engine/replace/llm_replace_workflow.py @@ -19,6 +19,7 @@ COL_ENTITIES_FOR_REPLACE_JSON, COL_ENTITY_EXAMPLES, COL_REPLACEMENT_MAP, + COL_REPLACEMENT_MAP_SOURCE, ENTITY_LABEL_EXAMPLES, ) from anonymizer.engine.ndd.adapter import FailedRecord, 
NddAdapter @@ -28,6 +29,7 @@ from anonymizer.engine.schemas import EntitiesByValueSchema, EntityReplacementMapSchema logger = logging.getLogger("anonymizer.replace.llm_workflow") +REPLACEMENT_MAP_SOURCE_LLM = "llm" # Workflow-internal scratch columns used only to build the replacement-generator # prompt. Created in `generate_map_only` and dropped before returning — nothing @@ -71,6 +73,7 @@ def generate_map_only( # Partition: rows with an empty entity list bypass replacement-map generation. entity_rows, passthrough_rows = split_rows(working_df, column=COL_ENTITIES_FOR_REPLACE, predicate=bool) passthrough_rows[COL_REPLACEMENT_MAP] = [{"replacements": []} for _ in range(len(passthrough_rows))] + passthrough_rows[COL_REPLACEMENT_MAP_SOURCE] = REPLACEMENT_MAP_SOURCE_LLM if entity_rows.empty: passthrough_only = merge_and_reorder(passthrough_rows) @@ -110,6 +113,7 @@ def generate_map_only( ), axis=1, ) + output_df[COL_REPLACEMENT_MAP_SOURCE] = REPLACEMENT_MAP_SOURCE_LLM combined = merge_and_reorder(output_df, passthrough_rows) return LlmReplaceResult( @@ -160,28 +164,56 @@ def _filter_replacement_map_to_input_entities( for label in entity.labels if entity.value and label } + protected_original_values = {value for value, _ in allowed_pairs} filtered: list[dict[str, str]] = [] seen: set[tuple[str, str]] = set() + synthetic_collision_labels: Counter[str] = Counter() for replacement in parsed_map.replacements: key = (replacement.original, replacement.label) if key not in allowed_pairs or key in seen: continue + if replacement.synthetic in protected_original_values: + synthetic_collision_labels[replacement.label] += 1 + seen.add(key) + filtered.append( + { + "original": replacement.original, + "label": replacement.label, + "synthetic": _collision_safe_synthetic( + replacement.label, + index=synthetic_collision_labels[replacement.label], + protected_original_values=protected_original_values, + ), + } + ) + continue seen.add(key) filtered.append(replacement.model_dump()) + if 
synthetic_collision_labels: + logger.warning( + "Replacement map repaired synthetic-original collision entries for record %s; repaired=%d " + "(repaired_by_label=%s)", + record_id or "", + sum(synthetic_collision_labels.values()), + dict(synthetic_collision_labels), + ) if logger.isEnabledFor(logging.DEBUG): raw_pairs = {(r.original, r.label) for r in parsed_map.replacements} filtered_pairs = {(f["original"], f["label"]) for f in filtered} unrequested_labels = Counter(label for _, label in (raw_pairs - allowed_pairs)) unfilled_labels = Counter(label for _, label in (allowed_pairs - filtered_pairs)) logger.debug( - "Replacement map record %s: requested=%d raw=%d filtered=%d%s%s", + "Replacement map record %s: requested=%d raw=%d filtered=%d%s%s%s", record_id or "", len(allowed_pairs), len(parsed_map.replacements), len(filtered), f" unrequested_by_label={dict(unrequested_labels)}" if unrequested_labels else "", f" unfilled_by_label={dict(unfilled_labels)}" if unfilled_labels else "", + f" synthetic_original_collision_by_label={dict(synthetic_collision_labels)}" + if synthetic_collision_labels + else "", ) if not filtered and allowed_pairs: requested_labels = Counter(label for _, label in allowed_pairs) @@ -195,6 +227,15 @@ def _filter_replacement_map_to_input_entities( return {"replacements": filtered} +def _collision_safe_synthetic(label: str, *, index: int, protected_original_values: set[str]) -> str: + label_token = "".join(char.upper() if char.isalnum() else "_" for char in label).strip("_") or "VALUE" + while True: + candidate = f"[SUBSTITUTE_{label_token}_{index}]" + if candidate not in protected_original_values: + return candidate + index += 1 + + def _get_replacement_mapping_prompt(*, entities_column: str, instructions: str | None = None) -> str: instruction_block = f"\nAdditional instructions: {instructions}\n" if instructions else "" prompt = """Generate synthetic replacements for sensitive entities. ONE value per entity, used consistently. 
diff --git a/src/anonymizer/engine/replace/replace_runner.py b/src/anonymizer/engine/replace/replace_runner.py index d6501834..f3a95adc 100644 --- a/src/anonymizer/engine/replace/replace_runner.py +++ b/src/anonymizer/engine/replace/replace_runner.py @@ -28,6 +28,7 @@ from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, NddAdapter from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceWorkflow from anonymizer.engine.replace.strategies import apply_local_replace_strategy, apply_replacement_map +from anonymizer.measurement import stage_timer logger = logging.getLogger("anonymizer.replace") @@ -73,27 +74,38 @@ def run( Evaluation is a separate concern — call ``evaluate()`` on the resulting dataframe when you want the LLM alignment scores. """ - logger.debug("replacement strategy: %s on %d records", type(replace_method).__name__, len(dataframe)) - - if isinstance(replace_method, (Annotate, Redact, Hash)): - local_df = apply_local_replace_strategy(dataframe, strategy=replace_method) - failed_records: list[FailedRecord] = [] - elif isinstance(replace_method, Substitute): - if self._llm_workflow is None: - raise ValueError("Substitute requires an llm_workflow, but none was provided.") - map_result = self._llm_workflow.generate_map_only( - dataframe, - model_configs=model_configs, - selected_models=selected_models, - instructions=replace_method.instructions, - preview_num_records=preview_num_records, - ) - local_df = apply_replacement_map(map_result.dataframe) - failed_records = list(map_result.failed_records) - else: - raise ValueError(f"Unsupported replace method: {type(replace_method).__name__}") + strategy = type(replace_method).__name__ + with stage_timer( + "ReplacementWorkflow.run", + strategy=strategy, + input_row_count=len(dataframe), + ) as measurement: + logger.debug("replacement strategy: %s on %d records", strategy, len(dataframe)) + + if isinstance(replace_method, (Annotate, Redact, Hash)): + local_df = 
apply_local_replace_strategy(dataframe, strategy=replace_method) + failed_records: list[FailedRecord] = [] + elif isinstance(replace_method, Substitute): + if self._llm_workflow is None: + raise ValueError("Substitute requires an llm_workflow, but none was provided.") + map_result = self._llm_workflow.generate_map_only( + dataframe, + model_configs=model_configs, + selected_models=selected_models, + instructions=replace_method.instructions, + preview_num_records=preview_num_records, + ) + local_df = apply_replacement_map(map_result.dataframe) + failed_records = list(map_result.failed_records) + else: + raise ValueError(f"Unsupported replace method: {type(replace_method).__name__}") - return ReplacementResult(dataframe=local_df, failed_records=failed_records) + result = ReplacementResult(dataframe=local_df, failed_records=failed_records) + measurement.update( + output_row_count=len(result.dataframe), + failed_record_count=len(result.failed_records), + ) + return result def evaluate( self, diff --git a/src/anonymizer/engine/rewrite/rewrite_workflow.py b/src/anonymizer/engine/rewrite/rewrite_workflow.py index 88c2b9c3..88fe07f0 100644 --- a/src/anonymizer/engine/rewrite/rewrite_workflow.py +++ b/src/anonymizer/engine/rewrite/rewrite_workflow.py @@ -37,6 +37,7 @@ from anonymizer.engine.rewrite.sensitivity_disposition import SensitivityDispositionWorkflow from anonymizer.engine.rewrite.workflow_utils import derive_seed_columns, select_seed_cols from anonymizer.engine.row_partitioning import merge_and_reorder, split_rows +from anonymizer.measurement import stage_timer logger = logging.getLogger("anonymizer.rewrite.workflow") @@ -196,80 +197,95 @@ def run( preview_num_records: int | None = None, strict_entity_protection: bool = False, ) -> RewriteResult: - all_failed: list[FailedRecord] = [] - - entity_rows, passthrough_rows = split_rows(dataframe, column=COL_ENTITIES_BY_VALUE, predicate=_has_entities) + with stage_timer("RewriteWorkflow.run", 
input_row_count=len(dataframe)) as measurement: + all_failed: list[FailedRecord] = [] - # Fast path: no entities anywhere - if entity_rows.empty: - _apply_passthrough_defaults(passthrough_rows) - result_df = merge_and_reorder(passthrough_rows) - return RewriteResult(dataframe=result_df, failed_records=all_failed) + entity_rows, passthrough_rows = split_rows(dataframe, column=COL_ENTITIES_BY_VALUE, predicate=_has_entities) + measurement.update( + entity_row_count=len(entity_rows), + passthrough_row_count=len(passthrough_rows), + ) - # --- Step 1: replacement map (needs only detection output) --- - replace_workflow = LlmReplaceWorkflow(adapter=self._adapter) - replace_result = replace_workflow.generate_map_only( - entity_rows, - model_configs=model_configs, - selected_models=replace_model_selection, - ) - entity_rows = _join_new_columns(entity_rows, replace_result.dataframe) - all_failed.extend(replace_result.failed_records) + # Fast path: no entities anywhere + if entity_rows.empty: + _apply_passthrough_defaults(passthrough_rows) + result_df = merge_and_reorder(passthrough_rows) + result = RewriteResult(dataframe=result_df, failed_records=all_failed) + measurement.update( + output_row_count=len(result.dataframe), + failed_record_count=len(result.failed_records), + ) + return result + + # --- Step 1: replacement map (needs only detection output) --- + replace_workflow = LlmReplaceWorkflow(adapter=self._adapter) + replace_result = replace_workflow.generate_map_only( + entity_rows, + model_configs=model_configs, + selected_models=replace_model_selection, + ) + entity_rows = _join_new_columns(entity_rows, replace_result.dataframe) + all_failed.extend(replace_result.failed_records) + + # --- Step 2: domain, disposition, QA, rewrite (single adapter call) --- + pipeline_columns = [ + *self._domain_wf.columns(selected_models=selected_models, data_summary=data_summary), + *self._disposition_wf.columns( + selected_models=selected_models, + privacy_goal=privacy_goal, + 
data_summary=data_summary, + strict_entity_protection=strict_entity_protection, + ), + *self._qa_wf.columns(selected_models=selected_models), + *self._rewrite_gen_wf.columns( + selected_models=selected_models, + privacy_goal=privacy_goal, + data_summary=data_summary, + ), + ] + + pipeline_seed = select_seed_cols(entity_rows, derive_seed_columns(pipeline_columns, entity_rows)) + pipeline_result = self._adapter.run_workflow( + pipeline_seed, + model_configs=model_configs, + columns=pipeline_columns, + workflow_name="rewrite-pipeline", + preview_num_records=preview_num_records, + ) + entity_rows = _join_new_columns(entity_rows, pipeline_result.dataframe) + all_failed.extend(pipeline_result.failed_records) - # --- Step 2: domain, disposition, QA, rewrite (single adapter call) --- - pipeline_columns = [ - *self._domain_wf.columns(selected_models=selected_models, data_summary=data_summary), - *self._disposition_wf.columns( + # --- Step 5: evaluate-repair loop --- + entity_rows, eval_repair_failed = self._run_evaluate_repair_loop( + entity_rows, + model_configs=model_configs, selected_models=selected_models, privacy_goal=privacy_goal, - data_summary=data_summary, - strict_entity_protection=strict_entity_protection, - ), - *self._qa_wf.columns(selected_models=selected_models), - *self._rewrite_gen_wf.columns( + evaluation=evaluation, + preview_num_records=preview_num_records, + ) + all_failed.extend(eval_repair_failed) + + # --- Step 6: final judge (non-critical) --- + entity_rows, judge_failed = self._run_final_judge( + entity_rows, + model_configs=model_configs, selected_models=selected_models, privacy_goal=privacy_goal, - data_summary=data_summary, - ), - ] - - pipeline_seed = select_seed_cols(entity_rows, derive_seed_columns(pipeline_columns, entity_rows)) - pipeline_result = self._adapter.run_workflow( - pipeline_seed, - model_configs=model_configs, - columns=pipeline_columns, - workflow_name="rewrite-pipeline", - preview_num_records=preview_num_records, - ) - 
entity_rows = _join_new_columns(entity_rows, pipeline_result.dataframe) - all_failed.extend(pipeline_result.failed_records) - - # --- Step 5: evaluate-repair loop --- - entity_rows, eval_repair_failed = self._run_evaluate_repair_loop( - entity_rows, - model_configs=model_configs, - selected_models=selected_models, - privacy_goal=privacy_goal, - evaluation=evaluation, - preview_num_records=preview_num_records, - ) - all_failed.extend(eval_repair_failed) - - # --- Step 6: final judge (non-critical) --- - entity_rows, judge_failed = self._run_final_judge( - entity_rows, - model_configs=model_configs, - selected_models=selected_models, - privacy_goal=privacy_goal, - evaluation=evaluation, - preview_num_records=preview_num_records, - ) - all_failed.extend(judge_failed) + evaluation=evaluation, + preview_num_records=preview_num_records, + ) + all_failed.extend(judge_failed) - # --- Merge and return --- - _apply_passthrough_defaults(passthrough_rows) - combined = merge_and_reorder(entity_rows, passthrough_rows) - return RewriteResult(dataframe=combined, failed_records=all_failed) + # --- Merge and return --- + _apply_passthrough_defaults(passthrough_rows) + combined = merge_and_reorder(entity_rows, passthrough_rows) + result = RewriteResult(dataframe=combined, failed_records=all_failed) + measurement.update( + output_row_count=len(result.dataframe), + failed_record_count=len(result.failed_records), + ) + return result # --------------------------------------------------------------------------- # Evaluate-repair loop diff --git a/src/anonymizer/interface/anonymizer.py b/src/anonymizer/interface/anonymizer.py index ec08164a..b19762d3 100644 --- a/src/anonymizer/interface/anonymizer.py +++ b/src/anonymizer/interface/anonymizer.py @@ -59,6 +59,11 @@ from anonymizer.interface.errors import InvalidConfigError from anonymizer.interface.results import AnonymizerResult, PreviewResult from anonymizer.logging import LOG_INDENT, configure_logging, reapply_log_levels +from 
anonymizer.measurement import ( + record_record_metrics, + record_run_metadata, + stage_timer, +) from anonymizer.telemetry import ( NOT_APPLICABLE, AnonymizerEvent, @@ -331,6 +336,45 @@ def _run_internal( data: AnonymizerInput, context: ResolvedInput, preview_num_records: int | None, + ) -> AnonymizerResult: + input_df = context.dataframe + mode = "replace" if config.replace is not None else "rewrite" + strategy = type(config.replace).__name__ if config.replace is not None else "Rewrite" + with stage_timer( + "Anonymizer._run_internal", + mode=mode, + strategy=strategy, + input_row_count=len(input_df), + preview_num_records=preview_num_records, + ) as measurement: + record_run_metadata( + config=config, + data=data, + mode=mode, + strategy=strategy, + input_row_count=len(input_df), + preview_num_records=preview_num_records, + model_configs=self._model_configs, + ) + result = self._run_internal_impl( + config=config, + data=data, + context=context, + preview_num_records=preview_num_records, + ) + measurement.update( + output_row_count=len(result.trace_dataframe), + failed_record_count=len(result.failed_records), + ) + return result + + def _run_internal_impl( + self, + *, + config: AnonymizerConfig, + data: AnonymizerInput, + context: ResolvedInput, + preview_num_records: int | None, ) -> AnonymizerResult: input_df = context.dataframe num_records = len(input_df) @@ -455,6 +499,13 @@ def _run_internal( text_col = context.resolved_text_column renamed_trace = _rename_output_columns(final_df, resolved_text_column=text_col) logger.info("🎉 Pipeline complete — %d records processed, %d total failures", num_records, len(all_failures)) + record_record_metrics( + final_df, + mode="replace" if config.replace is not None else "rewrite", + strategy=type(config.replace).__name__ if config.replace is not None else "Rewrite", + text_column=COL_TEXT, + validation_max_entities_per_call=config.detect.validation_max_entities_per_call, + ) return AnonymizerResult( 
dataframe=_build_user_dataframe(renamed_trace, resolved_text_column=text_col), trace_dataframe=renamed_trace, diff --git a/src/anonymizer/measurement/__init__.py b/src/anonymizer/measurement/__init__.py new file mode 100644 index 00000000..aee9499a --- /dev/null +++ b/src/anonymizer/measurement/__init__.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from anonymizer.measurement.collector import MeasurementCollector +from anonymizer.measurement.config import MeasurementConfig +from anonymizer.measurement.constants import ( + DD_TRACE_MODES, + DEFAULT_MEASUREMENT_ENV_PREFIX, + MEASUREMENT_SCHEMA_VERSION, + DDTraceMode, +) +from anonymizer.measurement.metrics.llm_calls import estimate_llm_calls_by_stage +from anonymizer.measurement.recorders import ( + record_model_workflow, + record_ndd_workflow, + record_run_metadata, + record_stage, + stage_timer, +) +from anonymizer.measurement.records.row import record_evaluation_metrics, record_record_metrics +from anonymizer.measurement.session import configured_measurement_session, current_collector, measurement_session + +__all__ = [ + "DD_TRACE_MODES", + "DDTraceMode", + "DEFAULT_MEASUREMENT_ENV_PREFIX", + "MEASUREMENT_SCHEMA_VERSION", + "MeasurementCollector", + "MeasurementConfig", + "configured_measurement_session", + "current_collector", + "estimate_llm_calls_by_stage", + "measurement_session", + "record_model_workflow", + "record_ndd_workflow", + "record_evaluation_metrics", + "record_record_metrics", + "record_run_metadata", + "record_stage", + "stage_timer", +] diff --git a/src/anonymizer/measurement/_coerce.py b/src/anonymizer/measurement/_coerce.py new file mode 100644 index 00000000..05d2cfcb --- /dev/null +++ b/src/anonymizer/measurement/_coerce.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import math +from collections.abc import Mapping +from numbers import Integral +from typing import Any, cast + + +def _safe_row_index(row_index: object) -> int | None: + if isinstance(row_index, bool): + return None + if isinstance(row_index, Integral): + return int(row_index) + return None + + +def _count_items(raw: object, *, primary_key: str, fallback_keys: tuple[str, ...] = ()) -> int: + payload = _coerce_payload(raw) + if isinstance(payload, Mapping): + payload_map = cast(Mapping[str, Any], payload) + for key in (primary_key, *fallback_keys): + items = payload_map.get(key) + if isinstance(items, list): + return len(items) + return 0 + if isinstance(payload, list): + return len(payload) + return 0 + + +def _coerce_payload(raw: object) -> object: + model_dump = getattr(raw, "model_dump", None) + if callable(model_dump): + return model_dump(mode="python") + if isinstance(raw, str): + try: + return json.loads(raw) + except json.JSONDecodeError: + return {} + if raw is None: + return {} + return raw + + +def _coerce_int(raw: object, *, default: int) -> int: + try: + return int(cast(Any, raw)) + except (TypeError, ValueError): + return default + + +def _coerce_float(raw: object) -> float | None: + try: + value = float(cast(Any, raw)) + except (TypeError, ValueError): + return None + return None if math.isnan(value) else value + + +def _coerce_bool(raw: object) -> bool | None: + if raw is None: + return None + if isinstance(raw, float) and math.isnan(raw): + return None + if isinstance(raw, bool): + return raw + if isinstance(raw, str): + lowered = raw.strip().lower() + if lowered in {"true", "1", "yes"}: + return True + if lowered in {"false", "0", "no"}: + return False + return None + try: + return bool(cast(Any, raw)) + except (TypeError, ValueError): + return None + + +def _safe_rate(numerator: int | float | None, elapsed_sec: float) -> float | None: 
+ if numerator is None or elapsed_sec <= 0: + return None + return float(numerator) / elapsed_sec + + +def _safe_ratio(numerator: int | float | None, denominator: int | float | None) -> float | None: + if numerator is None or denominator is None or denominator == 0: + return None + return float(numerator) / float(denominator) + + +def _f1(precision: float | None, recall: float | None) -> float | None: + if precision is None or recall is None or precision + recall == 0: + return None + return 2 * precision * recall / (precision + recall) + + +def _size_bucket(value: int) -> str: + if value == 0: + return "0" + for upper in (128, 512, 2048, 8192): + if value < upper: + return f"1-{upper - 1}" if upper == 128 else f"{upper // 4}-{upper - 1}" + return "8192+" + + +def _count_text_tokens(text: str) -> int: + try: + import tiktoken + + tokenizer = tiktoken.get_encoding("cl100k_base") + return len(tokenizer.encode(text, disallowed_special=())) + except Exception: + return len(text.split()) + + +def _json_safe(value: object) -> Any: + if isinstance(value, dict): + return {str(k): _json_safe(v) for k, v in value.items()} + if isinstance(value, list): + return [_json_safe(v) for v in value] + if isinstance(value, tuple): + return [_json_safe(v) for v in value] + if isinstance(value, set): + return sorted((_json_safe(v) for v in value), key=str) + if isinstance(value, float) and not math.isfinite(value): + return None + if isinstance(value, (str, int, float, bool)) or value is None: + return value + return str(value) diff --git a/src/anonymizer/measurement/collector.py b/src/anonymizer/measurement/collector.py new file mode 100644 index 00000000..704b6dfa --- /dev/null +++ b/src/anonymizer/measurement/collector.py @@ -0,0 +1,200 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import hashlib +import hmac +import json +import logging +import secrets +import time +import uuid +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, cast + +from anonymizer.measurement._coerce import _json_safe +from anonymizer.measurement.constants import MEASUREMENT_SCHEMA_VERSION, DDTraceMode +from anonymizer.measurement.sinks import _JsonlMeasurementWriter, _JsonMeasurementWriter, _MeasurementSink + +if TYPE_CHECKING: + import pandas as pd + +logger = logging.getLogger("anonymizer.measurement") + + +class MeasurementCollector: + """In-memory collector for local benchmark and throughput records. + + Records contain counts, labels, lengths, aliases, timings, and run-scoped + HMACs. They must not contain raw text, entity values, prompts, generated + outputs, replacement maps, provider secrets, or API keys. + """ + + def __init__( + self, + *, + run_id: str | None = None, + record_hash_key: bytes | str | None = None, + record_level: bool = True, + run_tags: Mapping[str, Any] | None = None, + record_sink: _MeasurementSink | None = None, + keep_records: bool = True, + dd_trace_mode: DDTraceMode = "none", + dd_trace_sink: _MeasurementSink | None = None, + dd_task_trace_sink: _MeasurementSink | None = None, + fail_on_write_error: bool = False, + ) -> None: + self.run_id = run_id or uuid.uuid4().hex + self.record_level = record_level + self.run_tags = cast(dict[str, Any], _json_safe(dict(run_tags or {}))) + self._record_sink = record_sink + self._keep_records = keep_records + self._dd_trace_mode = dd_trace_mode + self._dd_trace_sink = dd_trace_sink + self._dd_task_trace_sink = dd_task_trace_sink + self._fail_on_write_error = fail_on_write_error + self._sink_failed = False + self._dd_trace_failed = False + self._dd_task_trace_failed = False + if record_hash_key is None: + self._record_hash_key = secrets.token_bytes(32) + elif 
isinstance(record_hash_key, str): + self._record_hash_key = record_hash_key.encode("utf-8") + else: + self._record_hash_key = bytes(record_hash_key) + self._records: list[dict[str, Any]] = [] + + @property + def records(self) -> list[dict[str, Any]]: + """Return a shallow copy of collected measurement records.""" + return list(self._records) + + def record(self, record_type: str, **fields: Any) -> None: + """Append one machine-readable measurement record.""" + record = { + **fields, + "schema_version": MEASUREMENT_SCHEMA_VERSION, + "record_type": record_type, + "run_id": self.run_id, + "run_tags": self.run_tags, + "timestamp_unix_sec": time.time(), + } + safe_record = _json_safe(record) + if self._keep_records: + self._records.append(safe_record) + if self._record_sink is not None: + self._write_record_to_sink(safe_record) + + def close(self) -> None: + """Close any streaming measurement sink attached to this collector.""" + close_error: Exception | None = None + for sink in (self._record_sink, self._dd_trace_sink, self._dd_task_trace_sink): + if sink is None: + continue + try: + sink.close() + except Exception as exc: + if close_error is None: + close_error = exc + if close_error is not None: + raise close_error + + @property + def dd_trace_mode(self) -> DDTraceMode: + return self._dd_trace_mode + + @property + def dd_trace_enabled(self) -> bool: + return self._dd_trace_mode != "none" and self._dd_trace_sink is not None + + @property + def dd_task_trace_enabled(self) -> bool: + return self._dd_task_trace_sink is not None + + def record_dd_message_trace(self, **fields: Any) -> None: + """Write an explicitly opt-in DataDesigner message trace record. + + These records may contain raw prompts, input text, model outputs, and + PII. They are intentionally written to a separate trace sink and are + never appended to the safe measurement record list. 
+ """ + if not self.dd_trace_enabled or self._dd_trace_failed: + return + + record = _json_safe( + { + **fields, + "schema_version": MEASUREMENT_SCHEMA_VERSION, + "record_type": "dd_message_trace", + "run_id": self.run_id, + "run_tags": self.run_tags, + "timestamp_unix_sec": time.time(), + } + ) + try: + cast(_MeasurementSink, self._dd_trace_sink).write_record(record) + except Exception: + self._dd_trace_failed = True + logger.warning("Failed to write DataDesigner message trace records") + if self._fail_on_write_error: + raise + + def record_dd_task_trace(self, **fields: Any) -> None: + """Write an opt-in sanitized DataDesigner scheduler task trace record.""" + if not self.dd_task_trace_enabled or self._dd_task_trace_failed: + return + + record = _json_safe( + { + **fields, + "schema_version": MEASUREMENT_SCHEMA_VERSION, + "record_type": "dd_task_trace", + "run_id": self.run_id, + "run_tags": self.run_tags, + "timestamp_unix_sec": time.time(), + } + ) + try: + cast(_MeasurementSink, self._dd_task_trace_sink).write_record(record) + except Exception: + self._dd_task_trace_failed = True + logger.warning("Failed to write DataDesigner task trace records") + if self._fail_on_write_error: + raise + + def _write_record_to_sink(self, record: dict[str, Any]) -> None: + if self._sink_failed: + return + try: + cast(_MeasurementSink, self._record_sink).write_record(record) + except Exception: + self._sink_failed = True + logger.warning("Failed to stream Anonymizer measurement records") + if self._fail_on_write_error: + raise + + def record_hash(self, *, row_index: object, text: str) -> str: + """Return a run-scoped HMAC for joining records without storing text.""" + serialized = json.dumps( + {"row_index": str(row_index), "text": text}, + default=str, + sort_keys=True, + separators=(",", ":"), + ) + return hmac.new(self._record_hash_key, serialized.encode("utf-8"), hashlib.sha256).hexdigest() + + def write_jsonl(self, path: str | Path) -> None: + """Write records as 
newline-delimited JSON.""" + _JsonlMeasurementWriter().write(self._records, path) + + def write_json(self, path: str | Path) -> None: + """Write records as a JSON array.""" + _JsonMeasurementWriter().write(self._records, path) + + def to_dataframe(self) -> pd.DataFrame: + """Return records as a pandas DataFrame for benchmark tooling.""" + import pandas as pd + + return pd.DataFrame(self._records) diff --git a/src/anonymizer/measurement/config.py b/src/anonymizer/measurement/config.py new file mode 100644 index 00000000..623f3549 --- /dev/null +++ b/src/anonymizer/measurement/config.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal, cast + +from pydantic import Field, ValidationError +from pydantic_settings import BaseSettings, SettingsConfigDict, SettingsError + +from anonymizer.measurement.collector import MeasurementCollector +from anonymizer.measurement.constants import DD_TRACE_MODES, DEFAULT_MEASUREMENT_ENV_PREFIX, DDTraceMode +from anonymizer.measurement.sinks import _writer_for_format + + +class _MeasurementEnvSettings(BaseSettings): + model_config = SettingsConfigDict( + env_prefix=DEFAULT_MEASUREMENT_ENV_PREFIX, + env_ignore_empty=True, + extra="ignore", + ) + + output_path: str | None = None + output_format: Literal["jsonl", "json"] = "jsonl" + record_level: bool = True + streaming: bool = False + keep_records: bool = True + dd_trace: DDTraceMode = "none" + dd_trace_path: str | None = None + dd_task_trace_path: str | None = None + fail_on_write_error: bool = False + run_id: str | None = None + run_tags: dict[str, Any] = Field(default_factory=dict) + + +@dataclass(frozen=True) +class MeasurementConfig: + """Configuration for writing structured measurement records around a 
run.""" + + output_path: str | Path + output_format: Literal["jsonl", "json"] = "jsonl" + record_level: bool = True + streaming: bool = False + keep_records: bool = True + dd_trace: DDTraceMode = "none" + dd_trace_path: str | Path | None = None + dd_task_trace_path: str | Path | None = None + run_id: str | None = None + record_hash_key: bytes | str | None = None + run_tags: Mapping[str, Any] | None = None + fail_on_write_error: bool = False + + def __post_init__(self) -> None: + if self.output_format not in {"jsonl", "json"}: + raise ValueError("output_format must be 'jsonl' or 'json'") + if self.streaming and self.output_format != "jsonl": + raise ValueError("streaming measurement output only supports jsonl") + if self.dd_trace not in DD_TRACE_MODES: + raise ValueError("dd_trace must be 'none', 'last_message', or 'all_messages'") + if self.dd_trace != "none" and self.dd_trace_path is None: + raise ValueError("dd_trace_path is required when dd_trace is enabled") + + @classmethod + def from_env(cls, *, prefix: str = DEFAULT_MEASUREMENT_ENV_PREFIX) -> MeasurementConfig | None: + """Build measurement config from environment variables, or None if output is unset. + + This is intentionally opt-in. Anonymizer API and CLI calls do not read + measurement environment variables unless benchmark/tooling code calls this + helper explicitly. 
+ """ + try: + settings = _load_measurement_env_settings(prefix=prefix) + except (SettingsError, ValidationError) as exc: + raise ValueError(_measurement_env_error_message(exc, prefix=prefix)) from None + + if settings.output_path is None: + return None + return cls( + output_path=settings.output_path, + output_format=settings.output_format, + record_level=settings.record_level, + streaming=settings.streaming, + keep_records=settings.keep_records, + dd_trace=settings.dd_trace, + dd_trace_path=settings.dd_trace_path, + dd_task_trace_path=settings.dd_task_trace_path, + run_id=settings.run_id, + run_tags=settings.run_tags, + fail_on_write_error=settings.fail_on_write_error, + ) + + @classmethod + def from_sources( + cls, + explicit: MeasurementConfig | None = None, + *, + env: bool = False, + prefix: str = DEFAULT_MEASUREMENT_ENV_PREFIX, + ) -> MeasurementConfig | None: + """Resolve measurement config from explicit config first, then optional env.""" + if explicit is not None: + return explicit + if env: + return cls.from_env(prefix=prefix) + return None + + def write_collector(self, collector: MeasurementCollector) -> None: + """Write a collector using this config's output format.""" + _writer_for_format(self.output_format).write(collector.records, self.output_path) + + +def _measurement_env_error_message(exc: SettingsError | ValidationError, *, prefix: str) -> str: + fields: set[str] = set() + if isinstance(exc, ValidationError): + for error in exc.errors(include_input=False): + loc = error.get("loc", ()) + if loc: + fields.add(str(loc[0]).upper()) + else: + error_text = str(exc).lower() + for field_name in _MeasurementEnvSettings.model_fields: + if field_name in error_text: + fields.add(field_name.upper()) + + if fields: + env_fields = ", ".join(f"{prefix}{field}" for field in sorted(fields)) + return f"Invalid Anonymizer measurement environment configuration for: {env_fields}" + return "Invalid Anonymizer measurement environment configuration" + + +def 
_load_measurement_env_settings(*, prefix: str) -> _MeasurementEnvSettings: + settings_factory = cast(Any, _MeasurementEnvSettings) + return cast(_MeasurementEnvSettings, settings_factory(_env_prefix=prefix)) diff --git a/src/anonymizer/measurement/constants.py b/src/anonymizer/measurement/constants.py new file mode 100644 index 00000000..0fd1ff89 --- /dev/null +++ b/src/anonymizer/measurement/constants.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import Literal + +MEASUREMENT_SCHEMA_VERSION = 1 +DEFAULT_MEASUREMENT_ENV_PREFIX = "ANONYMIZER_MEASUREMENT_" +DD_TRACE_MODES = {"none", "last_message", "all_messages"} +DDTraceMode = Literal["none", "last_message", "all_messages"] diff --git a/src/anonymizer/measurement/metrics/__init__.py b/src/anonymizer/measurement/metrics/__init__.py new file mode 100644 index 00000000..3d2894b7 --- /dev/null +++ b/src/anonymizer/measurement/metrics/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations diff --git a/src/anonymizer/measurement/metrics/entities.py b/src/anonymizer/measurement/metrics/entities.py new file mode 100644 index 00000000..da2bfe3a --- /dev/null +++ b/src/anonymizer/measurement/metrics/entities.py @@ -0,0 +1,216 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping +from typing import Any, cast + +from anonymizer.measurement._coerce import _coerce_float, _coerce_payload, _f1, _safe_ratio + +_GROUND_TRUTH_ENTITY_COLUMNS = ("ground_truth_entities", "gt_entities", "expected_entities") +_ENTITY_LABEL_EQUIVALENCE_CLASSES = ( + frozenset( + { + "access_token", + "api_key", + "auth_token", + "bearer_token", + "password", + "secret_key", + "session_id", + "unique_id", + "user_id", + } + ), + frozenset({"full_name", "person_name", "user", "user_name", "username"}), + frozenset({"phone", "phone_number", "telephone"}), + frozenset({"email", "email_address"}), + frozenset({"cookie", "http_cookie", "session_cookie"}), +) +_ENTITY_LABEL_EQUIVALENCE: dict[str, str] = { + label: sorted(labels)[0] for labels in _ENTITY_LABEL_EQUIVALENCE_CLASSES for label in labels +} + + +def _entities_from_raw(raw: object) -> list[dict[str, Any]]: + payload = _coerce_payload(raw) + if isinstance(payload, Mapping): + items = cast(Mapping[str, Any], payload).get("entities", []) + elif isinstance(payload, list): + items = payload + else: + items = [] + return [dict(cast(Mapping[str, Any], item)) for item in items if isinstance(item, Mapping)] + + +def _entity_ground_truth_metrics( + final_entities: list[dict[str, Any]], + ground_truth_entities: list[dict[str, Any]] | None, +) -> dict[str, Any]: + if ground_truth_entities is None: + return { + "ground_truth_entity_count": None, + "ground_truth_entity_label_counts": None, + "entity_true_positive_count": None, + "entity_false_positive_count": None, + "entity_false_negative_count": None, + "entity_precision": None, + "entity_recall": None, + "entity_f1": None, + "entity_relaxed_gt_found_count": None, + "entity_relaxed_detected_tp_count": None, + "entity_relaxed_label_compatible_gt_found_count": None, + "entity_relaxed_label_compatible_detected_tp_count": None, + 
"entity_relaxed_precision": None, + "entity_relaxed_recall": None, + "entity_relaxed_f1": None, + "entity_relaxed_label_compatible_precision": None, + "entity_relaxed_label_compatible_recall": None, + "entity_relaxed_label_compatible_f1": None, + } + + predicted = _entity_identity_counts(final_entities) + expected = _entity_identity_counts(ground_truth_entities) + true_positive = sum((predicted & expected).values()) + false_positive = sum((predicted - expected).values()) + false_negative = sum((expected - predicted).values()) + precision = _safe_ratio(true_positive, true_positive + false_positive) + recall = _safe_ratio(true_positive, true_positive + false_negative) + return { + "ground_truth_entity_count": len(ground_truth_entities), + "ground_truth_entity_label_counts": dict( + sorted(Counter(e.get("label", "") for e in ground_truth_entities if e.get("label")).items()) + ), + "entity_true_positive_count": true_positive, + "entity_false_positive_count": false_positive, + "entity_false_negative_count": false_negative, + "entity_precision": precision, + "entity_recall": recall, + "entity_f1": _f1(precision, recall), + **_entity_relaxed_ground_truth_metrics(final_entities, ground_truth_entities), + } + + +def _entity_identity_counts(entities: list[dict[str, Any]]) -> Counter[tuple[str, str]]: + identities: Counter[tuple[str, str]] = Counter() + for entity in entities: + label = entity.get("label") + value = entity.get("value") + if label is None or value is None: + continue + identities[(str(value), str(label))] += 1 + return identities + + +def _entity_relaxed_ground_truth_metrics( + final_entities: list[dict[str, Any]], + ground_truth_entities: list[dict[str, Any]], +) -> dict[str, Any]: + relaxed_match_count = _relaxed_entity_match_count(final_entities, ground_truth_entities) + label_compatible_match_count = _relaxed_entity_match_count( + final_entities, + ground_truth_entities, + require_label_compatible=True, + ) + gt_found = relaxed_match_count + detected_tp = 
relaxed_match_count + label_compatible_gt_found = label_compatible_match_count + label_compatible_detected_tp = label_compatible_match_count + precision = _safe_ratio(detected_tp, len(final_entities)) + recall = _safe_ratio(gt_found, len(ground_truth_entities)) + label_compatible_precision = _safe_ratio(label_compatible_detected_tp, len(final_entities)) + label_compatible_recall = _safe_ratio(label_compatible_gt_found, len(ground_truth_entities)) + return { + "entity_relaxed_gt_found_count": gt_found, + "entity_relaxed_detected_tp_count": detected_tp, + "entity_relaxed_label_compatible_gt_found_count": label_compatible_gt_found, + "entity_relaxed_label_compatible_detected_tp_count": label_compatible_detected_tp, + "entity_relaxed_precision": precision, + "entity_relaxed_recall": recall, + "entity_relaxed_f1": _f1(precision, recall), + "entity_relaxed_label_compatible_precision": label_compatible_precision, + "entity_relaxed_label_compatible_recall": label_compatible_recall, + "entity_relaxed_label_compatible_f1": _f1(label_compatible_precision, label_compatible_recall), + } + + +def _relaxed_entity_match_count( + final_entities: list[dict[str, Any]], + ground_truth_entities: list[dict[str, Any]], + *, + require_label_compatible: bool = False, +) -> int: + matches_by_ground_truth = [ + [ + final_index + for final_index, final_entity in enumerate(final_entities) + if _entities_match_relaxed( + final_entity, + ground_truth_entity, + require_label_compatible=require_label_compatible, + ) + ] + for ground_truth_entity in ground_truth_entities + ] + matched_ground_truth_by_final: dict[int, int] = {} + + def assign(ground_truth_index: int, seen: set[int]) -> bool: + for final_index in matches_by_ground_truth[ground_truth_index]: + if final_index in seen: + continue + seen.add(final_index) + if final_index not in matched_ground_truth_by_final or assign( + matched_ground_truth_by_final[final_index], + seen, + ): + matched_ground_truth_by_final[final_index] = 
ground_truth_index + return True + return False + + return sum(1 for ground_truth_index in range(len(ground_truth_entities)) if assign(ground_truth_index, set())) + + +def _entities_match_relaxed( + left: dict[str, Any], + right: dict[str, Any], + *, + require_label_compatible: bool, +) -> bool: + if require_label_compatible and not _entity_labels_compatible(left.get("label"), right.get("label")): + return False + left_span = _entity_span(left) + right_span = _entity_span(right) + if left_span is not None and right_span is not None: + return left_span[0] < right_span[1] and right_span[0] < left_span[1] + left_value = left.get("value") + right_value = right.get("value") + return left_value is not None and right_value is not None and str(left_value) == str(right_value) + + +def _entity_span(entity: dict[str, Any]) -> tuple[int, int] | None: + start = _coerce_float(entity.get("start_position", entity.get("start"))) + end = _coerce_float(entity.get("end_position", entity.get("end"))) + if start is None or end is None: + return None + start_int = int(start) + end_int = int(end) + if start_int < 0 or end_int <= start_int: + return None + return start_int, end_int + + +def _entity_labels_compatible(left: object, right: object) -> bool: + left_key = _entity_label_key(left) + right_key = _entity_label_key(right) + return left_key is not None and right_key is not None and left_key == right_key + + +def _entity_label_key(label: object) -> str | None: + if label is None: + return None + normalized = str(label).strip().lower() + if not normalized: + return None + return _ENTITY_LABEL_EQUIVALENCE.get(normalized, normalized) diff --git a/src/anonymizer/measurement/metrics/llm_calls.py b/src/anonymizer/measurement/metrics/llm_calls.py new file mode 100644 index 00000000..545c61cf --- /dev/null +++ b/src/anonymizer/measurement/metrics/llm_calls.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import math + + +def estimate_llm_calls_by_stage( + *, + mode: str, + strategy: str, + has_grouped_entities: bool, + validation_chunk_count: int | None, + repair_iterations: int = 0, + replace_map_generation_uses_llm: bool = True, +) -> dict[str, int | None]: + """Estimate nominal model calls for one record, split by workflow stage.""" + detection_calls = None if validation_chunk_count is None else 2 + validation_chunk_count + replace_map_generation = 0 + if replace_map_generation_uses_llm and has_grouped_entities and (mode == "rewrite" or strategy == "Substitute"): + replace_map_generation = 1 + + if mode != "rewrite": + return { + "entity_detection": detection_calls, + "replace_map_generation": replace_map_generation, + } + + rewrite_body_calls = has_grouped_entities + return { + "entity_detection": detection_calls, + "latent_entity_detection": 1 if rewrite_body_calls else 0, + "replace_map_generation": replace_map_generation, + "rewrite_pipeline": 5 if rewrite_body_calls else 0, + "rewrite_evaluate": 3 * (1 + repair_iterations) if rewrite_body_calls else 0, + "rewrite_repair": repair_iterations if rewrite_body_calls else 0, + "rewrite_final_judge": 1 if rewrite_body_calls else 0, + } + + +def _validation_chunk_count( + detected_candidate_count: int | None, + *, + validation_max_entities_per_call: int, +) -> int | None: + if detected_candidate_count is None: + return None + if detected_candidate_count <= 0: + return 0 + return int(math.ceil(detected_candidate_count / validation_max_entities_per_call)) diff --git a/src/anonymizer/measurement/metrics/replacements.py b/src/anonymizer/measurement/metrics/replacements.py new file mode 100644 index 00000000..01ed9a84 --- /dev/null +++ b/src/anonymizer/measurement/metrics/replacements.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping +from typing import Any, cast + +from anonymizer.measurement._coerce import _coerce_payload + + +def _replacement_map_metrics(raw: object) -> dict[str, Any]: + replacement_maps = _replacement_maps_from_raw(raw) + synthetic_values = [] + for item in replacement_maps: + synthetic = item.get("replacement", item.get("synthetic")) + if synthetic is not None: + synthetic_values.append(str(synthetic)) + return { + "replacement_count": len(replacement_maps), + "replacement_label_counts": dict( + sorted(Counter(item.get("label", "") for item in replacement_maps if item.get("label")).items()) + ), + "replacement_duplicate_value_count": max(0, len(synthetic_values) - len(set(synthetic_values))), + } + + +def _replacement_coverage_metrics(raw: object, final_entities: list[dict[str, Any]]) -> dict[str, Any]: + replacement_original_values = { + str(original) + for item in _replacement_maps_from_raw(raw) + if (original := item.get("original")) is not None and str(original) + } + missing_entities = [ + entity + for entity in final_entities + if entity.get("value") and str(entity.get("value")) not in replacement_original_values + ] + missing_values = {str(entity.get("value")) for entity in missing_entities if entity.get("value")} + return { + "replacement_missing_final_entity_count": len(missing_entities), + "replacement_missing_final_entity_label_counts": dict( + sorted( + Counter(str(entity.get("label") or "") for entity in missing_entities if entity.get("label")).items() + ) + ), + "replacement_missing_final_value_count": len(missing_values), + } + + +def _replacement_collision_metrics(raw: object, final_entities: list[dict[str, Any]]) -> dict[str, Any]: + synthetic_values = { + str(synthetic) + for item in _replacement_maps_from_raw(raw) + if (synthetic := item.get("replacement", item.get("synthetic"))) is not None and 
str(synthetic) + } + collided_entities = [ + entity for entity in final_entities if entity.get("value") and str(entity.get("value")) in synthetic_values + ] + collided_values = {str(entity.get("value")) for entity in collided_entities if entity.get("value")} + return { + "replacement_synthetic_original_collision_count": len(collided_entities), + "replacement_synthetic_original_collision_label_counts": dict( + sorted( + Counter(str(entity.get("label") or "") for entity in collided_entities if entity.get("label")).items() + ) + ), + "replacement_synthetic_original_collision_value_count": len(collided_values), + } + + +def _replacement_maps_from_raw(raw: object) -> list[Mapping[str, Any]]: + payload = _coerce_payload(raw) + if isinstance(payload, Mapping): + replacements_raw = cast(Mapping[str, Any], payload).get("replacements") + tolist = getattr(replacements_raw, "tolist", None) + if callable(tolist): + replacements_raw = tolist() + replacements = replacements_raw if isinstance(replacements_raw, list) else [] + elif isinstance(payload, list): + replacements = payload + else: + replacements = [] + return [cast(Mapping[str, Any], item) for item in replacements if isinstance(item, Mapping)] diff --git a/src/anonymizer/measurement/metrics/rewrite.py b/src/anonymizer/measurement/metrics/rewrite.py new file mode 100644 index 00000000..ea68ec49 --- /dev/null +++ b/src/anonymizer/measurement/metrics/rewrite.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections import Counter +from typing import Any + +from anonymizer.measurement._coerce import _coerce_bool, _coerce_float + + +def _rewrite_record_fields(row: Any, *, columns: set[str]) -> dict[str, Any]: + from anonymizer.engine.constants import ( + COL_ANY_HIGH_LEAKED, + COL_LEAKAGE_MASS, + COL_NEEDS_HUMAN_REVIEW, + COL_NEEDS_REPAIR, + COL_UTILITY_SCORE, + COL_WEIGHTED_LEAKAGE_RATE, + ) + + return { + "utility_score": _coerce_float(row.get(COL_UTILITY_SCORE)) if COL_UTILITY_SCORE in columns else None, + "leakage_mass": _coerce_float(row.get(COL_LEAKAGE_MASS)) if COL_LEAKAGE_MASS in columns else None, + "weighted_leakage_rate": ( + _coerce_float(row.get(COL_WEIGHTED_LEAKAGE_RATE)) if COL_WEIGHTED_LEAKAGE_RATE in columns else None + ), + "any_high_leaked": _coerce_bool(row.get(COL_ANY_HIGH_LEAKED)) if COL_ANY_HIGH_LEAKED in columns else None, + "needs_human_review": ( + _coerce_bool(row.get(COL_NEEDS_HUMAN_REVIEW)) if COL_NEEDS_HUMAN_REVIEW in columns else None + ), + "needs_repair": _coerce_bool(row.get(COL_NEEDS_REPAIR)) if COL_NEEDS_REPAIR in columns else None, + } + + +def _original_value_leak_record_fields( + row: Any, + *, + columns: set[str], + final_entities: list[dict[str, Any]], +) -> dict[str, Any]: + output_column = _output_text_column(columns) + if output_column is None: + return {"original_value_leak_count": None, "original_value_leak_label_counts": {}} + output_text = str(row.get(output_column, "")) + leaked = [ + entity + for entity in final_entities + if entity.get("value") and _output_contains_original_value(output_text, str(entity.get("value"))) + ] + return { + "original_value_leak_count": len(leaked), + "original_value_leak_label_counts": dict( + sorted(Counter(str(entity.get("label") or "") for entity in leaked if entity.get("label")).items()) + ), + } + + +def _output_contains_original_value(output_text: str, value: str) -> bool: + if 
_needs_boundary_sensitive_leak_match(value): + return _contains_with_alnum_boundaries(output_text, value) + return value in output_text + + +def _needs_boundary_sensitive_leak_match(value: str) -> bool: + return len(value) <= 4 or value.isdigit() + + +def _contains_with_alnum_boundaries(output_text: str, value: str) -> bool: + start = 0 + while True: + match_start = output_text.find(value, start) + if match_start < 0: + return False + match_end = match_start + len(value) + if _has_alnum_boundaries(output_text, match_start, match_end): + return True + start = match_start + 1 + + +def _has_alnum_boundaries(text: str, start: int, end: int) -> bool: + before_is_alnum = start > 0 and text[start - 1].isalnum() + after_is_alnum = end < len(text) and text[end].isalnum() + return not before_is_alnum and not after_is_alnum + + +def _output_text_column(columns: set[str]) -> str | None: + from anonymizer.engine.constants import COL_REPLACED_TEXT, COL_REWRITTEN_TEXT + + if COL_REPLACED_TEXT in columns: + return COL_REPLACED_TEXT + if COL_REWRITTEN_TEXT in columns: + return COL_REWRITTEN_TEXT + return None diff --git a/src/anonymizer/measurement/recorders.py b/src/anonymizer/measurement/recorders.py new file mode 100644 index 00000000..1a1bbf3f --- /dev/null +++ b/src/anonymizer/measurement/recorders.py @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import time +from collections.abc import Iterator, Mapping +from contextlib import contextmanager +from typing import Any + +from anonymizer.measurement._coerce import _coerce_int +from anonymizer.measurement.records.model import _model_workflow_fields, _row_throughput_fields, _summarize_model_usage +from anonymizer.measurement.records.run import ( + _detect_config_metadata, + _model_config_metadata, + _replace_config_metadata, + _rewrite_config_metadata, + _runtime_metadata, + _source_metadata, +) +from anonymizer.measurement.session import current_collector + + +@contextmanager +def stage_timer(stage: str, **fields: Any) -> Iterator[dict[str, Any]]: + """Record wall time for a stage when collection is active.""" + collector = current_collector() + if collector is None: + yield fields + return + + started = time.perf_counter() + status = "completed" + try: + yield fields + except BaseException: + status = "error" + raise + finally: + elapsed_sec = time.perf_counter() - started + collector.record( + "stage", + stage=stage, + status=status, + elapsed_sec=elapsed_sec, + **fields, + **_row_throughput_fields( + elapsed_sec=elapsed_sec, + input_row_count=_coerce_int(fields.get("input_row_count"), default=-1), + output_row_count=_coerce_int(fields.get("output_row_count"), default=-1), + ), + ) + + +def record_stage(stage: str, *, elapsed_sec: float, status: str = "completed", **fields: Any) -> None: + """Record a pre-timed stage measurement if collection is active.""" + collector = current_collector() + if collector is None: + return + collector.record( + "stage", + stage=stage, + status=status, + elapsed_sec=elapsed_sec, + **fields, + **_row_throughput_fields( + elapsed_sec=elapsed_sec, + input_row_count=_coerce_int(fields.get("input_row_count"), default=-1), + output_row_count=_coerce_int(fields.get("output_row_count"), default=-1), + ), + ) + + +def record_ndd_workflow( + *, + 
workflow_name: str, + model_aliases: list[str], + input_row_count: int, + output_row_count: int | None, + failed_record_count: int | None, + elapsed_sec: float, + status: str = "completed", + seed_row_count: int | None = None, + preview_num_records: int | None = None, + column_count: int | None = None, + column_names: list[str] | None = None, + model_usage: Mapping[str, Any] | None = None, +) -> None: + """Record one DataDesigner workflow execution through the adapter boundary.""" + _record_model_workflow( + workflow_name=workflow_name, + model_aliases=model_aliases, + input_row_count=input_row_count, + output_row_count=output_row_count, + failed_record_count=failed_record_count, + elapsed_sec=elapsed_sec, + status=status, + seed_row_count=seed_row_count, + preview_num_records=preview_num_records, + column_count=column_count, + column_names=column_names, + model_usage=model_usage, + record_type="ndd_workflow", + extra_fields=None, + ) + + +def record_model_workflow( + *, + workflow_name: str, + model_aliases: list[str], + input_row_count: int, + output_row_count: int | None, + failed_record_count: int | None, + elapsed_sec: float, + status: str = "completed", + seed_row_count: int | None = None, + preview_num_records: int | None = None, + column_count: int | None = None, + column_names: list[str] | None = None, + model_usage: Mapping[str, Any] | None = None, + extra_fields: Mapping[str, Any] | None = None, +) -> None: + """Record one sanitized model-backed workflow execution. + + Use this for non-DataDesigner model calls that still need benchmark + accounting. Raw prompts, text, responses, and replacement values do not + belong in ``model_usage``. 
+ """ + _record_model_workflow( + workflow_name=workflow_name, + model_aliases=model_aliases, + input_row_count=input_row_count, + output_row_count=output_row_count, + failed_record_count=failed_record_count, + elapsed_sec=elapsed_sec, + status=status, + seed_row_count=seed_row_count, + preview_num_records=preview_num_records, + column_count=column_count, + column_names=column_names, + model_usage=model_usage, + record_type="model_workflow", + extra_fields=extra_fields, + ) + + +def _record_model_workflow( + *, + workflow_name: str, + model_aliases: list[str], + input_row_count: int, + output_row_count: int | None, + failed_record_count: int | None, + elapsed_sec: float, + status: str, + seed_row_count: int | None, + preview_num_records: int | None, + column_count: int | None, + column_names: list[str] | None, + model_usage: Mapping[str, Any] | None, + record_type: str, + extra_fields: Mapping[str, Any] | None, +) -> None: + collector = current_collector() + if collector is None: + return + observed_usage = _summarize_model_usage(model_usage) + workflow_fields = { + "workflow_name": workflow_name, + "status": status, + "model_aliases": sorted(set(model_aliases)), + "input_row_count": input_row_count, + "seed_row_count": seed_row_count, + "output_row_count": output_row_count, + "failed_record_count": failed_record_count, + "elapsed_sec": elapsed_sec, + "preview_num_records": preview_num_records, + "column_count": column_count, + "column_names": column_names or [], + "model_usage": dict(model_usage or {}), + **dict(extra_fields or {}), + } + collector.record(record_type, **_model_workflow_fields(workflow_fields, observed_usage)) + + +def record_run_metadata( + *, + config: Any, + data: Any, + mode: str, + strategy: str, + input_row_count: int, + preview_num_records: int | None, + model_configs: list[Any], +) -> None: + """Record sanitized run/config metadata once per anonymizer run.""" + collector = current_collector() + if collector is None: + return + + detect = 
getattr(config, "detect", None) + source = str(getattr(data, "source", "")) + collector.record( + "run", + mode=mode, + strategy=strategy, + input_row_count=input_row_count, + preview_num_records=preview_num_records, + source_hash=collector.record_hash(row_index="source", text=source), + input_source=_source_metadata(source), + input_text_column=str(getattr(data, "text_column", "")), + input_has_id_column=bool(getattr(data, "id_column", None)), + input_has_data_summary=bool(getattr(data, "data_summary", None)), + detect=_detect_config_metadata(detect), + replace=_replace_config_metadata(getattr(config, "replace", None)), + rewrite=_rewrite_config_metadata(getattr(config, "rewrite", None)), + models=[_model_config_metadata(model_config) for model_config in model_configs], + runtime=_runtime_metadata(), + ) diff --git a/src/anonymizer/measurement/records/__init__.py b/src/anonymizer/measurement/records/__init__.py new file mode 100644 index 00000000..3d2894b7 --- /dev/null +++ b/src/anonymizer/measurement/records/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations diff --git a/src/anonymizer/measurement/records/model.py b/src/anonymizer/measurement/records/model.py new file mode 100644 index 00000000..826270e5 --- /dev/null +++ b/src/anonymizer/measurement/records/model.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, cast + +from anonymizer.measurement._coerce import _coerce_int, _safe_rate, _safe_ratio + + +def _model_workflow_fields(fields: dict[str, Any], observed_usage: dict[str, int | None]) -> dict[str, Any]: + return { + **fields, + **observed_usage, + "observed_failed_request_rate": _safe_ratio( + observed_usage["observed_failed_requests"], + observed_usage["observed_total_requests"], + ), + **_throughput_fields( + elapsed_sec=cast(float, fields["elapsed_sec"]), + input_row_count=cast(int, fields["input_row_count"]), + output_row_count=cast(int | None, fields["output_row_count"]), + total_tokens=observed_usage["observed_total_tokens"], + total_requests=observed_usage["observed_total_requests"], + successful_requests=observed_usage["observed_successful_requests"], + ), + } + + +def _throughput_fields( + *, + elapsed_sec: float, + input_row_count: int | None, + output_row_count: int | None, + total_tokens: int | None, + total_requests: int | None, + successful_requests: int | None, +) -> dict[str, float | None]: + return { + "input_rows_per_sec": _safe_rate(input_row_count, elapsed_sec), + "output_rows_per_sec": _safe_rate(output_row_count, elapsed_sec), + "observed_tokens_per_sec": _safe_rate(total_tokens, elapsed_sec), + "observed_requests_per_sec": _safe_rate(total_requests, elapsed_sec), + "observed_tokens_per_successful_request": _safe_ratio(total_tokens, successful_requests), + } + + +def _row_throughput_fields( + *, + elapsed_sec: float, + input_row_count: int | None, + output_row_count: int | None, +) -> dict[str, float | None]: + if input_row_count is not None and input_row_count < 0: + input_row_count = None + if output_row_count is not None and output_row_count < 0: + output_row_count = None + return { + "input_rows_per_sec": _safe_rate(input_row_count, elapsed_sec), + "output_rows_per_sec": _safe_rate(output_row_count, 
elapsed_sec), + } + + +def _summarize_model_usage(model_usage: Mapping[str, Any] | None) -> dict[str, int | None]: + totals = _empty_model_usage_totals() + for usage in (model_usage or {}).values(): + if not isinstance(usage, Mapping): + continue + _add_model_usage_totals(totals, usage) + + if totals["total_tokens"] == 0: + totals["total_tokens"] = totals["input_tokens"] + totals["output_tokens"] + if totals["total_requests"] == 0: + totals["total_requests"] = totals["successful_requests"] + totals["failed_requests"] + + return { + "observed_input_tokens": totals["input_tokens"], + "observed_output_tokens": totals["output_tokens"], + "observed_total_tokens": totals["total_tokens"], + "observed_reasoning_tokens": totals["reasoning_tokens"] if totals["has_reasoning_tokens"] else None, + "observed_successful_requests": totals["successful_requests"], + "observed_failed_requests": totals["failed_requests"], + "observed_total_requests": totals["total_requests"], + } + + +def _empty_model_usage_totals() -> dict[str, int | bool]: + return { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + "reasoning_tokens": 0, + "has_reasoning_tokens": False, + "successful_requests": 0, + "failed_requests": 0, + "total_requests": 0, + } + + +def _add_model_usage_totals(totals: dict[str, int | bool], usage: Mapping[str, Any]) -> None: + token_usage = usage.get("token_usage") + if isinstance(token_usage, Mapping): + totals["input_tokens"] += _coerce_int(token_usage.get("input_tokens"), default=0) + totals["output_tokens"] += _coerce_int(token_usage.get("output_tokens"), default=0) + totals["total_tokens"] += _coerce_int(token_usage.get("total_tokens"), default=0) + if token_usage.get("reasoning_tokens") is not None: + totals["has_reasoning_tokens"] = True + totals["reasoning_tokens"] += _coerce_int(token_usage.get("reasoning_tokens"), default=0) + + request_usage = usage.get("request_usage") + if isinstance(request_usage, Mapping): + totals["successful_requests"] += 
_coerce_int(request_usage.get("successful_requests"), default=0) + totals["failed_requests"] += _coerce_int(request_usage.get("failed_requests"), default=0) + totals["total_requests"] += _coerce_int(request_usage.get("total_requests"), default=0) diff --git a/src/anonymizer/measurement/records/row.py b/src/anonymizer/measurement/records/row.py new file mode 100644 index 00000000..087e938f --- /dev/null +++ b/src/anonymizer/measurement/records/row.py @@ -0,0 +1,318 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from collections import Counter +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Literal, TypedDict + +from anonymizer.engine.constants import ( + COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES, + COL_ATTRIBUTE_FIDELITY_VALID, + COL_DETECTION_INVALID_ENTITIES, + COL_DETECTION_VALID, + COL_FINAL_ENTITIES, + COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS, + COL_RELATIONAL_CONSISTENCY_VALID, + COL_TYPE_FIDELITY_INVALID_REPLACEMENTS, + COL_TYPE_FIDELITY_VALID, +) +from anonymizer.measurement._coerce import ( + _coerce_bool, + _coerce_int, + _count_items, + _count_text_tokens, + _safe_row_index, + _size_bucket, +) +from anonymizer.measurement.metrics.entities import ( + _GROUND_TRUTH_ENTITY_COLUMNS, + _entities_from_raw, + _entity_ground_truth_metrics, +) +from anonymizer.measurement.metrics.llm_calls import _validation_chunk_count, estimate_llm_calls_by_stage +from anonymizer.measurement.metrics.replacements import ( + _replacement_collision_metrics, + _replacement_coverage_metrics, + _replacement_map_metrics, +) +from anonymizer.measurement.metrics.rewrite import _original_value_leak_record_fields, _rewrite_record_fields +from anonymizer.measurement.session import current_collector + +if TYPE_CHECKING: + import pandas as pd + + from anonymizer.measurement.collector import MeasurementCollector + + 
# Output field names that carry a judge verdict (True / False / None).
_EvaluationBoolField = Literal[
    "detection_valid",
    "type_fidelity_valid",
    "relational_consistency_valid",
    "attribute_fidelity_valid",
]
# Output field names that carry a count of invalid items.
_EvaluationCountField = Literal[
    "detection_invalid_entity_count",
    "type_fidelity_invalid_replacement_count",
    "relational_consistency_invalid_relation_count",
    "attribute_fidelity_invalid_entity_count",
]


class _EvaluationRecordFields(TypedDict, total=False):
    """Fields of an ``"evaluation_record"``; a key is present only when its source column exists."""

    detection_valid: bool | None
    type_fidelity_valid: bool | None
    relational_consistency_valid: bool | None
    attribute_fidelity_valid: bool | None
    detection_invalid_entity_count: int
    type_fidelity_invalid_replacement_count: int
    relational_consistency_invalid_relation_count: int
    attribute_fidelity_invalid_entity_count: int


@dataclass(frozen=True)
class _EvaluationBoolMetric:
    """Maps a verdict column of the trace dataframe to its output field."""

    source_column: str
    output_field: _EvaluationBoolField


@dataclass(frozen=True)
class _EvaluationCountMetric:
    """Maps an invalid-items column to its output count field.

    ``primary_key`` is passed to ``_count_items`` as the key under which the
    items are looked up.
    """

    source_column: str
    output_field: _EvaluationCountField
    primary_key: str


_EVALUATION_BOOL_METRICS = (
    _EvaluationBoolMetric(COL_DETECTION_VALID, "detection_valid"),
    _EvaluationBoolMetric(COL_TYPE_FIDELITY_VALID, "type_fidelity_valid"),
    _EvaluationBoolMetric(COL_RELATIONAL_CONSISTENCY_VALID, "relational_consistency_valid"),
    _EvaluationBoolMetric(COL_ATTRIBUTE_FIDELITY_VALID, "attribute_fidelity_valid"),
)

_EVALUATION_COUNT_METRICS = (
    _EvaluationCountMetric(COL_DETECTION_INVALID_ENTITIES, "detection_invalid_entity_count", "invalid_entities"),
    _EvaluationCountMetric(
        COL_TYPE_FIDELITY_INVALID_REPLACEMENTS,
        "type_fidelity_invalid_replacement_count",
        "invalid_replacements",
    ),
    _EvaluationCountMetric(
        COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS,
        "relational_consistency_invalid_relation_count",
        "invalid_relations",
    ),
    _EvaluationCountMetric(
        COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES, "attribute_fidelity_invalid_entity_count", "entities"
    ),
)


def record_record_metrics(
    dataframe: pd.DataFrame,
    *,
    mode: str,
    strategy: str,
    text_column: str,
    validation_max_entities_per_call: int,
) -> None:
    """Record per-row count, length, and nominal-call metrics from a trace DataFrame.

    Does nothing unless a collector is active and has ``record_level`` set.
    One ``"record"`` entry is written per dataframe row.
    """
    collector = current_collector()
    if collector is None or not collector.record_level:
        return

    columns = set(dataframe.columns)
    # The first known ground-truth column present in the dataframe, if any.
    ground_truth_column = next((name for name in _GROUND_TRUTH_ENTITY_COLUMNS if name in dataframe.columns), None)

    for row_index, row in dataframe.iterrows():
        final_entities = _entities_from_raw(row.get(COL_FINAL_ENTITIES))
        base_fields = _base_record_fields(
            collector=collector,
            row_index=row_index,
            row=row,
            text_column=text_column,
            mode=mode,
            strategy=strategy,
        )
        entity_fields = _entity_record_fields(
            row, final_entities=final_entities, ground_truth_column=ground_truth_column
        )
        replacement_fields = _replacement_record_fields(row, columns=columns, final_entities=final_entities)
        rewrite_fields = _rewrite_record_fields(row, columns=columns)
        leak_fields = _original_value_leak_record_fields(row, columns=columns, final_entities=final_entities)
        llm_fields = _llm_record_fields(
            row,
            columns=columns,
            mode=mode,
            strategy=strategy,
            final_entity_count=len(final_entities),
            validation_max_entities_per_call=validation_max_entities_per_call,
        )
        # Keyword expansion (rather than dict merging) keeps the original
        # behavior of failing loudly if two field groups share a key.
        collector.record(
            "record",
            **base_fields,
            **entity_fields,
            **replacement_fields,
            **rewrite_fields,
            **leak_fields,
            **llm_fields,
        )


def record_evaluation_metrics(
    dataframe: pd.DataFrame,
    *,
    mode: str,
    strategy: str,
    text_column: str,
) -> None:
    """Record sanitized per-row LLM-as-judge verdict metrics from an evaluated trace dataframe.

    Does nothing unless a collector is active with ``record_level`` set and
    the dataframe has at least one evaluation column.
    """
    collector = current_collector()
    if collector is None or not collector.record_level:
        return

    columns = set(dataframe.columns)
    if not _has_evaluation_metrics(columns):
        return

    for row_index, row in dataframe.iterrows():
        base_fields = _base_record_fields(
            collector=collector,
            row_index=row_index,
            row=row,
            text_column=text_column,
            mode=mode,
            strategy=strategy,
        )
        collector.record(
            "evaluation_record",
            **base_fields,
            **_evaluation_record_fields(row, columns=columns),
        )


def _has_evaluation_metrics(columns: set[str]) -> bool:
    """Return whether any evaluation verdict or count column is present."""
    if any(metric.source_column in columns for metric in _EVALUATION_BOOL_METRICS):
        return True
    return any(metric.source_column in columns for metric in _EVALUATION_COUNT_METRICS)


def _evaluation_record_fields(row: pd.Series, *, columns: set[str]) -> _EvaluationRecordFields:
    """Build the verdict and invalid-count fields for the columns that exist."""
    fields: _EvaluationRecordFields = {}
    for bool_metric in _EVALUATION_BOOL_METRICS:
        if bool_metric.source_column not in columns:
            continue
        fields[bool_metric.output_field] = _coerce_bool(row.get(bool_metric.source_column))
    for count_metric in _EVALUATION_COUNT_METRICS:
        if count_metric.source_column not in columns:
            continue
        fields[count_metric.output_field] = _count_items(
            row.get(count_metric.source_column), primary_key=count_metric.primary_key
        )
    return fields


def _base_record_fields(
    *,
    collector: MeasurementCollector,
    row_index: object,
    row: Any,
    text_column: str,
    mode: str,
    strategy: str,
) -> dict[str, Any]:
    """Build the fields shared by every per-row record: identity, hash, and text sizes.

    The text itself is not included, only its hash and its length in
    characters and tokens.
    """
    text = str(row.get(text_column, ""))
    char_count = len(text)
    token_count = _count_text_tokens(text)
    return {
        "mode": mode,
        "strategy": strategy,
        "row_index": _safe_row_index(row_index),
        "record_hash": collector.record_hash(row_index=row_index, text=text),
        "text_length_chars": char_count,
        "text_length_chars_bucket": _size_bucket(char_count),
        "text_length_tokens": token_count,
        "text_length_tokens_bucket": _size_bucket(token_count),
    }


def _entity_record_fields(
    row: Any,
    *,
    final_entities: list[dict[str, Any]],
    ground_truth_column: str | None,
) -> dict[str, Any]:
    """Build entity count, per-label counts, and ground-truth comparison metrics.

    Ground-truth entities are ``None`` when the dataframe has no ground-truth
    column.
    """
    if ground_truth_column is None:
        ground_truth_entities = None
    else:
        ground_truth_entities = _entities_from_raw(row.get(ground_truth_column))

    label_counts = Counter(entity.get("label", "") for entity in final_entities if entity.get("label"))
    return {
        "final_entity_count": len(final_entities),
        "final_entity_label_counts": dict(sorted(label_counts.items())),
        **_entity_ground_truth_metrics(final_entities, ground_truth_entities),
    }


def _replacement_record_fields(
    row: Any,
    *,
    columns: set[str],
    final_entities: list[dict[str, Any]],
) -> dict[str, Any]:
    """Build replacement-map metrics; empty when the row has no replacement-map column."""
    from anonymizer.engine.constants import COL_REPLACEMENT_MAP

    if COL_REPLACEMENT_MAP not in columns:
        return {}

    raw_map = row.get(COL_REPLACEMENT_MAP)
    fields: dict[str, Any] = {}
    # Merged in this order, so later groups win on a key clash.
    fields.update(_replacement_map_metrics(raw_map))
    fields.update(_replacement_coverage_metrics(raw_map, final_entities))
    fields.update(_replacement_collision_metrics(raw_map, final_entities))
    return fields


def _llm_record_fields(
    row: Any,
    *,
    columns: set[str],
    mode: str,
    strategy: str,
    final_entity_count: int,
    validation_max_entities_per_call: int,
) -> dict[str, Any]:
    """Build the estimated LLM call counts for one row.

    ``llm_calls_estimated_total`` is ``None`` when any stage estimate is
    ``None``. ``repair_iterations`` is reported as 0 outside rewrite mode.
    """
    from anonymizer.engine.constants import COL_REPAIR_ITERATIONS

    detected_candidate_count = _detected_candidate_count(row, columns=columns)
    validation_chunk_count = _validation_chunk_count(
        detected_candidate_count,
        validation_max_entities_per_call=validation_max_entities_per_call,
    )
    grouped_entity_count = _grouped_entity_count(row, columns=columns, final_entity_count=final_entity_count)
    repair_iterations = _coerce_int(row.get(COL_REPAIR_ITERATIONS, 0), default=0)
    replace_map_generation_uses_llm = _replace_map_generation_uses_llm(row, columns=columns)

    calls_by_stage = estimate_llm_calls_by_stage(
        mode=mode,
        strategy=strategy,
        has_grouped_entities=grouped_entity_count > 0,
        validation_chunk_count=validation_chunk_count,
        repair_iterations=repair_iterations,
        replace_map_generation_uses_llm=replace_map_generation_uses_llm,
    )
    stage_estimates = list(calls_by_stage.values())
    if any(estimate is None for estimate in stage_estimates):
        total_estimated = None
    else:
        total_estimated = sum(stage_estimates)

    return {
        "detected_candidate_count": detected_candidate_count,
        "validation_chunk_count": validation_chunk_count,
        "repair_iterations": repair_iterations if mode == "rewrite" else 0,
        "llm_calls_estimated_by_stage": calls_by_stage,
        "llm_calls_estimated_total": total_estimated,
    }


def _replace_map_generation_uses_llm(row: Any, *, columns: set[str]) -> bool:
    """Return whether replace-map generation counts as an LLM call; currently always True.

    Both arguments are accepted and deliberately unused.
    """
    del row, columns
    return True


def _detected_candidate_count(row: Any, *, columns: set[str]) -> int | None:
    """Count seed validation candidates, or return ``None`` when the column is absent."""
    from anonymizer.engine.constants import COL_SEED_VALIDATION_CANDIDATES

    if COL_SEED_VALIDATION_CANDIDATES in columns:
        return _count_items(
            row.get(COL_SEED_VALIDATION_CANDIDATES), primary_key="candidates", fallback_keys=("entities",)
        )
    return None


def _grouped_entity_count(row: Any, *, columns: set[str], final_entity_count: int) -> int:
    """Count entities grouped by value, falling back to ``final_entity_count`` when the column is absent."""
    from anonymizer.engine.constants import COL_ENTITIES_BY_VALUE

    if COL_ENTITIES_BY_VALUE in columns:
        return _count_items(
            row.get(COL_ENTITIES_BY_VALUE), primary_key="entities_by_value", fallback_keys=("entities",)
        )
    return final_entity_count


# ---- patch boundary: header of the next file in this diff, kept verbatim ----
# diff --git a/src/anonymizer/measurement/records/run.py b/src/anonymizer/measurement/records/run.py
# new file mode 100644
# index 00000000..4a9e95b9
# --- /dev/null
# +++ b/src/anonymizer/measurement/records/run.py
# @@ -0,0 +1,108 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# from __future__ import annotations
# ---- contents of records/run.py ----

import platform
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

from anonymizer.measurement.constants import MEASUREMENT_SCHEMA_VERSION


def _detect_config_metadata(detect: Any | None) -> dict[str, Any]:
    """Describe the detect configuration.

    When ``detect`` has no ``entity_labels`` (or they are ``None``), the label
    source is reported as ``"default"`` with the size of the default label
    set, and ``entity_labels`` itself is ``None``.
    """
    entity_labels = getattr(detect, "entity_labels", None)
    uses_custom_labels = entity_labels is not None
    if uses_custom_labels:
        entity_label_count = len(entity_labels)
    else:
        from anonymizer.engine.constants import DEFAULT_ENTITY_LABELS

        entity_label_count = len(DEFAULT_ENTITY_LABELS)
    return {
        "gliner_threshold": getattr(detect, "gliner_threshold", None),
        "entity_label_source": "custom" if uses_custom_labels else "default",
        "entity_label_count": entity_label_count,
        "entity_labels": list(entity_labels) if uses_custom_labels else None,
        "validation_max_entities_per_call": getattr(detect, "validation_max_entities_per_call", None),
        "validation_excerpt_window_chars": getattr(detect, "validation_excerpt_window_chars", None),
    }


def _source_metadata(source: str) -> dict[str, Any]:
    """Reduce a data source string to its kind, URL scheme, and lowercase file suffix.

    ``http``/``https`` sources are ``"remote_file"`` and ``file`` URLs are
    ``"local_file"``. Any other non-empty string is a ``"local_file"`` with no
    scheme, and an empty string is ``"unknown"``.
    """
    parsed = urlparse(source)
    if parsed.scheme in {"http", "https"}:
        kind, scheme, path_text = "remote_file", parsed.scheme, parsed.path
    elif parsed.scheme == "file":
        kind, scheme, path_text = "local_file", "file", parsed.path
    else:
        # No recognized URL scheme: the suffix comes from the raw string.
        kind, scheme, path_text = ("local_file" if source else "unknown"), None, source
    return {
        "kind": kind,
        "scheme": scheme,
        "suffix": Path(path_text).suffix.lower() or None,
    }


def _replace_config_metadata(replace_config: Any | None) -> dict[str, Any] | None:
    """Describe the replace configuration, or return ``None`` when there is none.

    The strategy is the config object's class name. Optional attributes are
    copied only when the object has them.
    """
    if replace_config is None:
        return None

    metadata: dict[str, Any] = {
        "strategy": type(replace_config).__name__,
        "has_instructions": bool(getattr(replace_config, "instructions", None)),
    }
    optional_attrs = ("normalize_label", "algorithm", "digest_length")
    metadata.update({attr: getattr(replace_config, attr) for attr in optional_attrs if hasattr(replace_config, attr)})
    # NOTE(review): this flag reflects that the attribute exists, not that it
    # holds a non-empty template - confirm that is the intended meaning.
    if hasattr(replace_config, "format_template"):
        metadata["has_format_template"] = True
    return metadata


def _rewrite_config_metadata(rewrite_config: Any | None) -> dict[str, Any] | None:
    """Describe the rewrite configuration, or return ``None`` when there is none.

    Free-text settings are reduced to presence flags.
    """
    if rewrite_config is None:
        return None

    risk_tolerance = getattr(rewrite_config, "risk_tolerance", None)
    return {
        "risk_tolerance": _enum_value(risk_tolerance),
        "max_repair_iterations": getattr(rewrite_config, "max_repair_iterations", None),
        "strict_entity_protection": getattr(rewrite_config, "strict_entity_protection", None),
        "has_privacy_goal": bool(getattr(rewrite_config, "privacy_goal", None)),
        "has_instructions": bool(getattr(rewrite_config, "instructions", None)),
    }


def _model_config_metadata(model_config: Any) -> dict[str, Any]:
    """Describe one model configuration.

    ``base_url`` is reported only as a boolean (whether one is set), not as
    the URL itself.
    """
    inference_parameters = getattr(model_config, "inference_parameters", None)
    provider = getattr(model_config, "provider", None)
    return {
        "alias": getattr(model_config, "alias", None),
        "model": getattr(model_config, "model", None),
        "provider": _enum_value(provider),
        "base_url": bool(getattr(model_config, "base_url", None)),
        "max_parallel_requests": getattr(inference_parameters, "max_parallel_requests", None),
    }


def _runtime_metadata() -> dict[str, Any]:
    """Describe the runtime: package version, schema version, platform, and Python version.

    ``anonymizer_version`` is ``None`` when the ``nemo-anonymizer`` package
    metadata cannot be found.
    """
    anonymizer_version: str | None
    try:
        anonymizer_version = version("nemo-anonymizer")
    except PackageNotFoundError:
        anonymizer_version = None
    return {
        "anonymizer_version": anonymizer_version,
        "measurement_schema_version": MEASUREMENT_SCHEMA_VERSION,
        "platform_machine": platform.machine(),
        "platform_system": platform.system(),
        "python_version": platform.python_version(),
    }


def _enum_value(value: Any) -> Any:
    """Return ``value.value`` when the object has that attribute, else the object itself."""
    return getattr(value, "value", value)


# ---- patch boundary: header of the next file in this diff, kept verbatim ----
# diff --git a/src/anonymizer/measurement/session.py b/src/anonymizer/measurement/session.py
# new file mode 100644
# index 00000000..1d985d4d
# --- /dev/null
# +++ b/src/anonymizer/measurement/session.py
# @@ -0,0 +1,115 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +from collections.abc import Iterator +from contextlib import contextmanager +from contextvars import ContextVar + +from anonymizer.measurement.collector import MeasurementCollector +from anonymizer.measurement.config import MeasurementConfig +from anonymizer.measurement.sinks import _JsonlMeasurementSink + +logger = logging.getLogger("anonymizer.measurement") + +_ACTIVE_COLLECTOR: ContextVar[MeasurementCollector | None] = ContextVar( + "anonymizer_measurement_collector", + default=None, +) + + +@contextmanager +def measurement_session(collector: MeasurementCollector | None = None) -> Iterator[MeasurementCollector]: + """Activate a collector for code running in this context.""" + active = collector or MeasurementCollector() + token = _ACTIVE_COLLECTOR.set(active) + try: + yield active + finally: + _ACTIVE_COLLECTOR.reset(token) + + +@contextmanager +def configured_measurement_session(config: MeasurementConfig | None) -> Iterator[MeasurementCollector | None]: + """Activate and persist a collector when a measurement config is provided.""" + if config is None: + yield None + return + + sink = _JsonlMeasurementSink(config.output_path) if config.streaming else None + dd_trace_sink = None + if config.dd_trace != "none": + if config.dd_trace_path is None: + raise ValueError("dd_trace_path is required when dd_trace is enabled") + dd_trace_sink = _JsonlMeasurementSink(config.dd_trace_path) + dd_task_trace_sink = _JsonlMeasurementSink(config.dd_task_trace_path) if config.dd_task_trace_path else None + collector = MeasurementCollector( + run_id=config.run_id, + record_hash_key=config.record_hash_key, + record_level=config.record_level, + run_tags=config.run_tags, + record_sink=sink, + keep_records=config.keep_records, + dd_trace_mode=config.dd_trace, + dd_trace_sink=dd_trace_sink, + dd_task_trace_sink=dd_task_trace_sink, + fail_on_write_error=config.fail_on_write_error, + ) + with 
measurement_session(collector): + body_error: BaseException | None = None + try: + yield collector + except BaseException as exc: + body_error = exc + raise + finally: + if config.streaming: + _close_collector_safely(config=config, collector=collector, body_error=body_error) + else: + write_error: BaseException | None = None + try: + _write_collector_safely(config=config, collector=collector, body_error=body_error) + except BaseException as exc: + write_error = exc + raise + finally: + _close_collector_safely( + config=config, + collector=collector, + body_error=body_error or write_error, + ) + + +def current_collector() -> MeasurementCollector | None: + """Return the active collector, if measurement is enabled.""" + return _ACTIVE_COLLECTOR.get() + + +def _write_collector_safely( + *, + config: MeasurementConfig, + collector: MeasurementCollector, + body_error: BaseException | None, +) -> None: + try: + config.write_collector(collector) + except Exception as exc: + logger.warning("Failed to write Anonymizer measurement records (%s)", type(exc).__name__) + if body_error is None and config.fail_on_write_error: + raise + + +def _close_collector_safely( + *, + config: MeasurementConfig, + collector: MeasurementCollector, + body_error: BaseException | None, +) -> None: + try: + collector.close() + except Exception as exc: + logger.warning("Failed to close Anonymizer measurement stream (%s)", type(exc).__name__) + if body_error is None and config.fail_on_write_error: + raise diff --git a/src/anonymizer/measurement/sinks.py b/src/anonymizer/measurement/sinks.py new file mode 100644 index 00000000..4ec0f0aa --- /dev/null +++ b/src/anonymizer/measurement/sinks.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from threading import Lock +from typing import Any, Literal, Protocol + + +class _MeasurementWriter(Protocol): + def write(self, records: list[dict[str, Any]], path: str | Path) -> None: ... + + +class _MeasurementSink(Protocol): + def write_record(self, record: dict[str, Any]) -> None: ... + + def close(self) -> None: ... + + +class _JsonlMeasurementWriter: + def write(self, records: list[dict[str, Any]], path: str | Path) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w", encoding="utf-8") as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=True, sort_keys=True) + "\n") + + +class _JsonMeasurementWriter: + def write(self, records: list[dict[str, Any]], path: str | Path) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w", encoding="utf-8") as f: + json.dump(records, f, ensure_ascii=True, indent=2, sort_keys=True) + + +def _writer_for_format(output_format: Literal["jsonl", "json"]) -> _MeasurementWriter: + if output_format == "json": + return _JsonMeasurementWriter() + return _JsonlMeasurementWriter() + + +class _JsonlMeasurementSink: + def __init__(self, path: str | Path) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + self._file = output_path.open("w", encoding="utf-8", buffering=1) + self._lock = Lock() + + def write_record(self, record: dict[str, Any]) -> None: + with self._lock: + self._file.write(json.dumps(record, ensure_ascii=True, sort_keys=True) + "\n") + + def close(self) -> None: + with self._lock: + self._file.close() diff --git a/tests/engine/test_chunked_validation.py b/tests/engine/test_chunked_validation.py index f9b402a3..6eb3ed9e 100644 --- a/tests/engine/test_chunked_validation.py +++ b/tests/engine/test_chunked_validation.py 
@@ -447,6 +447,41 @@ def test_single_chunk_sends_single_chunk_tagged_text_not_windowed_excerpt(self) "around Alice/Bob would clip the suffix entirely." ) + def test_single_chunk_can_use_compact_excerpt_when_configured(self) -> None: + prefix = "START_ONLY_MARKER " + ("prefix filler " * 80) + suffix = (" suffix filler" * 80) + " END_ONLY_MARKER" + middle = "Alice met Bob." + text = prefix + middle + suffix + alice_start = len(prefix) + bob_start = alice_start + 10 + + spans = [ + _entity_span("a", "Alice", "first_name", alice_start, alice_start + 5), + _entity_span("b", "Bob", "first_name", bob_start, bob_start + 3), + ] + candidates = _candidates_schema( + ("a", "Alice", "first_name"), + ("b", "Bob", "first_name"), + ) + row = _build_row(text=text, seed_entities=spans, candidates=candidates) + + facade = FakeFacade("v0", response={"decisions": [{"id": "a", "decision": "keep"}]}) + params = ChunkedValidationParams( + pool=["v0"], + max_entities_per_call=10, + excerpt_window_chars=20, + single_chunk_full_text=False, + prompt_template=_MINIMAL_TEMPLATE, + ) + + chunked_validate_row(row, params, {"v0": facade}) + + prompt = facade.calls[0]["prompt"] + assert "Alice" in prompt + assert "Bob" in prompt + assert "START_ONLY_MARKER" not in prompt + assert "END_ONLY_MARKER" not in prompt + def test_empty_candidates_short_circuits_without_calls(self) -> None: row = _build_row(text="hello", seed_entities=[], candidates=_candidates_schema()) facade = FakeFacade("v0", response={"decisions": []}) diff --git a/tests/engine/test_llm_replace_workflow.py b/tests/engine/test_llm_replace_workflow.py index f7abbc44..f6ae372b 100644 --- a/tests/engine/test_llm_replace_workflow.py +++ b/tests/engine/test_llm_replace_workflow.py @@ -330,6 +330,41 @@ def test_filter_replacement_map_anomaly_summaries_do_not_leak_pii( _assert_no_pii_in_logs(caplog, extra_secrets=("Acme Corp", "NovaCorp")) +def test_filter_replacement_map_repairs_synthetic_original_collisions_without_pii( + caplog: 
pytest.LogCaptureFixture, +) -> None: + """Synthetic values must not reuse another protected original from the same row.""" + parsed_entities = EntitiesByValueSchema.model_validate( + { + "entities_by_value": [ + {"value": "1979-01-01", "labels": ["date"]}, + {"value": "1980-02-02", "labels": ["date"]}, + ] + } + ) + raw_map = { + "replacements": [ + {"original": "1979-01-01", "label": "date", "synthetic": "1980-02-02"}, + {"original": "1980-02-02", "label": "date", "synthetic": "1991-03-04"}, + ] + } + + with caplog.at_level(logging.WARNING, logger="anonymizer"): + result = _filter_replacement_map_to_input_entities( + raw_map=raw_map, parsed_entities=parsed_entities, record_id="row-collision" + ) + + assert result == { + "replacements": [ + {"original": "1979-01-01", "label": "date", "synthetic": "[SUBSTITUTE_DATE_1]"}, + {"original": "1980-02-02", "label": "date", "synthetic": "1991-03-04"}, + ] + } + assert "synthetic-original collision" in caplog.text + assert "date" in caplog.text + _assert_no_pii_in_logs(caplog, extra_secrets=("1979-01-01", "1980-02-02", "1991-03-04")) + + def test_filter_replacement_map_empty_warning_does_not_leak_pii( caplog: pytest.LogCaptureFixture, ) -> None: diff --git a/tests/engine/test_ndd_adapter.py b/tests/engine/test_ndd_adapter.py index ea0a01a8..7563f371 100644 --- a/tests/engine/test_ndd_adapter.py +++ b/tests/engine/test_ndd_adapter.py @@ -13,6 +13,7 @@ from data_designer.config.models import ModelConfig from data_designer.interface.data_designer import DataDesigner +from anonymizer.engine.ndd import adapter as ndd_adapter from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, NddAdapter from anonymizer.interface.errors import AnonymizerWorkflowError @@ -60,6 +61,10 @@ def _make_columns() -> list[ColumnConfigT]: ] +def test_as_alias_list_drops_none_items_before_stringifying() -> None: + assert ndd_adapter._as_alias_list(["validator", None, "", 0]) == ["validator", "0"] + + def 
test_attach_record_ids_adds_deterministic_ids() -> None: adapter = NddAdapter(data_designer=Mock(spec=DataDesigner)) input_df = pd.DataFrame({"text": ["a", "b"]}) diff --git a/tests/fixtures/measurement/benchmark-output/detection-artifacts.jsonl b/tests/fixtures/measurement/benchmark-output/detection-artifacts.jsonl new file mode 100644 index 00000000..c656eaa0 --- /dev/null +++ b/tests/fixtures/measurement/benchmark-output/detection-artifacts.jsonl @@ -0,0 +1,2 @@ +{"suite_id":"suite","workload_id":"bio","config_id":"default","repetition":0,"case_id":"bio__default__r000","run_id":"bio__default__r000","workflow_name":"entity-detection","seed_entity_count":13,"seed_validation_candidate_count":13,"augmented_entity_count":1,"augmented_new_final_value_count":1,"final_entity_count":14,"final_source_counts":{"detector":11,"augmenter":3},"final_entity_signature_hashes":["bio-hash-a","bio-hash-b"],"final_entity_signature_labels":{"bio-hash-a":"person","bio-hash-b":"city"},"final_entity_signature_details":{"bio-hash-a":{"label":"person","source":"detector","row_index":0,"start_position":0,"end_position":5,"value_hash":"hash-person","value_length":5}},"final_entity_signature_count":2} +{"suite_id":"suite","workload_id":"shell","config_id":"native-local","repetition":0,"case_id":"shell__native-local__r000","run_id":"shell__native-local__r000","workflow_name":"native-single-pass","seed_entity_count":8,"seed_validation_candidate_count":0,"augmented_entity_count":0,"augmented_new_final_value_count":0,"final_entity_count":8,"final_source_counts":{"augmenter":8},"final_entity_signature_hashes":["shell-hash-a"],"final_entity_signature_labels":{"shell-hash-a":"api_key"},"final_entity_signature_details":{"shell-hash-a":{"label":"api_key","source":"native","row_index":0,"start_position":12,"end_position":32,"value_hash":"hash-secret","value_length":20}},"final_entity_signature_count":1} diff --git a/tests/fixtures/measurement/benchmark-output/measurements.jsonl 
b/tests/fixtures/measurement/benchmark-output/measurements.jsonl new file mode 100644 index 00000000..c776a21d --- /dev/null +++ b/tests/fixtures/measurement/benchmark-output/measurements.jsonl @@ -0,0 +1,5 @@ +{"record_type":"ndd_workflow","run_id":"bio__default__r000","workflow_name":"entity-detection","elapsed_sec":8.5,"observed_total_requests":4,"observed_successful_requests":3,"observed_input_tokens":5000,"observed_output_tokens":1000,"observed_total_tokens":6000,"observed_failed_requests":1,"model_usage":{"nvidia/gliner-pii":{"request_usage":{"successful_requests":1,"failed_requests":0,"total_requests":1},"token_usage":{"input_tokens":1000,"output_tokens":100,"total_tokens":1100}},"local-nemotron-json":{"model_alias":"local-nemotron-json","model_name":"nvidia/nemotron-3-super","model_provider_name":"local-vllm","request_usage":{"successful_requests":2,"failed_requests":1,"total_requests":3},"token_usage":{"input_tokens":4000,"output_tokens":900,"total_tokens":4900}}},"detect":{"validation_max_entities_per_call":10},"run_tags":{"suite_id":"suite","workload_id":"bio","workload_category":"synthetic_biography","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","entity_label_set_id":"agent","entity_label_count":4,"gliner_threshold":0.3,"topology_endpoint_count":2,"topology_gpu_count":4,"topology_tensor_parallelism":2,"repetition":0,"case_id":"bio__default__r000"}} 
+{"record_type":"stage","run_id":"bio__default__r000","stage":"Anonymizer._run_internal","elapsed_sec":10.0,"status":"completed","run_tags":{"suite_id":"suite","workload_id":"bio","workload_category":"synthetic_biography","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","entity_label_set_id":"agent","entity_label_count":4,"gliner_threshold":0.3,"topology_endpoint_count":2,"topology_gpu_count":4,"topology_tensor_parallelism":2,"repetition":0,"case_id":"bio__default__r000"}} +{"record_type":"record","run_id":"bio__default__r000","text_length_tokens":1200,"final_entity_count":14,"ground_truth_entity_count":20,"entity_true_positive_count":10,"entity_false_positive_count":4,"entity_false_negative_count":10,"entity_relaxed_gt_found_count":15,"entity_relaxed_detected_tp_count":14,"entity_relaxed_label_compatible_gt_found_count":13,"entity_relaxed_label_compatible_detected_tp_count":12,"replacement_count":12,"replacement_missing_final_entity_count":2,"replacement_missing_final_entity_label_counts":{"date":2},"replacement_missing_final_value_count":1,"replacement_synthetic_original_collision_count":1,"replacement_synthetic_original_collision_label_counts":{"date":1},"replacement_synthetic_original_collision_value_count":1,"original_value_leak_count":0,"original_value_leak_label_counts":{},"run_tags":{"suite_id":"suite","workload_id":"bio","workload_category":"synthetic_biography","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","entity_label_set_id":"agent","entity_label_count":4,"gliner_threshold":0.3,"repetition":0,"case_id":"bio__default__r000"}} 
+{"record_type":"record","run_id":"bio__default__r000","text_length_tokens":300,"final_entity_count":0,"ground_truth_entity_count":2,"entity_true_positive_count":0,"entity_false_positive_count":0,"entity_false_negative_count":2,"entity_relaxed_gt_found_count":0,"entity_relaxed_detected_tp_count":0,"entity_relaxed_label_compatible_gt_found_count":0,"entity_relaxed_label_compatible_detected_tp_count":0,"replacement_count":0,"replacement_missing_final_entity_count":0,"replacement_missing_final_entity_label_counts":{},"replacement_missing_final_value_count":0,"replacement_synthetic_original_collision_count":0,"replacement_synthetic_original_collision_label_counts":{},"replacement_synthetic_original_collision_value_count":0,"original_value_leak_count":0,"original_value_leak_label_counts":{},"run_tags":{"suite_id":"suite","workload_id":"bio","workload_category":"synthetic_biography","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","entity_label_set_id":"agent","entity_label_count":4,"gliner_threshold":0.3,"repetition":0,"case_id":"bio__default__r000"}} +{"record_type":"record","run_id":"shell__native-local__r000","text_length_tokens":750,"final_entity_count":8,"replacement_count":8,"replacement_missing_final_entity_count":0,"replacement_missing_final_entity_label_counts":{},"replacement_missing_final_value_count":0,"replacement_synthetic_original_collision_count":0,"replacement_synthetic_original_collision_label_counts":{},"replacement_synthetic_original_collision_value_count":0,"original_value_leak_count":1,"original_value_leak_label_counts":{"api_key":1},"run_tags":{"suite_id":"suite","workload_id":"shell","config_id":"native-local","experimental_detection_strategy":"native_single_pass","experimental_replacement_strategy":"custom_replacement_strategy","dd_parser_compat":"raw_json","repetition":0,"case_id":"shell__native-local__r000"}} diff --git 
a/tests/fixtures/measurement/benchmark-output/traces/bio__default__r000.jsonl b/tests/fixtures/measurement/benchmark-output/traces/bio__default__r000.jsonl new file mode 100644 index 00000000..67ce5dc4 --- /dev/null +++ b/tests/fixtures/measurement/benchmark-output/traces/bio__default__r000.jsonl @@ -0,0 +1,2 @@ +{"record_type":"dd_message_trace","run_id":"bio__default__r000","workflow_name":"entity-detection","model_alias":"local-nemotron-json","status":"error","error_type":"SyncClientUnavailableError","is_async":false,"messages":[{"role":"user","content":"Alice has sk-test"}],"response":"Alice still has sk-test","run_tags":{"suite_id":"suite","workload_id":"bio","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","repetition":0,"case_id":"bio__default__r000"}} +{"record_type":"dd_message_trace","run_id":"bio__default__r000","workflow_name":"entity-detection","model_alias":"local-nemotron-json","status":"success","is_async":true,"messages":[{"role":"user","content":"sk-test"}],"response":"Alice","run_tags":{"suite_id":"suite","workload_id":"bio","config_id":"default","experimental_detection_strategy":"default","experimental_replacement_strategy":"default","dd_parser_compat":"raw_json","repetition":0,"case_id":"bio__default__r000"}} diff --git a/tests/test_measurement.py b/tests/test_measurement.py new file mode 100644 index 00000000..5be9e361 --- /dev/null +++ b/tests/test_measurement.py @@ -0,0 +1,1794 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import Any, cast +from unittest.mock import Mock + +import numpy as np +import pandas as pd +import pytest +from data_designer.config.column_configs import CustomColumnConfig, LLMStructuredColumnConfig, LLMTextColumnConfig +from data_designer.config.column_types import ColumnConfigT +from data_designer.config.custom_column import custom_column_generator +from data_designer.config.models import ModelConfig +from data_designer.config.run_config import RunConfig +from data_designer.config.utils.constants import TRACE_COLUMN_POSTFIX +from data_designer.config.utils.trace_type import TraceType +from data_designer.interface.data_designer import DataDesigner + +import anonymizer.measurement as measurement +from anonymizer.config.anonymizer_config import AnonymizerConfig, AnonymizerInput, Detect +from anonymizer.config.models import DetectionModelSelection +from anonymizer.config.replace_strategies import Redact +from anonymizer.engine.constants import ( + COL_ANY_HIGH_LEAKED, + COL_FINAL_ENTITIES, + COL_LEAKAGE_MASS, + COL_NEEDS_HUMAN_REVIEW, + COL_NEEDS_REPAIR, + COL_REPAIR_ITERATIONS, + COL_REPLACED_TEXT, + COL_REPLACEMENT_MAP, + COL_SEED_VALIDATION_CANDIDATES, + COL_TEXT, + COL_UTILITY_SCORE, + COL_WEIGHTED_LEAKAGE_RATE, +) +from anonymizer.engine.detection.detection_workflow import EntityDetectionResult, EntityDetectionWorkflow +from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, NddAdapter, WorkflowRunResult +from anonymizer.engine.replace.replace_runner import ReplacementResult, ReplacementWorkflow +from anonymizer.engine.rewrite.rewrite_workflow import RewriteResult, RewriteWorkflow +from anonymizer.interface.anonymizer import Anonymizer +from anonymizer.interface.errors import AnonymizerWorkflowError +from anonymizer.measurement import ( + DEFAULT_MEASUREMENT_ENV_PREFIX, + 
MEASUREMENT_SCHEMA_VERSION, + MeasurementCollector, + MeasurementConfig, + configured_measurement_session, + estimate_llm_calls_by_stage, + measurement_session, + record_record_metrics, + stage_timer, +) + + +class _FailingSink: + def __init__(self, message: str) -> None: + self.message = message + + def write_record(self, record: dict[str, Any]) -> None: + _ = record + raise OSError(self.message) + + def close(self) -> None: + pass + + +@pytest.fixture +def trace_input_df() -> pd.DataFrame: + return pd.DataFrame({"text": ["Alice works at Acme"], RECORD_ID_COLUMN: ["record-a"]}) + + +def _trace_model_configs() -> list[ModelConfig]: + return [ModelConfig(alias="alias", model="dummy-model", provider="provider")] + + +def _run_entity_detection_preview( + adapter: NddAdapter, + input_df: pd.DataFrame, + columns: list[ColumnConfigT], +) -> WorkflowRunResult: + return adapter.run_workflow( + input_df, + model_configs=_trace_model_configs(), + columns=columns, + workflow_name="entity-detection", + preview_num_records=1, + ) + + +def _raw_detected_text_column() -> LLMTextColumnConfig: + return LLMTextColumnConfig(name="raw_detected", prompt="{{ text }}", model_alias="alias") + + +def test_ndd_adapter_records_workflow_measurement_without_raw_text() -> None: + input_df = pd.DataFrame( + { + "text": ["Alice works at Acme", "Bob works at Beta"], + RECORD_ID_COLUMN: ["record-a", "record-b"], + } + ) + mock_dd = Mock(spec=DataDesigner) + mock_dd.preview.return_value = SimpleNamespace(dataset=input_df.iloc[[0]].copy()) + adapter = NddAdapter(data_designer=mock_dd) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + result = adapter.run_workflow( + input_df, + model_configs=[ModelConfig(alias="detector", model="dummy")], + columns=[ + LLMTextColumnConfig( + name="raw_detected", + prompt="{{ text }}", + model_alias="detector", + ) + ], + workflow_name="entity-detection", + preview_num_records=2, + ) + + assert 
len(result.failed_records) == 1 + records = [record for record in collector.records if record["record_type"] == "ndd_workflow"] + assert len(records) == 1 + record = records[0] + assert record["workflow_name"] == "entity-detection" + assert record["model_aliases"] == ["detector"] + assert record["input_row_count"] == 2 + assert record["seed_row_count"] == 2 + assert record["output_row_count"] == 1 + assert record["failed_record_count"] == 1 + assert record["elapsed_sec"] >= 0 + + serialized = json.dumps(record) + assert "Alice" not in serialized + assert "Acme" not in serialized + assert "Bob" not in serialized + + +def test_ndd_adapter_records_datadesigner_model_usage() -> None: + input_df = pd.DataFrame( + { + "text": ["Alice works at Acme"], + RECORD_ID_COLUMN: ["record-a"], + } + ) + + class UsageStats: + def model_dump(self, *, mode: str) -> dict[str, object]: + assert mode == "json" + return { + "token_usage": { + "input_tokens": 12, + "output_tokens": 4, + "total_tokens": 16, + }, + "request_usage": { + "successful_requests": 2, + "failed_requests": 1, + "total_requests": 3, + }, + } + + class ModelRegistry: + def get_model_usage_snapshot(self) -> dict[str, UsageStats]: + return {"dummy-model": UsageStats()} + + class UsageDataDesigner: + def _create_resource_provider(self, *_args: object, **_kwargs: object) -> SimpleNamespace: + return SimpleNamespace(model_registry=ModelRegistry()) + + def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespace: + self._create_resource_provider("preview-dataset", _config_builder) + return SimpleNamespace(dataset=input_df.iloc[:num_records].copy()) + + adapter = NddAdapter(data_designer=cast(DataDesigner, UsageDataDesigner())) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + adapter.run_workflow( + input_df, + model_configs=[ModelConfig(alias="detector", model="dummy")], + columns=[ + LLMTextColumnConfig( + name="raw_detected", + prompt="{{ text 
}}", + model_alias="detector", + ) + ], + workflow_name="entity-detection", + preview_num_records=1, + ) + + record = next(record for record in collector.records if record["record_type"] == "ndd_workflow") + assert record["model_usage"]["dummy-model"]["token_usage"]["input_tokens"] == 12 + assert record["observed_input_tokens"] == 12 + assert record["observed_output_tokens"] == 4 + assert record["observed_total_tokens"] == 16 + assert record["observed_successful_requests"] == 2 + assert record["observed_failed_requests"] == 1 + assert record["observed_total_requests"] == 3 + assert record["input_rows_per_sec"] >= 0 + assert record["output_rows_per_sec"] >= 0 + assert record["observed_tokens_per_sec"] >= 0 + assert record["observed_requests_per_sec"] >= 0 + assert record["observed_tokens_per_successful_request"] == 8 + + +def test_ndd_adapter_records_datadesigner_model_usage_by_alias_for_shared_model_names() -> None: + input_df = pd.DataFrame( + { + "text": ["Alice works at Acme"], + RECORD_ID_COLUMN: ["record-a"], + } + ) + + class UsageStats: + has_usage = True + + def __init__(self, *, input_tokens: int, output_tokens: int, successful: int, failed: int) -> None: + self.input_tokens = input_tokens + self.output_tokens = output_tokens + self.successful = successful + self.failed = failed + + def model_dump(self, *, mode: str) -> dict[str, object]: + assert mode == "json" + return { + "token_usage": { + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "total_tokens": self.input_tokens + self.output_tokens, + }, + "request_usage": { + "successful_requests": self.successful, + "failed_requests": self.failed, + "total_requests": self.successful + self.failed, + }, + } + + class ModelRegistry: + def __init__(self) -> None: + self._models = { + "validator": SimpleNamespace( + model_alias="validator", + model_name="shared-model", + model_provider_name="local-vllm", + usage_stats=UsageStats(input_tokens=12, output_tokens=4, successful=2, 
failed=1), + ), + "augmenter": SimpleNamespace( + model_alias="augmenter", + model_name="shared-model", + model_provider_name="local-vllm", + usage_stats=UsageStats(input_tokens=20, output_tokens=8, successful=1, failed=0), + ), + } + + def get_model_usage_snapshot(self) -> dict[str, UsageStats]: + return { + "shared-model": UsageStats(input_tokens=999, output_tokens=999, successful=99, failed=99), + } + + class UsageDataDesigner: + def _create_resource_provider(self, *_args: object, **_kwargs: object) -> SimpleNamespace: + return SimpleNamespace(model_registry=ModelRegistry()) + + def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespace: + self._create_resource_provider("preview-dataset", _config_builder) + return SimpleNamespace(dataset=input_df.iloc[:num_records].copy()) + + adapter = NddAdapter(data_designer=cast(DataDesigner, UsageDataDesigner())) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + adapter.run_workflow( + input_df, + model_configs=[ModelConfig(alias="validator", model="shared-model")], + columns=[ + LLMTextColumnConfig( + name="raw_detected", + prompt="{{ text }}", + model_alias="validator", + ) + ], + workflow_name="entity-detection", + preview_num_records=1, + ) + + record = next(record for record in collector.records if record["record_type"] == "ndd_workflow") + assert sorted(record["model_usage"]) == ["augmenter", "validator"] + assert record["model_usage"]["validator"]["model_alias"] == "validator" + assert record["model_usage"]["validator"]["model_name"] == "shared-model" + assert record["model_usage"]["validator"]["model_provider_name"] == "local-vllm" + assert record["model_usage"]["validator"]["token_usage"]["input_tokens"] == 12 + assert record["model_usage"]["augmenter"]["token_usage"]["input_tokens"] == 20 + assert record["observed_input_tokens"] == 32 + assert record["observed_output_tokens"] == 12 + assert record["observed_total_tokens"] == 44 + 
assert record["observed_successful_requests"] == 3 + assert record["observed_failed_requests"] == 1 + assert record["observed_total_requests"] == 4 + + +def test_records_generic_model_workflow_usage_without_raw_text() -> None: + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + assert hasattr(measurement, "record_model_workflow") + measurement.record_model_workflow( + workflow_name="entity-detection-native-rules-router", + model_aliases=["native-direct"], + input_row_count=1, + output_row_count=1, + failed_record_count=0, + elapsed_sec=0.25, + model_usage={ + "native-direct": { + "model_alias": "native-direct", + "model_name": "nvidia/nemotron-3-super", + "model_provider_name": "local-vllm", + "request_usage": { + "successful_requests": 3, + "failed_requests": 0, + "total_requests": 3, + }, + "token_usage": { + "input_tokens": 30, + "output_tokens": 12, + "total_tokens": 42, + }, + }, + }, + ) + + records = [record for record in collector.records if record["record_type"] == "model_workflow"] + assert len(records) == 1 + record = records[0] + assert record["workflow_name"] == "entity-detection-native-rules-router" + assert record["model_aliases"] == ["native-direct"] + assert record["observed_total_requests"] == 3 + assert record["observed_input_tokens"] == 30 + assert record["observed_output_tokens"] == 12 + assert record["observed_total_tokens"] == 42 + assert record["observed_failed_request_rate"] == 0 + assert record["observed_tokens_per_successful_request"] == 14 + + serialized = json.dumps(record) + assert "Alice" not in serialized + assert "sk-test" not in serialized + + +def test_anonymizer_records_per_record_measurement_without_raw_pii(tmp_path: Path) -> None: + input_csv = tmp_path / "input.csv" + pd.DataFrame({"text": ["Alice works at Acme"]}).to_csv(input_csv, index=False) + final_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + 
{"value": "Acme", "label": "company_name", "start_position": 15, "end_position": 19}, + ] + } + validation_candidates = { + "candidates": [ + {"value": "Alice", "label": "first_name"}, + {"value": "Acme", "label": "company_name"}, + ] + } + detection_workflow = Mock(spec=EntityDetectionWorkflow) + detection_workflow.run.return_value = EntityDetectionResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_FINAL_ENTITIES: [final_entities], + COL_SEED_VALIDATION_CANDIDATES: [validation_candidates], + } + ), + failed_records=[], + ) + replace_runner = Mock(spec=ReplacementWorkflow) + replace_runner.run.return_value = ReplacementResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_REPLACED_TEXT: ["[REDACTED] works at [REDACTED]"], + COL_FINAL_ENTITIES: [final_entities], + COL_SEED_VALIDATION_CANDIDATES: [validation_candidates], + } + ), + failed_records=[], + ) + rewrite_runner = Mock(spec=RewriteWorkflow) + rewrite_runner.run.return_value = RewriteResult(dataframe=pd.DataFrame(), failed_records=[]) + anonymizer = Anonymizer( + detection_workflow=detection_workflow, + replace_runner=replace_runner, + rewrite_runner=rewrite_runner, + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + anonymizer.run( + config=AnonymizerConfig(replace=Redact(), detect=Detect(validation_max_entities_per_call=2)), + data=AnonymizerInput(source=str(input_csv)), + ) + + record_metrics = [record for record in collector.records if record["record_type"] == "record"] + assert len(record_metrics) == 1 + record = record_metrics[0] + assert record["mode"] == "replace" + assert record["strategy"] == "Redact" + assert record["text_length_chars"] == len("Alice works at Acme") + assert record["text_length_chars_bucket"] == "1-127" + assert record["text_length_tokens"] > 0 + assert record["text_length_tokens_bucket"] == "1-127" + assert record["final_entity_count"] == 2 + assert 
record["final_entity_label_counts"] == {"company_name": 1, "first_name": 1} + assert record["detected_candidate_count"] == 2 + assert record["validation_chunk_count"] == 1 + assert record["original_value_leak_count"] == 0 + assert record["original_value_leak_label_counts"] == {} + assert record["llm_calls_estimated_by_stage"] == { + "entity_detection": 3, + "replace_map_generation": 0, + } + assert record["llm_calls_estimated_total"] == 3 + assert len(record["record_hash"]) == 64 + + stage_records = [record for record in collector.records if record["record_type"] == "stage"] + assert any(record["stage"] == "Anonymizer._run_internal" for record in stage_records) + assert any(record.get("input_rows_per_sec") is not None for record in stage_records) + + run_records = [record for record in collector.records if record["record_type"] == "run"] + assert len(run_records) == 1 + run_record = run_records[0] + assert run_record["mode"] == "replace" + assert run_record["strategy"] == "Redact" + assert run_record["input_row_count"] == 1 + assert run_record["input_source"] == {"kind": "local_file", "scheme": None, "suffix": ".csv"} + assert run_record["input_text_column"] == "text" + assert run_record["input_has_id_column"] is False + assert run_record["input_has_data_summary"] is False + assert run_record["detect"]["entity_label_source"] == "default" + assert run_record["detect"]["entity_label_count"] > 0 + assert run_record["replace"]["strategy"] == "Redact" + assert run_record["replace"]["normalize_label"] is True + assert len(run_record["source_hash"]) == 64 + + serialized = json.dumps(collector.records) + assert "Alice" not in serialized + assert "Acme" not in serialized + assert str(input_csv) not in serialized + + +def test_anonymizer_measurement_config_writes_jsonl(tmp_path: Path) -> None: + input_csv = tmp_path / "input.csv" + output_jsonl = tmp_path / "measurements.jsonl" + pd.DataFrame({"text": ["Alice works at Acme"]}).to_csv(input_csv, index=False) + final_entities 
= { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + ] + } + detection_workflow = Mock(spec=EntityDetectionWorkflow) + detection_workflow.run.return_value = EntityDetectionResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_FINAL_ENTITIES: [final_entities], + COL_SEED_VALIDATION_CANDIDATES: [{"candidates": final_entities["entities"]}], + } + ), + failed_records=[], + ) + replace_runner = Mock(spec=ReplacementWorkflow) + replace_runner.run.return_value = ReplacementResult( + dataframe=pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_REPLACED_TEXT: ["[REDACTED] works at Acme"], + COL_FINAL_ENTITIES: [final_entities], + COL_SEED_VALIDATION_CANDIDATES: [{"candidates": final_entities["entities"]}], + } + ), + failed_records=[], + ) + anonymizer = Anonymizer( + detection_workflow=detection_workflow, + replace_runner=replace_runner, + rewrite_runner=Mock(spec=RewriteWorkflow), + ) + + with configured_measurement_session( + MeasurementConfig( + output_path=output_jsonl, + run_id="measurement-run", + record_hash_key="test-key", + run_tags={"config_id": "redact-default", "workload_id": "unit-small"}, + ) + ): + anonymizer.run( + config=AnonymizerConfig(replace=Redact()), + data=AnonymizerInput(source=str(input_csv)), + ) + + records = [json.loads(line) for line in output_jsonl.read_text(encoding="utf-8").splitlines()] + assert {record["record_type"] for record in records} >= {"record", "run", "stage"} + assert {record["run_id"] for record in records} == {"measurement-run"} + assert {record["schema_version"] for record in records} == {MEASUREMENT_SCHEMA_VERSION} + assert {record["run_tags"]["workload_id"] for record in records} == {"unit-small"} + assert all(isinstance(record["timestamp_unix_sec"], float) for record in records) + + serialized = json.dumps(records) + assert "Alice" not in serialized + assert "Acme" not in serialized + assert str(input_csv) not in serialized + + +def 
test_measurement_records_write_strict_json_safe_values(tmp_path: Path) -> None: + output_jsonl = tmp_path / "measurements.jsonl" + collector = MeasurementCollector(record_hash_key="test-key") + collector.record("run", non_finite=float("nan"), mixed_set={1, "two"}) + + collector.write_jsonl(output_jsonl) + + payload = json.loads(output_jsonl.read_text(encoding="utf-8")) + assert payload["non_finite"] is None + assert payload["mixed_set"] == [1, "two"] + + +def test_measurement_config_record_level_false_skips_record_rows(tmp_path: Path) -> None: + output_json = tmp_path / "measurements.json" + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_FINAL_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + } + ) + + with configured_measurement_session( + MeasurementConfig( + output_path=output_json, + output_format="json", + record_level=False, + run_id="stage-only", + record_hash_key="test-key", + ) + ): + with stage_timer("example", input_row_count=1): + pass + record_record_metrics( + dataframe, + mode="replace", + strategy="Redact", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + records = json.loads(output_json.read_text(encoding="utf-8")) + assert [record["record_type"] for record in records] == ["stage"] + assert records[0]["run_id"] == "stage-only" + assert records[0]["input_rows_per_sec"] >= 0 + + +def test_measurement_config_from_env_returns_none_without_output_path( + monkeypatch: pytest.MonkeyPatch, +) -> None: + prefix = "ANON_TEST_EMPTY_MEASUREMENT_" + monkeypatch.setenv(f"{prefix}RUN_ID", "env-run") + + assert MeasurementConfig.from_env(prefix=prefix) is None + + +def test_measurement_config_from_env_parses_supported_values( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prefix = "ANON_TEST_MEASUREMENT_" + output_path = tmp_path / "measurements.json" + monkeypatch.setenv(f"{prefix}OUTPUT_PATH", str(output_path)) + monkeypatch.setenv(f"{prefix}OUTPUT_FORMAT", "json") + 
monkeypatch.setenv(f"{prefix}RECORD_LEVEL", "false") + monkeypatch.setenv(f"{prefix}FAIL_ON_WRITE_ERROR", "true") + monkeypatch.setenv(f"{prefix}RUN_ID", "env-run") + monkeypatch.setenv(f"{prefix}RUN_TAGS", '{"config_id": "redact-default", "attempt": 2}') + + config = MeasurementConfig.from_env(prefix=prefix) + + assert config is not None + assert config.output_path == str(output_path) + assert config.output_format == "json" + assert config.record_level is False + assert config.fail_on_write_error is True + assert config.run_id == "env-run" + assert config.run_tags == {"config_id": "redact-default", "attempt": 2} + + +def test_measurement_config_from_sources_keeps_env_opt_in( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prefix = "ANON_TEST_MEASUREMENT_" + monkeypatch.setenv(f"{prefix}OUTPUT_PATH", str(tmp_path / "env.jsonl")) + explicit = MeasurementConfig(output_path=tmp_path / "explicit.jsonl") + + assert MeasurementConfig.from_sources(env=False, prefix=prefix) is None + assert MeasurementConfig.from_sources(explicit=explicit, env=True, prefix=prefix) is explicit + + +def test_measurement_config_from_env_reports_sanitized_invalid_values( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + prefix = "ANON_TEST_MEASUREMENT_" + secret_payload = "sk-secret-token-value" + monkeypatch.setenv(f"{prefix}OUTPUT_PATH", str(tmp_path / "measurements.jsonl")) + monkeypatch.setenv(f"{prefix}RUN_TAGS", secret_payload) + + with pytest.raises(ValueError) as exc_info: + MeasurementConfig.from_env(prefix=prefix) + + message = str(exc_info.value) + assert f"{prefix}RUN_TAGS" in message + assert secret_payload not in message + assert str(tmp_path) not in message + + +def test_default_measurement_env_prefix_is_anonymizer_scoped() -> None: + assert DEFAULT_MEASUREMENT_ENV_PREFIX == "ANONYMIZER_MEASUREMENT_" + + +def test_measurement_config_write_errors_are_best_effort( + caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch, + tmp_path: 
Path, +) -> None: + def raise_write_error(_self: MeasurementConfig, _collector: MeasurementCollector) -> None: + raise OSError(f"cannot write {_self.output_path}") + + monkeypatch.setattr(MeasurementConfig, "write_collector", raise_write_error) + caplog.set_level(logging.WARNING, logger="anonymizer.measurement") + output_path = tmp_path / "secret-output-sk-live-value.jsonl" + + with configured_measurement_session(MeasurementConfig(output_path=output_path)) as collector: + assert collector is not None + collector.record("example") + + assert "Failed to write Anonymizer measurement records" in caplog.text + assert str(output_path) not in caplog.text + assert "sk-live-value" not in caplog.text + + +def test_measurement_config_strict_write_errors_can_fail_clean_body( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + def raise_write_error(_self: MeasurementConfig, _collector: MeasurementCollector) -> None: + raise OSError("cannot write") + + monkeypatch.setattr(MeasurementConfig, "write_collector", raise_write_error) + + with pytest.raises(OSError, match="cannot write"): + with configured_measurement_session( + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", fail_on_write_error=True) + ) as collector: + assert collector is not None + collector.record("example") + + +def test_measurement_config_strict_write_errors_still_close_collector( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + closed_run_ids: list[str] = [] + + def raise_write_error(_self: MeasurementConfig, _collector: MeasurementCollector) -> None: + raise OSError("cannot write") + + def close_collector(self: MeasurementCollector) -> None: + closed_run_ids.append(self.run_id) + + monkeypatch.setattr(MeasurementConfig, "write_collector", raise_write_error) + monkeypatch.setattr(MeasurementCollector, "close", close_collector) + + with pytest.raises(OSError, match="cannot write"): + with configured_measurement_session( + MeasurementConfig( + output_path=tmp_path / 
"measurements.jsonl", + fail_on_write_error=True, + run_id="strict-write-run", + ) + ) as collector: + assert collector is not None + collector.record("example") + + assert closed_run_ids == ["strict-write-run"] + + +def test_measurement_config_write_errors_do_not_mask_body_errors( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + def raise_write_error(_self: MeasurementConfig, _collector: MeasurementCollector) -> None: + raise OSError("cannot write") + + monkeypatch.setattr(MeasurementConfig, "write_collector", raise_write_error) + + with pytest.raises(RuntimeError, match="body failed"): + with configured_measurement_session( + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", fail_on_write_error=True) + ) as collector: + assert collector is not None + collector.record("example") + raise RuntimeError("body failed") + + +def test_streaming_measurement_session_writes_jsonl_without_retaining_records(tmp_path: Path) -> None: + output_path = tmp_path / "measurements.jsonl" + + with configured_measurement_session( + MeasurementConfig(output_path=output_path, streaming=True, keep_records=False) + ) as collector: + assert collector is not None + collector.record("example", value=1) + + assert collector.records == [] + assert output_path.read_text(encoding="utf-8").count("\n") == 1 + + collector.record("example", value=2) + + lines = output_path.read_text(encoding="utf-8").splitlines() + assert len(lines) == 2 + assert [json.loads(line)["value"] for line in lines] == [1, 2] + + +def test_measurement_collector_close_attempts_all_sinks_after_failure() -> None: + close_events: list[str] = [] + + class FakeSink: + def __init__(self, name: str, *, fail: bool = False) -> None: + self.name = name + self.fail = fail + + def write_record(self, record: dict[str, Any]) -> None: + _ = record + + def close(self) -> None: + close_events.append(self.name) + if self.fail: + raise OSError(f"{self.name} close failed") + + collector = MeasurementCollector( + 
record_sink=FakeSink("records", fail=True), + dd_trace_sink=FakeSink("dd-trace"), + dd_task_trace_sink=FakeSink("dd-task-trace"), + ) + + with pytest.raises(OSError, match="records close failed"): + collector.close() + + assert close_events == ["records", "dd-trace", "dd-task-trace"] + + +def test_streaming_measurement_requires_jsonl_output(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="streaming measurement output only supports jsonl"): + MeasurementConfig(output_path=tmp_path / "measurements.json", output_format="json", streaming=True) + + +def test_dd_message_trace_requires_trace_path(tmp_path: Path) -> None: + with pytest.raises(ValueError, match="dd_trace_path is required"): + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", dd_trace="last_message") + + +@pytest.mark.parametrize("structured", [False, True]) +def test_ndd_adapter_writes_native_dd_message_trace_and_strips_trace_columns( + tmp_path: Path, + *, + structured: bool, +) -> None: + input_df = pd.DataFrame( + { + "text": ["Alice works at Acme"], + f"notes{TRACE_COLUMN_POSTFIX}": ["user supplied trace-looking column"], + RECORD_ID_COLUMN: ["record-a"], + } + ) + original_column = ( + LLMStructuredColumnConfig( + name="raw_detected", + prompt="{{ text }}", + model_alias="alias", + output_format={"type": "object", "properties": {"entities": {"type": "array"}}}, + ) + if structured + else LLMTextColumnConfig( + name="raw_detected", + prompt="{{ text }}", + model_alias="alias", + ) + ) + captured_columns: list[LLMTextColumnConfig] = [] + + class TraceDataDesigner: + def preview(self, config_builder: object, *, num_records: int) -> SimpleNamespace: + traced_column = cast(Any, config_builder).get_column_config("raw_detected") + captured_columns.append(traced_column) + output = input_df.iloc[:num_records].copy() + output["raw_detected"] = "[]" + output[f"raw_detected{TRACE_COLUMN_POSTFIX}"] = [ + np.array( + [ + {"role": "system", "content": [{"type": "text", "text": "system 
secret"}]}, + {"role": "user", "content": [{"type": "text", "text": "prompt secret"}]}, + {"role": "assistant", "content": "secret response", "reasoning_content": "scratch"}, + ], + dtype=object, + ) + ] + return SimpleNamespace(dataset=output) + + adapter = NddAdapter(data_designer=cast(DataDesigner, TraceDataDesigner())) + trace_path = tmp_path / "trace.jsonl" + + with configured_measurement_session( + MeasurementConfig( + output_path=tmp_path / "measurements.jsonl", dd_trace="last_message", dd_trace_path=trace_path + ) + ): + result = _run_entity_detection_preview(adapter, input_df, [original_column]) + + assert original_column.with_trace == TraceType.NONE + assert captured_columns[0].with_trace == TraceType.ALL_MESSAGES + assert f"raw_detected{TRACE_COLUMN_POSTFIX}" not in result.dataframe.columns + assert f"notes{TRACE_COLUMN_POSTFIX}" in result.dataframe.columns + + trace = json.loads(trace_path.read_text(encoding="utf-8").strip()) + assert trace["record_type"] == "dd_message_trace" + assert trace["trace_source"] == "data_designer_column" + assert trace["workflow_name"] == "entity-detection" + assert trace["model_alias"] == "alias" + assert trace["model_name"] == "dummy-model" + assert trace["model_provider_name"] == "provider" + assert trace["status"] == "completed" + assert trace["messages"] == [{"role": "user", "content": [{"type": "text", "text": "prompt secret"}]}] + assert trace["response"]["content"] == "secret response" + assert trace["usage"] is None + + measurements = [json.loads(line) for line in (tmp_path / "measurements.jsonl").read_text().splitlines()] + coverage = [record for record in measurements if record["record_type"] == "dd_trace_coverage"] + assert len(coverage) == 1 + assert coverage[0]["traced_column_count"] == 1 + assert coverage[0]["unsupported_column_count"] == 0 + + serialized_measurements = json.dumps(measurements) + assert "prompt secret" not in serialized_measurements + assert "secret response" not in serialized_measurements + + 
+class _TraceModelFacade: + model_alias = "alias" + model_name = "dummy-model" + model_provider_name = "provider" + model_provider = SimpleNamespace(endpoint="http://provider/v1") + + def generate(self, prompt: str, **_kwargs: Any) -> str: + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + return self.completion(messages).message.content + + def completion(self, _messages: list[dict[str, Any]]) -> SimpleNamespace: + return SimpleNamespace( + message=SimpleNamespace(content="custom response secret", reasoning_content="scratch", tool_calls=[]), + usage=SimpleNamespace(input_tokens=3, output_tokens=5, total_tokens=8), + ) + + +class _TraceModelRegistry: + def __init__(self, facade: Any | None = None) -> None: + self._models = {"alias": facade or _TraceModelFacade()} + + def get_model(self, *, model_alias: str) -> _TraceModelFacade: + return self._models[model_alias] + + +class _CustomTraceDataDesigner: + def __init__(self, input_df: pd.DataFrame, *, facade: Any | None = None) -> None: + self.input_df = input_df + self.resource_provider = SimpleNamespace(model_registry=_TraceModelRegistry(facade)) + + def _create_resource_provider(self, *_args: Any, **_kwargs: Any) -> SimpleNamespace: + return self.resource_provider + + def preview(self, config_builder: object, *, num_records: int) -> SimpleNamespace: + resource_provider = self._create_resource_provider() + model = resource_provider.model_registry.get_model(model_alias="alias") + output = self.input_df.iloc[:num_records].copy() + for column in cast(Any, config_builder).get_column_configs(): + for row_index, row in output.iterrows(): + generated = column.generator_function( + row.to_dict(), + generator_params=None, + models={"alias": model}, + ) + for key, value in generated.items(): + output.loc[row_index, key] = value + return SimpleNamespace(dataset=output) + + +class _TaskTraceDataDesigner: + def __init__( + self, + input_df: pd.DataFrame, + *, + task_traces: list[Any] | None = None, + 
error: Exception | None = None, + ) -> None: + self.input_df = input_df + self.task_traces = task_traces or [] + self.error = error + self.run_config = RunConfig(async_trace=False) + self.async_trace_values: list[bool] = [] + + def set_run_config(self, run_config: RunConfig) -> None: + self.async_trace_values.append(run_config.async_trace) + self.run_config = run_config + + def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespace: + assert self.run_config.async_trace is True + if self.error is not None: + raise self.error + return SimpleNamespace(dataset=self.input_df.iloc[:num_records].copy(), task_traces=self.task_traces) + + +def _custom_trace_column(name: str, *, prompt: str, value: str) -> CustomColumnConfig: + @custom_column_generator(required_columns=["text"], model_aliases=["alias"]) + def generator( + row: dict[str, Any], + generator_params: Any, + models: dict[str, Any], + ) -> dict[str, str]: + _ = row, generator_params + models["alias"].generate(prompt) + return {name: value} + + return CustomColumnConfig(name=name, generator_function=generator) + + +def _local_custom_column(name: str, *, value: str) -> CustomColumnConfig: + @custom_column_generator(required_columns=["text"]) + def generator( + row: dict[str, Any], + generator_params: Any, + models: dict[str, Any], + ) -> dict[str, str]: + _ = row, generator_params, models + return {name: value} + + return CustomColumnConfig(name=name, generator_function=generator) + + +def test_ndd_adapter_writes_custom_column_private_model_facade_dd_trace( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + adapter = NddAdapter(data_designer=cast(DataDesigner, _CustomTraceDataDesigner(trace_input_df))) + trace_path = tmp_path / "trace.jsonl" + + with configured_measurement_session( + MeasurementConfig( + output_path=tmp_path / "measurements.jsonl", dd_trace="all_messages", dd_trace_path=trace_path + ) + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [ + 
_custom_trace_column("raw_detected", prompt="raw prompt secret", value="[]"), + _custom_trace_column("quality_check", prompt="quality prompt secret", value="ok"), + ], + ) + + traces = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()] + traces_by_column = {trace["column_name"]: trace for trace in traces} + assert set(traces_by_column) == {"raw_detected", "quality_check"} + + raw_trace = traces_by_column["raw_detected"] + assert raw_trace["record_type"] == "dd_message_trace" + assert raw_trace["trace_source"] == "anonymizer_private_model_facade" + assert raw_trace["workflow_name"] == "entity-detection" + assert raw_trace["model_alias"] == "alias" + assert raw_trace["model_name"] == "dummy-model" + assert raw_trace["model_provider_name"] == "provider" + assert raw_trace["model_provider_endpoint"] == "http://provider/v1" + assert raw_trace["status"] == "completed" + assert raw_trace["messages"] == [{"role": "user", "content": [{"type": "text", "text": "raw prompt secret"}]}] + assert raw_trace["response"]["content"] == "custom response secret" + assert raw_trace["response"]["reasoning_content"] == "scratch" + assert raw_trace["usage"] == {"input_tokens": 3, "output_tokens": 5, "total_tokens": 8} + + quality_trace = traces_by_column["quality_check"] + assert quality_trace["messages"] == [ + {"role": "user", "content": [{"type": "text", "text": "quality prompt secret"}]} + ] + + measurements = [json.loads(line) for line in (tmp_path / "measurements.jsonl").read_text().splitlines()] + coverage = [record for record in measurements if record["record_type"] == "dd_trace_coverage"] + assert len(coverage) == 1 + assert coverage[0]["trace_backend"] == "anonymizer_private_model_facade" + assert coverage[0]["traced_column_count"] == 2 + assert coverage[0]["private_trace_column_count"] == 2 + assert coverage[0]["private_trace_column_names"] == ["raw_detected", "quality_check"] + assert coverage[0]["unsupported_column_count"] == 0 + + 
serialized_measurements = json.dumps(measurements) + assert "raw prompt secret" not in serialized_measurements + assert "quality prompt secret" not in serialized_measurements + assert "custom response secret" not in serialized_measurements + + +def test_ndd_adapter_reports_untraced_custom_columns_in_dd_trace_coverage( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + adapter = NddAdapter(data_designer=cast(DataDesigner, _CustomTraceDataDesigner(trace_input_df))) + trace_path = tmp_path / "trace.jsonl" + + with configured_measurement_session( + MeasurementConfig( + output_path=tmp_path / "measurements.jsonl", + dd_trace="all_messages", + dd_trace_path=trace_path, + ) + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [ + _custom_trace_column("raw_detected", prompt="raw prompt secret", value="[]"), + _local_custom_column("local_note", value="ok"), + ], + ) + + measurements = [json.loads(line) for line in (tmp_path / "measurements.jsonl").read_text().splitlines()] + coverage = [record for record in measurements if record["record_type"] == "dd_trace_coverage"] + assert len(coverage) == 1 + assert coverage[0]["traced_column_names"] == ["raw_detected"] + assert coverage[0]["unsupported_column_count"] == 1 + assert coverage[0]["unsupported_column_names"] == ["local_note"] + + +def test_ndd_adapter_private_model_facade_trace_write_error_is_not_wrapped( + trace_input_df: pd.DataFrame, +) -> None: + adapter = NddAdapter(data_designer=cast(DataDesigner, _CustomTraceDataDesigner(trace_input_df))) + collector = MeasurementCollector( + dd_trace_mode="all_messages", + dd_trace_sink=_FailingSink("private trace sidecar unavailable"), + fail_on_write_error=True, + ) + + with measurement_session(collector), pytest.raises(OSError, match="private trace sidecar unavailable"): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_custom_trace_column("raw_detected", prompt="raw prompt secret", value="[]")], + ) + + +def 
test_ndd_adapter_flushes_private_model_facade_error_trace_when_workflow_fails( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + class FailingTraceModelFacade(_TraceModelFacade): + def completion(self, _messages: list[dict[str, Any]]) -> SimpleNamespace: + raise RuntimeError("custom model call failed") + + trace_path = tmp_path / "trace.jsonl" + adapter = NddAdapter( + data_designer=cast(DataDesigner, _CustomTraceDataDesigner(trace_input_df, facade=FailingTraceModelFacade())) + ) + + with pytest.raises(AnonymizerWorkflowError, match="Workflow failed"): + with configured_measurement_session( + MeasurementConfig( + output_path=tmp_path / "measurements.jsonl", + dd_trace="all_messages", + dd_trace_path=trace_path, + ) + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_custom_trace_column("raw_detected", prompt="raw prompt secret", value="[]")], + ) + + traces = [json.loads(line) for line in trace_path.read_text(encoding="utf-8").splitlines()] + assert len(traces) == 1 + trace = traces[0] + assert trace["record_type"] == "dd_message_trace" + assert trace["trace_source"] == "anonymizer_private_model_facade" + assert trace["workflow_name"] == "entity-detection" + assert trace["column_name"] == "raw_detected" + assert trace["status"] == "error" + assert trace["error_type"] == "RuntimeError" + assert trace["messages"] == [{"role": "user", "content": [{"type": "text", "text": "raw prompt secret"}]}] + assert "custom model call failed" not in trace_path.read_text(encoding="utf-8") + + +def test_ndd_adapter_private_model_facade_trace_write_error_does_not_mask_workflow_failure( + trace_input_df: pd.DataFrame, +) -> None: + class FailingTraceModelFacade(_TraceModelFacade): + def completion(self, _messages: list[dict[str, Any]]) -> SimpleNamespace: + raise RuntimeError("custom model call failed") + + adapter = NddAdapter( + data_designer=cast(DataDesigner, _CustomTraceDataDesigner(trace_input_df, facade=FailingTraceModelFacade())) + ) + 
collector = MeasurementCollector( + dd_trace_mode="all_messages", + dd_trace_sink=_FailingSink("private trace sidecar unavailable"), + fail_on_write_error=True, + ) + + with measurement_session(collector), pytest.raises(AnonymizerWorkflowError, match="Workflow failed"): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_custom_trace_column("raw_detected", prompt="raw prompt secret", value="[]")], + ) + + +def test_ndd_adapter_writes_sanitized_dd_task_traces_and_restores_run_config( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + task_trace = SimpleNamespace( + column="raw_detected", + row_group=0, + row_index=7, + task_type="llm", + dispatched_at=10.0, + slot_acquired_at=10.25, + completed_at=12.0, + status="error", + error="raw secret token Alice", + ) + data_designer = _TaskTraceDataDesigner(trace_input_df, task_traces=[task_trace]) + adapter = NddAdapter(data_designer=cast(DataDesigner, data_designer)) + task_trace_path = tmp_path / "task-trace.jsonl" + + with configured_measurement_session( + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", dd_task_trace_path=task_trace_path) + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_raw_detected_text_column()], + ) + + assert data_designer.async_trace_values == [True, False] + assert data_designer.run_config.async_trace is False + task_trace = json.loads(task_trace_path.read_text(encoding="utf-8").strip()) + assert task_trace["record_type"] == "dd_task_trace" + assert task_trace["workflow_name"] == "entity-detection" + assert task_trace["column"] == "raw_detected" + assert task_trace["row_group"] == 0 + assert task_trace["row_index"] == 7 + assert task_trace["task_type"] == "llm" + assert task_trace["status"] == "error" + assert task_trace["error_present"] is True + assert task_trace["dispatched_offset_sec"] == pytest.approx(0.0) + assert task_trace["slot_acquired_offset_sec"] == pytest.approx(0.25) + assert task_trace["completed_offset_sec"] == 
pytest.approx(2.0) + assert task_trace["queue_wait_sec"] == pytest.approx(0.25) + assert task_trace["execution_sec"] == pytest.approx(1.75) + assert task_trace["total_sec"] == pytest.approx(2.0) + assert "raw secret token Alice" not in task_trace_path.read_text(encoding="utf-8") + assert "raw secret token Alice" not in (tmp_path / "measurements.jsonl").read_text(encoding="utf-8") + + +def test_ndd_adapter_task_trace_handles_mapping_and_invalid_timestamps( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + task_traces = [ + { + "column": "missing_timestamps", + "row_group": 0, + "row_index": 1, + "task_type": "llm", + "status": "completed", + "error": None, + }, + { + "column": "nonpositive_timestamps", + "row_group": 0, + "row_index": 2, + "task_type": "llm", + "dispatched_at": 0.0, + "slot_acquired_at": -1.0, + "completed_at": -2.0, + "status": "completed", + "error": None, + }, + { + "column": "out_of_order_timestamps", + "row_group": 0, + "row_index": 3, + "task_type": "llm", + "dispatched_at": 20.0, + "slot_acquired_at": 19.0, + "completed_at": 18.0, + "status": "completed", + "error": None, + }, + ] + task_trace_path = tmp_path / "task-trace.jsonl" + adapter = NddAdapter( + data_designer=cast(DataDesigner, _TaskTraceDataDesigner(trace_input_df, task_traces=task_traces)) + ) + + with configured_measurement_session( + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", dd_task_trace_path=task_trace_path) + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_raw_detected_text_column()], + ) + + traces = [json.loads(line) for line in task_trace_path.read_text(encoding="utf-8").splitlines()] + traces_by_column = {trace["column"]: trace for trace in traces} + + missing = traces_by_column["missing_timestamps"] + assert missing["dispatched_offset_sec"] is None + assert missing["slot_acquired_offset_sec"] is None + assert missing["completed_offset_sec"] is None + assert missing["queue_wait_sec"] is None + assert 
missing["execution_sec"] is None + assert missing["total_sec"] is None + + nonpositive = traces_by_column["nonpositive_timestamps"] + assert nonpositive["dispatched_offset_sec"] is None + assert nonpositive["slot_acquired_offset_sec"] is None + assert nonpositive["completed_offset_sec"] is None + assert nonpositive["queue_wait_sec"] is None + assert nonpositive["execution_sec"] is None + assert nonpositive["total_sec"] is None + + out_of_order = traces_by_column["out_of_order_timestamps"] + assert out_of_order["dispatched_offset_sec"] == pytest.approx(0.0) + assert out_of_order["slot_acquired_offset_sec"] is None + assert out_of_order["completed_offset_sec"] is None + assert out_of_order["queue_wait_sec"] is None + assert out_of_order["execution_sec"] is None + assert out_of_order["total_sec"] is None + + +def test_ndd_adapter_task_trace_write_error_is_not_wrapped_as_workflow_error( + trace_input_df: pd.DataFrame, +) -> None: + task_trace = SimpleNamespace( + column="raw_detected", + row_group=0, + row_index=7, + task_type="llm", + dispatched_at=10.0, + slot_acquired_at=10.25, + completed_at=12.0, + status="completed", + error=None, + ) + adapter = NddAdapter( + data_designer=cast(DataDesigner, _TaskTraceDataDesigner(trace_input_df, task_traces=[task_trace])) + ) + collector = MeasurementCollector( + dd_task_trace_sink=_FailingSink("task trace sidecar unavailable"), + fail_on_write_error=True, + ) + + with measurement_session(collector), pytest.raises(OSError, match="task trace sidecar unavailable"): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_raw_detected_text_column()], + ) + + +def test_ndd_adapter_trace_write_error_is_not_wrapped_as_workflow_error( + trace_input_df: pd.DataFrame, +) -> None: + class TraceDataDesigner: + def preview(self, _config_builder: object, *, num_records: int) -> SimpleNamespace: + output = trace_input_df.iloc[:num_records].copy() + output["raw_detected"] = "[]" + output[f"raw_detected{TRACE_COLUMN_POSTFIX}"] = [ + [ 
+ {"role": "user", "content": [{"type": "text", "text": "prompt secret"}]}, + {"role": "assistant", "content": "secret response"}, + ] + ] + return SimpleNamespace(dataset=output) + + adapter = NddAdapter(data_designer=cast(DataDesigner, TraceDataDesigner())) + collector = MeasurementCollector( + dd_trace_mode="last_message", + dd_trace_sink=_FailingSink("trace sidecar unavailable"), + fail_on_write_error=True, + ) + + with measurement_session(collector), pytest.raises(OSError, match="trace sidecar unavailable"): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_raw_detected_text_column()], + ) + + +def test_ndd_adapter_restores_run_config_when_task_traced_workflow_fails( + tmp_path: Path, + trace_input_df: pd.DataFrame, +) -> None: + data_designer = _TaskTraceDataDesigner(trace_input_df, error=RuntimeError("raw secret failure")) + adapter = NddAdapter(data_designer=cast(DataDesigner, data_designer)) + + with pytest.raises(AnonymizerWorkflowError, match="Workflow failed"): + with configured_measurement_session( + MeasurementConfig(output_path=tmp_path / "measurements.jsonl", dd_task_trace_path=tmp_path / "task.jsonl") + ): + _run_entity_detection_preview( + adapter, + trace_input_df, + [_raw_detected_text_column()], + ) + + assert data_designer.async_trace_values == [True, False] + assert data_designer.run_config.async_trace is False + + +def test_record_metrics_capture_generic_counts_without_raw_values() -> None: + final_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + {"value": "Acme", "label": "company_name", "start_position": 15, "end_position": 19}, + ] + } + ground_truth_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + {"value": "Beta", "label": "company_name", "start_position": 15, "end_position": 19}, + ] + } + replacement_map = { + "replacements": [ + {"original": "Alice", "label": "first_name", "synthetic": 
"Maya"}, + {"original": "Acme", "label": "company_name", "synthetic": "Maya"}, + ] + } + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_FINAL_ENTITIES: [final_entities], + "ground_truth_entities": [ground_truth_entities], + COL_REPLACEMENT_MAP: [replacement_map], + COL_SEED_VALIDATION_CANDIDATES: [{"candidates": final_entities["entities"]}], + COL_REPAIR_ITERATIONS: [2], + COL_UTILITY_SCORE: [0.82], + COL_LEAKAGE_MASS: [0.2], + COL_WEIGHTED_LEAKAGE_RATE: [0.1], + COL_ANY_HIGH_LEAKED: [False], + COL_NEEDS_HUMAN_REVIEW: [True], + COL_NEEDS_REPAIR: [False], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="rewrite", + strategy="Rewrite", + text_column=COL_TEXT, + validation_max_entities_per_call=2, + ) + + record = collector.records[0] + assert record["ground_truth_entity_count"] == 2 + assert record["ground_truth_entity_label_counts"] == {"company_name": 1, "first_name": 1} + assert record["entity_true_positive_count"] == 1 + assert record["entity_false_positive_count"] == 1 + assert record["entity_false_negative_count"] == 1 + assert record["entity_precision"] == 0.5 + assert record["entity_recall"] == 0.5 + assert record["entity_f1"] == 0.5 + assert record["entity_relaxed_gt_found_count"] == 2 + assert record["entity_relaxed_detected_tp_count"] == 2 + assert record["entity_relaxed_label_compatible_gt_found_count"] == 2 + assert record["entity_relaxed_label_compatible_detected_tp_count"] == 2 + assert record["entity_relaxed_precision"] == 1.0 + assert record["entity_relaxed_recall"] == 1.0 + assert record["entity_relaxed_f1"] == 1.0 + assert record["entity_relaxed_label_compatible_precision"] == 1.0 + assert record["entity_relaxed_label_compatible_recall"] == 1.0 + assert record["entity_relaxed_label_compatible_f1"] == 1.0 + assert record["replacement_count"] == 2 + assert record["replacement_label_counts"] == {"company_name": 1, 
"first_name": 1} + assert record["replacement_duplicate_value_count"] == 1 + assert record["replacement_missing_final_entity_count"] == 0 + assert record["replacement_missing_final_entity_label_counts"] == {} + assert record["replacement_missing_final_value_count"] == 0 + assert record["replacement_synthetic_original_collision_count"] == 0 + assert record["replacement_synthetic_original_collision_label_counts"] == {} + assert record["replacement_synthetic_original_collision_value_count"] == 0 + assert record["repair_iterations"] == 2 + assert record["utility_score"] == 0.82 + assert record["leakage_mass"] == 0.2 + assert record["weighted_leakage_rate"] == 0.1 + assert record["any_high_leaked"] is False + assert record["needs_human_review"] is True + assert record["needs_repair"] is False + + serialized = json.dumps(collector.records) + assert "Alice" not in serialized + assert "Acme" not in serialized + assert "Beta" not in serialized + assert "Maya" not in serialized + + +def test_record_metrics_counts_duplicate_ground_truth_entities_by_occurrence() -> None: + final_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + ] + } + ground_truth_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + {"value": "Alice", "label": "first_name", "start_position": 18, "end_position": 23}, + ] + } + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice talked with Alice"], + COL_FINAL_ENTITIES: [final_entities], + "ground_truth_entities": [ground_truth_entities], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Redact", + text_column=COL_TEXT, + validation_max_entities_per_call=2, + ) + + record = collector.records[0] + assert record["ground_truth_entity_count"] == 2 + assert record["ground_truth_entity_label_counts"] == 
{"first_name": 2} + assert record["entity_true_positive_count"] == 1 + assert record["entity_false_positive_count"] == 0 + assert record["entity_false_negative_count"] == 1 + assert record["entity_precision"] == 1.0 + assert record["entity_recall"] == 0.5 + assert record["entity_f1"] == pytest.approx(2 / 3) + assert record["entity_relaxed_gt_found_count"] == 1 + assert record["entity_relaxed_detected_tp_count"] == 1 + assert record["entity_relaxed_precision"] == 1.0 + assert record["entity_relaxed_recall"] == 0.5 + + +def test_record_metrics_capture_relaxed_gt_label_equivalence_without_raw_values() -> None: + final_entities = { + "entities": [ + {"value": "builduser42", "label": "user_name", "start_position": 4, "end_position": 15}, + ] + } + ground_truth_entities = { + "entities": [ + {"value": "legacy-user", "label": "username", "start_position": 6, "end_position": 14}, + ] + } + dataframe = pd.DataFrame( + { + COL_TEXT: ["ssh builduser42@host"], + COL_FINAL_ENTITIES: [final_entities], + "ground_truth_entities": [ground_truth_entities], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Redact", + text_column=COL_TEXT, + validation_max_entities_per_call=2, + ) + + record = collector.records[0] + assert record["entity_true_positive_count"] == 0 + assert record["entity_false_positive_count"] == 1 + assert record["entity_false_negative_count"] == 1 + assert record["entity_relaxed_gt_found_count"] == 1 + assert record["entity_relaxed_detected_tp_count"] == 1 + assert record["entity_relaxed_label_compatible_gt_found_count"] == 1 + assert record["entity_relaxed_label_compatible_detected_tp_count"] == 1 + assert record["entity_relaxed_precision"] == 1.0 + assert record["entity_relaxed_recall"] == 1.0 + assert record["entity_relaxed_f1"] == 1.0 + assert record["entity_relaxed_label_compatible_precision"] == 1.0 + assert 
record["entity_relaxed_label_compatible_recall"] == 1.0 + assert record["entity_relaxed_label_compatible_f1"] == 1.0 + + serialized = json.dumps(collector.records) + assert "builduser42" not in serialized + assert "legacy-user" not in serialized + + +def test_record_metrics_counts_missing_replacement_map_entries_without_raw_values() -> None: + final_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", "start_position": 0, "end_position": 5}, + {"value": "2030-01-01", "label": "date", "start_position": 13, "end_position": 23}, + {"value": "2030-01-01", "label": "date", "start_position": 27, "end_position": 37}, + ] + } + replacement_map = { + "replacements": [ + {"original": "Alice", "label": "first_name", "synthetic": "Maya"}, + ] + } + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice filed 2030-01-01 and 2030-01-01"], + COL_FINAL_ENTITIES: [final_entities], + COL_REPLACEMENT_MAP: [replacement_map], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Substitute", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + record = collector.records[0] + assert record["replacement_missing_final_entity_count"] == 2 + assert record["replacement_missing_final_entity_label_counts"] == {"date": 2} + assert record["replacement_missing_final_value_count"] == 1 + assert record["replacement_synthetic_original_collision_count"] == 0 + assert record["replacement_synthetic_original_collision_label_counts"] == {} + assert record["replacement_synthetic_original_collision_value_count"] == 0 + + serialized = json.dumps(collector.records) + assert "Alice" not in serialized + assert "2030-01-01" not in serialized + assert "Maya" not in serialized + + +def test_record_metrics_counts_synthetic_original_collisions_without_raw_values() -> None: + final_entities = { + "entities": [ + {"value": "Alice", "label": "first_name", 
"start_position": 0, "end_position": 5}, + {"value": "2030-01-01", "label": "date", "start_position": 13, "end_position": 23}, + ] + } + replacement_map = { + "replacements": [ + {"original": "Alice", "label": "first_name", "synthetic": "Maya"}, + {"original": "2029-12-01", "label": "date", "synthetic": "2030-01-01"}, + ] + } + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice filed 2030-01-01"], + COL_FINAL_ENTITIES: [final_entities], + COL_REPLACEMENT_MAP: [replacement_map], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Substitute", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + record = collector.records[0] + assert record["replacement_synthetic_original_collision_count"] == 1 + assert record["replacement_synthetic_original_collision_label_counts"] == {"date": 1} + assert record["replacement_synthetic_original_collision_value_count"] == 1 + + serialized = json.dumps(collector.records) + assert "Alice" not in serialized + assert "2030-01-01" not in serialized + assert "2029-12-01" not in serialized + assert "Maya" not in serialized + + +def test_record_metrics_counts_original_value_replacement_leaks_without_raw_values() -> None: + leaked_key = "sk-test-AAAAAAAAAAAAAAAAAAAAAAAA" + dataframe = pd.DataFrame( + { + COL_TEXT: [f"token={leaked_key}"], + COL_REPLACED_TEXT: [f"still token={leaked_key}"], + COL_FINAL_ENTITIES: [{"entities": [{"value": leaked_key, "label": "api_key"}]}], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Hash", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + record = collector.records[0] + assert record["original_value_leak_count"] == 1 + assert record["original_value_leak_label_counts"] == {"api_key": 1} + assert leaked_key not in 
json.dumps(collector.records) + + +def test_record_metrics_ignores_short_value_inside_hash_replacement_token() -> None: + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice is 34 years old."], + COL_REPLACED_TEXT: ["Alice is years old."], + COL_FINAL_ENTITIES: [{"entities": [{"value": "34", "label": "age"}]}], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Hash", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + record = collector.records[0] + assert record["original_value_leak_count"] == 0 + assert record["original_value_leak_label_counts"] == {} + + +def test_record_metrics_counts_standalone_short_value_replacement_leaks() -> None: + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice is 34 years old."], + COL_REPLACED_TEXT: ["Alice is 34 years old."], + COL_FINAL_ENTITIES: [{"entities": [{"value": "34", "label": "age"}]}], + } + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Hash", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + record = collector.records[0] + assert record["original_value_leak_count"] == 1 + assert record["original_value_leak_label_counts"] == {"age": 1} + + +def test_record_metrics_normalizes_integral_row_index_types() -> None: + dataframe = pd.DataFrame( + { + COL_TEXT: ["Alice works at Acme"], + COL_FINAL_ENTITIES: [{"entities": [{"value": "Alice", "label": "first_name"}]}], + }, + index=pd.Index([np.int64(7)]), + ) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector): + record_record_metrics( + dataframe, + mode="replace", + strategy="Redact", + text_column=COL_TEXT, + validation_max_entities_per_call=100, + ) + + assert collector.records[0]["row_index"] == 7 + + +def 
test_record_hash_uses_run_scoped_secret_by_default() -> None: + first = MeasurementCollector() + second = MeasurementCollector() + + assert first.record_hash(row_index=0, text="Alice works at Acme") != second.record_hash( + row_index=0, + text="Alice works at Acme", + ) + + +def test_stage_timer_records_errors() -> None: + workflow = EntityDetectionWorkflow(adapter=Mock(spec=NddAdapter)) + collector = MeasurementCollector(record_hash_key="test-key") + + with measurement_session(collector), pytest.raises(ValueError, match="privacy_goal is required"): + workflow.run( + pd.DataFrame({COL_TEXT: ["Alice"]}), + model_configs=[], + selected_models=DetectionModelSelection( + entity_detector="detector", + entity_validator=["validator"], + entity_augmenter="augmenter", + latent_detector="latent", + ), + gliner_detection_threshold=0.3, + tag_latent_entities=True, + privacy_goal=None, + ) + + stage_records = [record for record in collector.records if record["record_type"] == "stage"] + assert len(stage_records) == 1 + record = stage_records[0] + assert record["schema_version"] == MEASUREMENT_SCHEMA_VERSION + assert record["record_type"] == "stage" + assert record["run_id"] == collector.run_id + assert record["run_tags"] == {} + assert isinstance(record["timestamp_unix_sec"], float) + assert record["stage"] == "EntityDetectionWorkflow.run" + assert record["status"] == "error" + assert record["elapsed_sec"] >= 0 + assert record["input_row_count"] == 1 + assert record["input_rows_per_sec"] >= 0 + assert record["output_rows_per_sec"] is None + assert record["tag_latent_entities"] is True + + +def test_rewrite_llm_call_estimate_splits_by_stage() -> None: + calls = estimate_llm_calls_by_stage( + mode="rewrite", + strategy="Rewrite", + has_grouped_entities=True, + validation_chunk_count=2, + repair_iterations=2, + ) + + assert calls == { + "entity_detection": 4, + "latent_entity_detection": 1, + "replace_map_generation": 1, + "rewrite_pipeline": 5, + "rewrite_evaluate": 9, + 
"rewrite_repair": 2, + "rewrite_final_judge": 1, + } + + +def test_rewrite_llm_call_estimate_skips_rewrite_body_without_entities() -> None: + calls = estimate_llm_calls_by_stage( + mode="rewrite", + strategy="Rewrite", + has_grouped_entities=False, + validation_chunk_count=0, + repair_iterations=2, + ) + + assert calls == { + "entity_detection": 2, + "latent_entity_detection": 0, + "replace_map_generation": 0, + "rewrite_pipeline": 0, + "rewrite_evaluate": 0, + "rewrite_repair": 0, + "rewrite_final_judge": 0, + } diff --git a/tests/tools/test_benchmark_output_analysis.py b/tests/tools/test_benchmark_output_analysis.py new file mode 100644 index 00000000..0a88236e --- /dev/null +++ b/tests/tools/test_benchmark_output_analysis.py @@ -0,0 +1,688 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.util +import json +import shutil +import sys +from pathlib import Path +from types import ModuleType + +import pandas as pd +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load_tool(module_name: str, path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + sys.path.insert(0, str(path.parent)) + spec.loader.exec_module(module) + return module + + +def _write_jsonl(path: Path, rows: list[dict[str, object]]) -> None: + path.write_text("".join(json.dumps(row) + "\n" for row in rows), encoding="utf-8") + + +def _copy_fixture(tmp_path: Path, fixture_name: str) -> Path: + fixture_dir = REPO_ROOT / "tests" / "fixtures" / "measurement" / fixture_name + destination = tmp_path / fixture_name + shutil.copytree(fixture_dir, destination) + return destination + + +def 
test_analyze_benchmark_output_joins_measurements_and_detection_artifacts(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = _copy_fixture(tmp_path, "benchmark-output") + + result = tool.analyze_benchmark_output(benchmark_dir) + + assert result.case_count == 2 + assert result.group_count == 2 + assert result.model_usage_count == 2 + assert result.model_usage_group_count == 2 + + cases = {row.case_id: row for row in result.cases} + bio = cases["bio__default__r000"] + assert bio.workload_category == "synthetic_biography" + assert bio.observed_failed_request_rate == pytest.approx(1 / 4) + assert bio.dd_trace_error_count == 1 + assert bio.observed_bridge_fallback_requests == 1 + assert bio.record_count == 2 + assert bio.entity_precision == pytest.approx(10 / 14) + assert bio.entity_recall == pytest.approx(10 / 22) + assert bio.replacement_missing_final_entity_label_counts == {"date": 2} + assert bio.replacement_synthetic_original_collision_label_counts == {"date": 1} + assert bio.artifact_final_detector_entity_count == 11 + assert bio.artifact_final_augmenter_entity_count == 3 + assert bio.artifact_final_entity_signature_hashes == ["bio-hash-a", "bio-hash-b"] + assert bio.artifact_final_entity_signature_details["bio-hash-a"] == { + "label": "person", + "source": "detector", + "row_index": 0, + "start_position": 0, + "end_position": 5, + "value_length": 5, + } + + shell = cases["shell__native-local__r000"] + assert shell.experimental_replacement_strategy == "custom_replacement_strategy" + assert shell.original_value_leak_count == 1 + assert shell.artifact_final_entity_signature_details["shell-hash-a"]["source"] == "native" + + model_rows = {row.model_name: row for row in result.model_usage} + assert model_rows["nvidia/gliner-pii"].observed_total_tokens == 1100 + assert model_rows["nvidia/nemotron-3-super"].model_provider_name == "local-vllm" + assert 
model_rows["nvidia/nemotron-3-super"].observed_failed_request_rate == pytest.approx(1 / 3) + + bio_group = next(group for group in result.groups if group.workload_id == "bio") + assert bio_group.total_record_count == 2 + assert bio_group.micro_entity_precision == pytest.approx(10 / 14) + assert bio_group.replacement_missing_final_entity_label_counts == {"date": 2} + assert bio_group.replacement_synthetic_original_collision_label_counts == {"date": 1} + + shell_group = next(group for group in result.groups if group.workload_id == "shell") + assert shell_group.experimental_replacement_strategy == "custom_replacement_strategy" + assert shell_group.sum_original_value_leak_count == 1 + + serialized = result.model_dump_json() + assert "Alice" not in serialized + assert "sk-test" not in serialized + + +def test_analyze_benchmark_output_counts_generic_model_workflow_records(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_model_workflow", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "model_workflow", + "run_id": "bio__native__r000", + "workflow_name": "entity-detection-native-single-pass", + "elapsed_sec": 0.25, + "observed_total_requests": 3, + "observed_successful_requests": 3, + "observed_failed_requests": 0, + "observed_input_tokens": 30, + "observed_output_tokens": 12, + "observed_total_tokens": 42, + "model_usage": { + "native-direct": { + "model_alias": "native-direct", + "model_name": "nvidia/nemotron-3-super", + "model_provider_name": "local-vllm", + "request_usage": { + "successful_requests": 3, + "failed_requests": 0, + "total_requests": 3, + }, + "token_usage": { + "input_tokens": 30, + "output_tokens": 12, + "total_tokens": 42, + }, + } + }, + "run_tags": { + "suite_id": "suite", + "workload_id": "bio", + "config_id": "native", + 
"experimental_detection_strategy": "native_single_pass", + "experimental_replacement_strategy": "default", + "dd_parser_compat": "raw_json", + "repetition": 0, + "case_id": "bio__native__r000", + }, + }, + { + "record_type": "record", + "run_id": "bio__native__r000", + "final_entity_count": 2, + "replacement_count": 2, + "original_value_leak_count": 0, + "original_value_leak_label_counts": {}, + "run_tags": { + "suite_id": "suite", + "workload_id": "bio", + "config_id": "native", + "experimental_detection_strategy": "native_single_pass", + "experimental_replacement_strategy": "default", + "dd_parser_compat": "raw_json", + "repetition": 0, + "case_id": "bio__native__r000", + }, + }, + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + assert result.case_count == 1 + case = result.cases[0] + assert case.observed_total_requests == 3 + assert case.observed_total_tokens == 42 + assert case.observed_failed_request_rate == 0 + assert result.model_usage_count == 1 + model_row = result.model_usage[0] + assert model_row.workflow_name == "entity-detection-native-single-pass" + assert model_row.model_alias == "native-direct" + assert model_row.model_name == "nvidia/nemotron-3-super" + assert model_row.observed_total_tokens == 42 + assert result.groups[0].median_observed_total_requests == 3 + assert result.model_usage_groups[0].sum_observed_total_tokens == 42 + + +def test_analyze_benchmark_output_rolls_up_evaluation_records(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_evaluation_rollups", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "evaluation_record", + "run_id": "bio__substitute__r000", + "detection_valid": True, + "detection_invalid_entity_count": 0, + "type_fidelity_valid": True, + "type_fidelity_invalid_replacement_count": 0, + 
"relational_consistency_valid": False, + "relational_consistency_invalid_relation_count": 2, + "attribute_fidelity_valid": True, + "attribute_fidelity_invalid_entity_count": 0, + "run_tags": { + "workload_id": "bio", + "config_id": "substitute", + "case_id": "bio__substitute__r000", + }, + }, + { + "record_type": "evaluation_record", + "run_id": "bio__substitute__r000", + "detection_valid": False, + "detection_invalid_entity_count": 3, + "type_fidelity_valid": True, + "type_fidelity_invalid_replacement_count": 0, + "relational_consistency_valid": True, + "relational_consistency_invalid_relation_count": 0, + "attribute_fidelity_valid": None, + "attribute_fidelity_invalid_entity_count": 0, + "run_tags": { + "workload_id": "bio", + "config_id": "substitute", + "case_id": "bio__substitute__r000", + }, + }, + { + "record_type": "evaluation_record", + "run_id": "bio__substitute__r001", + "detection_valid": True, + "detection_invalid_entity_count": 1, + "type_fidelity_valid": False, + "type_fidelity_invalid_replacement_count": 4, + "relational_consistency_valid": True, + "relational_consistency_invalid_relation_count": 0, + "attribute_fidelity_valid": False, + "attribute_fidelity_invalid_entity_count": 5, + "run_tags": { + "workload_id": "bio", + "config_id": "substitute", + "case_id": "bio__substitute__r001", + }, + }, + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + cases = {row.case_id: row for row in result.cases} + first_case = cases["bio__substitute__r000"] + assert first_case.detection_judged_record_count == 2 + assert first_case.detection_valid_record_count == 1 + assert first_case.detection_valid_rate == pytest.approx(0.5) + assert first_case.detection_invalid_entity_count == 3 + assert first_case.relational_consistency_judged_record_count == 2 + assert first_case.relational_consistency_valid_rate == pytest.approx(0.5) + assert first_case.attribute_fidelity_judged_record_count == 1 + assert first_case.attribute_fidelity_valid_rate == 
pytest.approx(1.0) + + second_case = cases["bio__substitute__r001"] + assert second_case.type_fidelity_judged_record_count == 1 + assert second_case.type_fidelity_valid_record_count == 0 + assert second_case.type_fidelity_valid_rate == pytest.approx(0.0) + assert second_case.type_fidelity_invalid_replacement_count == 4 + + group = result.groups[0] + assert group.sum_detection_judged_record_count == 3 + assert group.sum_detection_valid_record_count == 2 + assert group.micro_detection_valid_rate == pytest.approx(2 / 3) + assert group.sum_detection_invalid_entity_count == 4 + assert group.sum_type_fidelity_judged_record_count == 3 + assert group.sum_type_fidelity_valid_record_count == 2 + assert group.micro_type_fidelity_valid_rate == pytest.approx(2 / 3) + assert group.sum_type_fidelity_invalid_replacement_count == 4 + assert group.sum_attribute_fidelity_judged_record_count == 2 + assert group.sum_attribute_fidelity_valid_record_count == 1 + assert group.micro_attribute_fidelity_valid_rate == pytest.approx(0.5) + assert group.sum_attribute_fidelity_invalid_entity_count == 5 + + +def test_analyze_benchmark_output_accepts_detection_artifact_override(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_artifact_override", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "record", + "run_id": "bio__default__r000", + "final_entity_count": 2, + "run_tags": { + "workload_id": "bio", + "config_id": "default", + "experimental_detection_strategy": "default", + "case_id": "bio__default__r000", + }, + } + ], + ) + _write_jsonl( + benchmark_dir / "detection-artifacts.jsonl", + [ + { + "case_id": "bio__default__r000", + "run_id": "bio__default__r000", + "final_entity_count": 2, + "final_entity_signature_hashes": ["stale-hash"], + "final_entity_signature_count": 1, + } + ], + ) + 
refreshed_artifacts = tmp_path / "refreshed-detection-artifacts.jsonl" + _write_jsonl( + refreshed_artifacts, + [ + { + "case_id": "bio__default__r000", + "run_id": "bio__default__r000", + "final_entity_count": 2, + "final_entity_signature_hashes": ["fresh-hash-a", "fresh-hash-b"], + "final_entity_signature_labels": {"fresh-hash-a": "person", "fresh-hash-b": "email"}, + "final_entity_signature_count": 2, + } + ], + ) + + default_result = tool.analyze_benchmark_output(benchmark_dir) + override_result = tool.analyze_benchmark_output(benchmark_dir, detection_artifacts=refreshed_artifacts) + + assert default_result.cases[0].artifact_final_entity_signature_hashes == ["stale-hash"] + assert override_result.detection_artifacts_path == str(refreshed_artifacts) + assert override_result.cases[0].artifact_final_entity_signature_hashes == ["fresh-hash-a", "fresh-hash-b"] + assert override_result.cases[0].artifact_final_entity_signature_labels == { + "fresh-hash-a": "person", + "fresh-hash-b": "email", + } + + +def test_analyze_benchmark_output_requires_detection_artifact_override_path(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_artifact_override_missing", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "record", + "run_id": "bio__default__r000", + "run_tags": {"case_id": "bio__default__r000"}, + } + ], + ) + + with pytest.raises(ValueError, match="input path does not exist"): + tool.analyze_benchmark_output(benchmark_dir, detection_artifacts=tmp_path / "missing.jsonl") + + +def test_write_analysis_tables_exports_case_and_group_tables(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_export", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + result = tool.BenchmarkOutputAnalysis( + benchmark_dir=str(tmp_path / 
"benchmark"), + cases=[ + tool.CaseAnalysisRow( + suite_id="suite", + workload_id="shell", + config_id="native", + experimental_detection_strategy="native_single_pass", + experimental_replacement_strategy="custom_replacement_strategy", + dd_parser_compat="raw_json", + repetition=0, + case_id="shell__native__r000", + run_id="shell__native__r000", + final_entity_count=8, + ) + ], + groups=[ + tool.GroupAnalysisRow( + workload_id="shell", + config_id="native", + experimental_detection_strategy="native_single_pass", + experimental_replacement_strategy="custom_replacement_strategy", + case_count=1, + median_final_entity_count=8, + median_observed_successful_requests=0, + median_observed_input_tokens=0, + median_observed_output_tokens=0, + median_observed_failed_request_rate=0, + median_artifact_final_entity_count=8, + ) + ], + model_usage=[ + tool.ModelUsageAnalysisRow( + workload_id="shell", + config_id="native", + experimental_detection_strategy="native_single_pass", + experimental_replacement_strategy="custom_replacement_strategy", + dd_parser_compat="raw_json", + case_id="shell__native__r000", + run_id="shell__native__r000", + workflow_name="entity-detection", + model_name="nvidia/gliner-pii", + observed_total_requests=1, + observed_successful_requests=1, + observed_total_tokens=1200, + ) + ], + model_usage_groups=[ + tool.ModelUsageGroupAnalysisRow( + workload_id="shell", + config_id="native", + experimental_detection_strategy="native_single_pass", + experimental_replacement_strategy="custom_replacement_strategy", + dd_parser_compat="raw_json", + workflow_name="entity-detection", + model_name="nvidia/gliner-pii", + case_count=1, + workflow_count=1, + sum_observed_total_requests=1, + sum_observed_successful_requests=1, + sum_observed_total_tokens=1200, + median_observed_total_requests=1, + median_observed_total_tokens=1200, + ) + ], + ) + + output_dir = tmp_path / "tables" + tool.write_analysis_tables(result, output_dir, tool.ExportFormat.csv) + + assert 
pd.read_csv(output_dir / "case_analysis.csv")["case_id"].tolist() == ["shell__native__r000"] + assert pd.read_csv(output_dir / "case_analysis.csv")["experimental_replacement_strategy"].tolist() == [ + "custom_replacement_strategy" + ] + assert pd.read_csv(output_dir / "group_analysis.csv")["case_count"].tolist() == [1] + assert pd.read_csv(output_dir / "model_analysis.csv")["model_name"].tolist() == ["nvidia/gliner-pii"] + assert pd.read_csv(output_dir / "model_group_analysis.csv")["workflow_count"].tolist() == [1] + assert (output_dir / "manifest.json").exists() + + +def test_analyze_benchmark_output_preserves_zero_entity_cases(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_zero", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "record", + "run_id": "empty__redact__r000", + "final_entity_count": 0, + "replacement_count": 0, + "run_tags": { + "workload_id": "empty", + "config_id": "redact", + "experimental_detection_strategy": "default", + "case_id": "empty__redact__r000", + }, + } + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + assert result.cases[0].final_entity_count == 0 + assert result.groups[0].median_final_entity_count == 0 + + +def test_analyze_benchmark_output_groups_replacement_strategies_separately(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_replacement_strategy_groups", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "record", + "run_id": "secrets__candidate__r000", + "final_entity_count": 4, + "run_tags": { + "workload_id": "secrets", + "config_id": "candidate", + "experimental_detection_strategy": "native_single_pass", + 
"experimental_replacement_strategy": "default", + "case_id": "secrets__candidate__r000", + }, + }, + { + "record_type": "record", + "run_id": "secrets__candidate__r001", + "final_entity_count": 4, + "run_tags": { + "workload_id": "secrets", + "config_id": "candidate", + "experimental_detection_strategy": "native_single_pass", + "experimental_replacement_strategy": "custom_replacement_strategy", + "case_id": "secrets__candidate__r001", + }, + }, + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + assert result.group_count == 2 + assert {group.experimental_replacement_strategy for group in result.groups} == { + "default", + "custom_replacement_strategy", + } + + +def test_analyze_benchmark_output_surfaces_failed_cases(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_failures", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "stage", + "run_id": "shell__candidate__r000", + "stage": "Anonymizer._run_internal", + "status": "completed", + "elapsed_sec": 1.2, + "run_tags": { + "workload_id": "shell", + "config_id": "candidate", + "experimental_detection_strategy": "detector_only", + "repetition": 0, + "case_id": "shell__candidate__r000", + }, + }, + { + "record_type": "ndd_workflow", + "run_id": "shell__candidate__r001", + "workflow_name": "entity-detection", + "status": "error", + "elapsed_sec": 0.2, + "run_tags": { + "workload_id": "shell", + "config_id": "candidate", + "experimental_detection_strategy": "detector_only", + "repetition": 1, + "case_id": "shell__candidate__r001", + }, + }, + { + "record_type": "stage", + "run_id": "shell__candidate__r001", + "stage": "Anonymizer._run_internal", + "status": "error", + "elapsed_sec": 0.2, + "run_tags": { + "workload_id": "shell", + "config_id": "candidate", + "experimental_detection_strategy": 
"detector_only", + "repetition": 1, + "case_id": "shell__candidate__r001", + }, + }, + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + cases = {row.case_id: row for row in result.cases} + assert cases["shell__candidate__r000"].case_failed is False + assert cases["shell__candidate__r000"].error_stage_count == 0 + assert cases["shell__candidate__r000"].error_ndd_workflow_count == 0 + assert cases["shell__candidate__r001"].case_failed is True + assert cases["shell__candidate__r001"].error_stage_count == 1 + assert cases["shell__candidate__r001"].error_ndd_workflow_count == 1 + assert result.groups[0].failed_case_count == 1 + assert result.groups[0].failed_case_rate == pytest.approx(0.5) + assert result.groups[0].error_stage_count == 1 + assert result.groups[0].error_ndd_workflow_count == 1 + assert "failed_cases=1/2" in tool.render_result(result, json_output=False) + + +def test_analyze_benchmark_output_groups_artifact_contribution_metrics(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_output_analysis_artifact_group", + REPO_ROOT / "tools/measurement/analyze_benchmark_output.py", + ) + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + _write_jsonl( + benchmark_dir / "measurements.jsonl", + [ + { + "record_type": "record", + "run_id": "bio__default__r000", + "final_entity_count": 10, + "run_tags": { + "workload_id": "bio", + "config_id": "default", + "experimental_detection_strategy": "default", + "case_id": "bio__default__r000", + }, + }, + { + "record_type": "record", + "run_id": "bio__default__r001", + "final_entity_count": 14, + "run_tags": { + "workload_id": "bio", + "config_id": "default", + "experimental_detection_strategy": "default", + "case_id": "bio__default__r001", + }, + }, + ], + ) + _write_jsonl( + benchmark_dir / "detection-artifacts.jsonl", + [ + { + "workload_id": "bio", + "config_id": "default", + "case_id": "bio__default__r000", + "run_id": "bio__default__r000", + "seed_entity_count": 9, + 
"seed_validation_candidate_count": 9, + "augmented_entity_count": 4, + "augmented_new_final_value_count": 1, + "final_entity_count": 11, + "final_source_counts": {"detector": 10, "augmenter": 1}, + "final_entity_signature_hashes": ["a", "b"], + "final_entity_signature_count": 2, + }, + { + "workload_id": "bio", + "config_id": "default", + "case_id": "bio__default__r001", + "run_id": "bio__default__r001", + "seed_entity_count": 13, + "seed_validation_candidate_count": 13, + "augmented_entity_count": 8, + "augmented_new_final_value_count": 3, + "final_entity_count": 15, + "final_source_counts": {"detector": 12, "augmenter": 3}, + "final_entity_signature_hashes": ["a", "b", "c", "d"], + "final_entity_signature_count": 4, + }, + ], + ) + + result = tool.analyze_benchmark_output(benchmark_dir) + + group = result.groups[0] + assert group.median_final_entity_count == 12 + assert group.median_observed_successful_requests == 0 + assert group.median_observed_input_tokens == 0 + assert group.median_observed_output_tokens == 0 + assert group.median_observed_failed_request_rate is None + assert group.median_seed_entity_count == 11 + assert group.median_seed_validation_candidate_count == 11 + assert group.median_augmented_entity_count == 6 + assert group.median_augmented_new_final_value_count == 2 + assert group.median_artifact_final_entity_count == 13 + assert group.median_artifact_final_detector_entity_count == 11 + assert group.median_artifact_final_augmenter_entity_count == 2 + assert group.median_artifact_final_entity_signature_count == 3 diff --git a/tests/tools/test_detection_artifact_analysis.py b/tests/tools/test_detection_artifact_analysis.py new file mode 100644 index 00000000..f99fffbd --- /dev/null +++ b/tests/tools/test_detection_artifact_analysis.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path +from types import ModuleType + +import pandas as pd + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load_tool(module_name: str, path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + sys.path.insert(0, str(path.parent)) + spec.loader.exec_module(module) + return module + + +def _entity(value: str, label: str, start: int, end: int, *, source: str = "detector") -> dict[str, object]: + return { + "id": f"{label}_{start}_{end}", + "value": value, + "label": label, + "start_position": start, + "end_position": end, + "score": 1.0, + "source": source, + } + + +def _write_artifact(root: Path, workflow: str, rows: list[dict[str, object]]) -> None: + parquet_dir = root / workflow / "parquet-files" + parquet_dir.mkdir(parents=True) + pd.DataFrame(rows).to_parquet(parquet_dir / "batch_00000.parquet", index=False) + + +def test_detection_artifact_analysis_reports_augmentation_contribution(tmp_path: Path) -> None: + tool = load_tool( + "measurement_detection_artifact_analysis", + REPO_ROOT / "tools/measurement/analyze_detection_artifacts.py", + ) + artifact_root = tmp_path / "artifacts" + _write_artifact( + artifact_root, + "entity-detection", + [ + { + "_seed_entities_json": json.dumps([_entity("Alice", "first_name", 0, 5)]), + "_seed_validation_candidates": json.dumps( + {"candidates": [{"id": "first_name_0_5", "value": "Alice", "label": "first_name"}]} + ), + "_augmented_entities": json.dumps( + { + "entities": [ + {"value": "Alice", "label": "first_name", "reason": "duplicate"}, + {"value": "12 February 1980", "label": "api_key", "reason": "date mislabeled"}, + ] + } + ), + "_detected_entities": json.dumps( + { + "entities": [ + 
_entity("Alice", "first_name", 0, 5), + _entity("12 February 1980", "api_key", 20, 36, source="augmenter"), + ] + } + ), + "_validation_candidates": json.dumps( + { + "candidates": [ + {"id": "first_name_0_5", "value": "Alice", "label": "first_name"}, + {"id": "api_key_20_36", "value": "12 February 1980", "label": "api_key"}, + ] + } + ), + } + ], + ) + + result = tool.analyze_artifacts(artifact_root) + + assert len(result.rows) == 1 + row = result.rows[0] + assert row.seed_entity_count == 1 + assert row.seed_validation_candidate_count == 1 + assert row.merged_validation_candidate_count == 2 + assert row.augmented_entity_count == 2 + assert row.augmented_duplicate_seed_value_count == 1 + assert row.augmented_new_value_count == 1 + assert row.augmented_new_final_value_count == 1 + assert row.final_entity_count == 2 + assert row.weak_api_key_shape_count == 1 + assert row.weak_api_key_shape_label_counts == {"api_key": 1} + assert row.final_entity_signature_count == 2 + assert len(row.final_entity_signature_hashes) == 2 + assert set(row.final_entity_signature_labels) == set(row.final_entity_signature_hashes) + assert sorted(row.final_entity_signature_labels.values()) == ["api_key", "first_name"] + assert set(row.final_entity_signature_details) == set(row.final_entity_signature_hashes) + first_name_detail = next( + detail for detail in row.final_entity_signature_details.values() if detail["label"] == "first_name" + ) + assert first_name_detail["source"] == "detector" + assert first_name_detail["row_index"] == 0 + assert first_name_detail["start_position"] == 0 + assert first_name_detail["end_position"] == 5 + assert first_name_detail["value_length"] == 5 + assert "value_hash" not in first_name_detail + + serialized = row.model_dump_json() + assert "Alice" not in serialized + assert "12 February" not in serialized + + +def test_detection_artifact_analysis_handles_no_augment_rows(tmp_path: Path) -> None: + tool = load_tool( + 
"measurement_detection_artifact_analysis_no_augment", + REPO_ROOT / "tools/measurement/analyze_detection_artifacts.py", + ) + artifact_root = tmp_path / "artifacts" + _write_artifact( + artifact_root, + "entity-detection-no-augment", + [ + { + "_seed_entities_json": json.dumps([_entity("Aydin", "city", 12, 17)]), + "_seed_validation_candidates": json.dumps( + {"candidates": [{"id": "city_12_17", "value": "Aydin", "label": "city"}]} + ), + "_augmented_entities": json.dumps({"entities": []}), + "_detected_entities": json.dumps({"entities": [_entity("Aydin", "city", 12, 17)]}), + } + ], + ) + + result = tool.analyze_artifacts(artifact_root) + + assert len(result.rows) == 1 + row = result.rows[0] + assert row.workflow_name == "entity-detection-no-augment" + assert row.seed_entity_count == 1 + assert row.seed_validation_candidate_count == 1 + assert row.merged_validation_candidate_count == 0 + assert row.augmented_entity_count == 0 + assert row.augmented_new_value_count == 0 + assert row.augmented_new_final_value_count == 0 + assert row.final_entity_count == 1 + assert row.final_source_counts == {"detector": 1} + assert row.final_entity_signature_count == 1 + assert row.final_entity_signature_hashes == sorted(row.final_entity_signature_hashes) + assert list(row.final_entity_signature_labels.values()) == ["city"] diff --git a/tests/tools/test_measurement_tools.py b/tests/tools/test_measurement_tools.py new file mode 100644 index 00000000..3aa4a009 --- /dev/null +++ b/tests/tools/test_measurement_tools.py @@ -0,0 +1,1125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import importlib.util +import json +import sys +from collections.abc import Iterator +from contextlib import contextmanager +from pathlib import Path +from types import ModuleType +from typing import Any + +import pandas as pd +import pytest +import yaml +from pydantic import ValidationError + +from anonymizer.config.rewrite import DEFAULT_PRESERVE_TEXT + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def load_tool(module_name: str, path: Path) -> ModuleType: + spec = importlib.util.spec_from_file_location(module_name, path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + sys.path.insert(0, str(path.parent)) + spec.loader.exec_module(module) + return module + + +def _minimal_case_contexts(tool: ModuleType, spec: Any, tmp_path: Path) -> dict[str, Any]: + return { + "base_dir": tmp_path, + "workloads": {workload.id: workload for workload in spec.workloads}, + "configs": {config.id: config for config in spec.configs}, + "raw_dir": tmp_path / "raw", + "dd_trace": tool.DDTraceMode.none, + "trace_dir": tmp_path / "traces", + "dd_task_trace": False, + "task_trace_dir": tmp_path / "task-traces", + "artifact_path": tmp_path / "artifacts", + } + + +def _minimal_benchmark_spec( + tool: ModuleType, + *, + suite_id: str = "suite", + configs: list[Any] | None = None, + case_retries: int = 0, + case_retry_backoff_sec: float = 0.0, + run_tags: dict[str, Any] | None = None, +) -> Any: + return tool.BenchmarkSpec( + suite_id=suite_id, + case_retries=case_retries, + case_retry_backoff_sec=case_retry_backoff_sec, + run_tags=run_tags or {}, + workloads=[tool.WorkloadSpec(id="input", source="input.csv")], + configs=configs or [tool.ConfigSpec(id="redact", replace="redact")], + ) + + +def _minimal_benchmark_case( + tool: ModuleType, + *, + suite_id: str = "suite", + workload_id: str = "input", + config_id: 
str = "redact", + repetition: int = 0, +) -> Any: + return tool.BenchmarkCase( + suite_id=suite_id, + workload_id=workload_id, + config_id=config_id, + repetition=repetition, + case_id=f"{workload_id}__{config_id}__r{repetition:03d}", + ) + + +def _write_text_input(tmp_path: Path, text: str = "Alice works at Acme") -> Path: + input_path = tmp_path / "input.csv" + pd.DataFrame({"text": [text]}).to_csv(input_path, index=False) + return input_path + + +def _copy_biography_data(tmp_path: Path, filename: str = "input.csv") -> Path: + source = REPO_ROOT / "docs" / "data" / "NVIDIA_synthetic_biographies.csv" + destination = tmp_path / filename + destination.write_bytes(source.read_bytes()) + return destination + + +def test_benchmark_spec_rejects_duplicate_matrix_entries() -> None: + tool = load_tool("measurement_benchmark_tool_duplicate_matrix", REPO_ROOT / "tools/measurement/run_benchmarks.py") + + with pytest.raises(ValidationError, match="duplicate matrix workload/config entry"): + tool.BenchmarkSpec( + suite_id="duplicate-suite", + workloads=[tool.WorkloadSpec(id="input", source="input.csv")], + configs=[tool.ConfigSpec(id="redact", replace="redact")], + matrix=[ + tool.MatrixEntry(workload="input", config="redact"), + tool.MatrixEntry(workload="input", config="redact"), + ], + ) + + +def test_benchmark_spec_rejects_reserved_run_tags() -> None: + tool = load_tool("measurement_benchmark_tool_reserved_tags", REPO_ROOT / "tools/measurement/run_benchmarks.py") + + with pytest.raises(ValidationError, match="reserved benchmark tag"): + tool.BenchmarkSpec( + suite_id="tag-suite", + run_tags={"pipeline_id": "1234", "case_id": "manual"}, + workloads=[tool.WorkloadSpec(id="input", source="input.csv")], + configs=[tool.ConfigSpec(id="redact", replace="redact")], + ) + + +def test_benchmark_config_rejects_evaluate_on_rewrite() -> None: + tool = load_tool("measurement_benchmark_tool_evaluate_rewrite", REPO_ROOT / "tools/measurement/run_benchmarks.py") + + with 
pytest.raises(ValidationError, match="evaluate is only supported for replace configs"): + tool.ConfigSpec( + id="rewrite-evaluate", + rewrite=tool.RewriteSpec(), + evaluate=True, + ) + + +def test_export_measurements_groups_records_by_type(tmp_path: Path) -> None: + tool = load_tool("measurement_export_tool", REPO_ROOT / "tools/measurement/export_measurements.py") + dataframe = pd.DataFrame( + [ + {"record_type": "run", "run_id": "case-a", "run_tags": {"suite_id": "suite-a"}}, + {"record_type": "stage", "run_id": "case-a", "stage": "detect", "metrics": {"rows": 2}}, + ] + ) + + result = tool.export_tables( + dataframe, + input_path=tmp_path / "measurements.jsonl", + output_dir=tmp_path / "tables", + export_format=tool.ExportFormat.csv, + overwrite=False, + ) + + assert result.total_rows == 2 + assert {table.record_type for table in result.tables} == {"run", "stage"} + assert (tmp_path / "tables/run.csv").exists() + assert (tmp_path / "tables/stage.csv").exists() + assert (tmp_path / "tables/manifest.json").exists() + + +def test_benchmark_exports_detection_artifact_analysis(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_artifact_analysis", REPO_ROOT / "tools/measurement/run_benchmarks.py") + artifact_root = tmp_path / "artifacts" + parquet_dir = artifact_root / "entity-detection" / "parquet-files" + parquet_dir.mkdir(parents=True) + pd.DataFrame( + [ + { + "_seed_entities_json": '[{"value":"Alice","label":"first_name","start_position":0,"end_position":5}]', + "_augmented_entities": '{"entities":[{"value":"Alice","label":"first_name"}]}', + "_detected_entities": ( + '{"entities":[{"value":"Alice","label":"first_name",' + '"start_position":0,"end_position":5,"source":"detector"}]}' + ), + } + ] + ).to_parquet(parquet_dir / "batch_00000.parquet", index=False) + output_path = tmp_path / "detection-artifacts.jsonl" + + result_path = tool.export_detection_artifact_analysis(artifact_root, output_path) + + assert result_path == output_path + rows = 
[pd.read_json(output_path, lines=True).iloc[0].to_dict()] + assert rows[0]["augmented_duplicate_seed_value_count"] == 1 + assert "Alice" not in output_path.read_text(encoding="utf-8") + + +def test_benchmark_detection_artifact_analysis_ignores_stale_artifacts(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_artifact_delta", REPO_ROOT / "tools/measurement/run_benchmarks.py") + artifact_root = tmp_path / "artifacts" + stale_dir = artifact_root / "entity-detection-old" / "parquet-files" + stale_dir.mkdir(parents=True) + pd.DataFrame( + [ + { + "_seed_entities_json": "[]", + "_augmented_entities": '{"entities":[]}', + "_detected_entities": '{"entities":[]}', + } + ] + ).to_parquet(stale_dir / "batch_00000.parquet", index=False) + snapshot = tool.snapshot_detection_artifacts(artifact_root) + fresh_dir = artifact_root / "entity-detection-new" / "parquet-files" + fresh_dir.mkdir(parents=True) + pd.DataFrame( + [ + { + "_seed_entities_json": "[]", + "_augmented_entities": '{"entities":[]}', + "_detected_entities": ( + '{"entities":[{"value":"sk-test-AAAAAAAAAAAAAAAAAAAAAAAA",' + '"label":"api_key","start_position":0,"end_position":32,"source":"augmenter"}]}' + ), + } + ] + ).to_parquet(fresh_dir / "batch_00000.parquet", index=False) + output_path = tmp_path / "detection-artifacts.jsonl" + + result_path = tool.export_detection_artifact_analysis( + artifact_root, + output_path, + artifact_snapshot=snapshot, + ) + + assert result_path == output_path + rows = pd.read_json(output_path, lines=True) + assert rows["workflow_name"].tolist() == ["entity-detection-new"] + assert rows["final_entity_count"].tolist() == [1] + + +def test_benchmark_case_detection_artifact_analysis_adds_case_metadata(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_artifact_case", REPO_ROOT / "tools/measurement/run_benchmarks.py") + artifact_root = tmp_path / "artifacts" + parquet_dir = artifact_root / "entity-detection" / "parquet-files" + 
parquet_dir.mkdir(parents=True) + snapshot = tool.snapshot_detection_artifacts(artifact_root) + pd.DataFrame( + [ + { + "_seed_entities_json": "[]", + "_augmented_entities": '{"entities":[]}', + "_detected_entities": ( + '{"entities":[{"value":"sk-test-AAAAAAAAAAAAAAAAAAAAAAAA",' + '"label":"api_key","start_position":0,"end_position":32,"source":"detector"}]}' + ), + } + ] + ).to_parquet(parquet_dir / "batch_00000.parquet", index=False) + case = tool.BenchmarkCase( + suite_id="suite-a", + workload_id="shell", + config_id="rules", + repetition=2, + case_id="shell__rules__r002", + ) + output_path = tmp_path / "raw" / "shell__rules__r002.detection-artifacts.jsonl" + + result_path = tool.export_case_detection_artifact_analysis( + artifact_root, + output_path, + case=case, + artifact_snapshot=snapshot, + ) + + assert result_path == output_path + rows = pd.read_json(output_path, lines=True) + assert rows["suite_id"].tolist() == ["suite-a"] + assert rows["workload_id"].tolist() == ["shell"] + assert rows["config_id"].tolist() == ["rules"] + assert rows["case_id"].tolist() == ["shell__rules__r002"] + assert rows["run_id"].tolist() == ["shell__rules__r002"] + assert rows["repetition"].tolist() == [2] + assert "sk-test" not in output_path.read_text(encoding="utf-8") + + +def test_run_suite_records_detection_artifact_analysis_path( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool("measurement_benchmark_tool_run_suite_artifact", REPO_ROOT / "tools/measurement/run_benchmarks.py") + spec = _minimal_benchmark_spec(tool, suite_id="artifact-suite") + output_dir = tmp_path / "output" + output_dir.mkdir() + artifact_path = output_dir / "artifacts" + analysis_path = output_dir / "detection-artifacts.jsonl" + + class FakeAnonymizer: + def __init__(self, **kwargs: Any) -> None: + assert kwargs["artifact_path"] == artifact_path + + def fake_run_case(case: Any, *_args: Any, **_kwargs: Any) -> Any: + assert _kwargs["export_detection_artifacts"] is True + 
raw_path = output_dir / "raw" / f"{case.case_id}.jsonl" + raw_path.parent.mkdir() + raw_path.write_text('{"record_type":"run","run_id":"case"}\n', encoding="utf-8") + artifact_output_path = output_dir / "raw" / f"{case.case_id}.detection-artifacts.jsonl" + artifact_output_path.write_text( + '{"case_id":"input__redact__r000","workflow_name":"entity-detection"}\n', + encoding="utf-8", + ) + return case.model_copy( + update={ + "status": tool.CaseStatus.completed, + "measurement_path": str(raw_path), + "detection_artifact_path": str(artifact_output_path), + } + ) + + monkeypatch.setattr(tool, "Anonymizer", FakeAnonymizer) + monkeypatch.setattr(tool, "_run_case", fake_run_case) + monkeypatch.setattr(tool, "export_measurement_tables", lambda *_args: output_dir / "tables") + + result = tool.run_suite( + spec, + spec_path=tmp_path / "suite.yaml", + output_dir=output_dir, + export=True, + fail_fast=False, + dd_trace=tool.DDTraceMode.none, + trace_dir=None, + ) + + assert result.detection_artifact_analysis_path == str(analysis_path) + assert analysis_path.read_text(encoding="utf-8") == ( + '{"case_id":"input__redact__r000","workflow_name":"entity-detection"}\n' + ) + summary = (output_dir / "summary.json").read_text(encoding="utf-8") + assert "detection_artifact_analysis_path" in summary + assert "detection_artifact_path" in summary + + +def test_run_suite_skips_detection_artifact_analysis_when_export_disabled( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_benchmark_tool_run_suite_no_export_artifact", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + spec = _minimal_benchmark_spec(tool, suite_id="artifact-suite") + output_dir = tmp_path / "output" + output_dir.mkdir() + + class FakeAnonymizer: + def __init__(self, **_kwargs: Any) -> None: + pass + + def fake_run_case(case: Any, *_args: Any, **_kwargs: Any) -> Any: + assert _kwargs["export_detection_artifacts"] is False + raw_path = output_dir / "raw" / 
f"{case.case_id}.jsonl" + raw_path.parent.mkdir() + raw_path.write_text('{"record_type":"run","run_id":"case"}\n', encoding="utf-8") + return case.model_copy(update={"status": tool.CaseStatus.completed, "measurement_path": str(raw_path)}) + + monkeypatch.setattr(tool, "Anonymizer", FakeAnonymizer) + monkeypatch.setattr(tool, "_run_case", fake_run_case) + monkeypatch.setattr( + tool, + "combine_detection_artifact_analysis", + lambda *_args: pytest.fail("artifact analysis should not be combined"), + ) + + result = tool.run_suite( + spec, + spec_path=tmp_path / "suite.yaml", + output_dir=output_dir, + export=False, + fail_fast=False, + dd_trace=tool.DDTraceMode.none, + trace_dir=None, + ) + + assert result.detection_artifact_analysis_path is None + assert not (output_dir / "detection-artifacts.jsonl").exists() + + +def test_benchmark_case_retries_transient_errors_and_records_attempts( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool("measurement_benchmark_tool_case_retry_success", REPO_ROOT / "tools/measurement/run_benchmarks.py") + attempts: list[Path] = [] + spec = _minimal_benchmark_spec(tool, suite_id="retry-suite", case_retries=1) + case = _minimal_benchmark_case(tool, suite_id="retry-suite") + _write_text_input(tmp_path, text="Alice") + + def fake_execute_case(*_args: Any, raw_path: Path, **_kwargs: Any) -> None: + attempts.append(raw_path) + if len(attempts) == 1: + raise RuntimeError("transient provider health check failure") + raw_path.parent.mkdir(parents=True, exist_ok=True) + raw_path.write_text('{"record_type":"run"}\n', encoding="utf-8") + + monkeypatch.setattr(tool, "_execute_case", fake_execute_case) + + result = tool._run_case( + case, + spec, + contexts=_minimal_case_contexts(tool, spec, tmp_path), + anonymizer=object(), + fail_fast=False, + export_detection_artifacts=False, + ) + + assert result.status == tool.CaseStatus.completed + assert result.attempt_count == 2 + assert result.attempt_errors == ["transient 
provider health check failure"] + assert attempts == [tmp_path / "raw" / "input__redact__r000.jsonl"] * 2 + + +def test_benchmark_case_records_persistent_retry_failures( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool("measurement_benchmark_tool_case_retry_failure", REPO_ROOT / "tools/measurement/run_benchmarks.py") + spec = _minimal_benchmark_spec(tool, suite_id="retry-suite", case_retries=1) + case = _minimal_benchmark_case(tool, suite_id="retry-suite") + + attempts = 0 + errors: list[str] = [] + + def fake_execute_case(*_args: Any, **_kwargs: Any) -> Any: + nonlocal attempts + attempts += 1 + raise RuntimeError(f"provider unavailable #{attempts}") + + def capture_error(case: Any, *, error: Exception, **kwargs: Any) -> Any: + errors.append(str(error)) + return original_run_case_error(case, error=error, **kwargs) + + original_run_case_error = tool._run_case_error + monkeypatch.setattr(tool, "_execute_case", fake_execute_case) + monkeypatch.setattr(tool, "_run_case_error", capture_error) + + result = tool._run_case( + case, + spec, + contexts=_minimal_case_contexts(tool, spec, tmp_path), + anonymizer=object(), + fail_fast=False, + export_detection_artifacts=False, + ) + + assert result.status == tool.CaseStatus.error + assert result.error == "provider unavailable #2" + assert result.attempt_count == 2 + assert result.attempt_errors == ["provider unavailable #1", "provider unavailable #2"] + assert errors == ["provider unavailable #2"] + assert attempts == 2 + + +def test_benchmark_case_fail_fast_skips_retries( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool( + "measurement_benchmark_tool_case_retry_fail_fast", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + spec = _minimal_benchmark_spec(tool, suite_id="retry-suite", case_retries=3) + case = _minimal_benchmark_case(tool, suite_id="retry-suite") + attempts = 0 + + def fake_execute_case(*_args: Any, **_kwargs: Any) -> Any: + nonlocal 
attempts + attempts += 1 + raise RuntimeError("fail fast") + + monkeypatch.setattr(tool, "_execute_case", fake_execute_case) + + with pytest.raises(RuntimeError, match="fail fast"): + tool._run_case( + case, + spec, + contexts=_minimal_case_contexts(tool, spec, tmp_path), + anonymizer=object(), + fail_fast=True, + export_detection_artifacts=False, + ) + + assert attempts == 1 + + +def test_combine_detection_artifact_analysis_separates_jsonl_chunks(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_combine_artifact_newlines", + REPO_ROOT / "tools/measurement/run_benchmarks.py", + ) + first = tmp_path / "first.jsonl" + second = tmp_path / "second.jsonl" + first.write_text('{"case_id":"one"}', encoding="utf-8") + second.write_text('{"case_id":"two"}\n', encoding="utf-8") + destination = tmp_path / "combined.jsonl" + cases = [ + tool.BenchmarkCase( + suite_id="suite", + workload_id="input", + config_id="first", + repetition=0, + case_id="first", + detection_artifact_path=str(first), + ), + tool.BenchmarkCase( + suite_id="suite", + workload_id="input", + config_id="second", + repetition=0, + case_id="second", + detection_artifact_path=str(second), + ), + ] + + result = tool.combine_detection_artifact_analysis(cases, destination) + + assert result == destination + assert destination.read_text(encoding="utf-8") == '{"case_id":"one"}\n{"case_id":"two"}\n' + + +def test_benchmark_spec_validates_matrix_references(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool", REPO_ROOT / "tools/measurement/run_benchmarks.py") + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-suite +workloads: + - id: biography + source: input.csv +configs: + - id: redact + replace: redact +matrix: + - workload: missing + config: redact +""", + encoding="utf-8", + ) + + with pytest.raises(ValidationError, match="unknown workload"): + tool.load_spec(spec_path) + + +def test_benchmark_partial_rewrite_goal_uses_public_defaults() 
-> None: + tool = load_tool("measurement_benchmark_tool_defaults", REPO_ROOT / "tools/measurement/run_benchmarks.py") + + rewrite = tool.build_rewrite(tool.RewriteSpec(protect="Direct payroll identifiers")) + + assert rewrite.privacy_goal.protect == "Direct payroll identifiers" + assert rewrite.privacy_goal.preserve == DEFAULT_PRESERVE_TEXT + + +def test_benchmark_output_dir_requires_overwrite_for_existing_files(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_output", REPO_ROOT / "tools/measurement/run_benchmarks.py") + output_dir = tmp_path / "benchmark-output" + output_dir.mkdir() + existing = output_dir / "summary.json" + existing.write_text("{}", encoding="utf-8") + + with pytest.raises(ValueError, match="not empty"): + tool.prepare_output_dir(output_dir, overwrite=False, dry_run=False) + + tool.prepare_output_dir(output_dir, overwrite=True, dry_run=False) + + assert (output_dir / "raw").is_dir() + assert not existing.exists() + + +def test_benchmark_dry_run_expands_cases_without_writing(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_dry_run", REPO_ROOT / "tools/measurement/run_benchmarks.py") + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: smoke-suite +workloads: + - id: biography + source: biographies.csv + text_column: biography +configs: + - id: redact + replace: redact +matrix: + - workload: biography + config: redact + repetitions: 2 +""", + encoding="utf-8", + ) + _copy_biography_data(tmp_path, "biographies.csv") + output_dir = tmp_path / "dry-run-output" + + result = tool.run_or_plan( + spec_path, + output=output_dir, + overwrite=False, + dry_run=True, + export=False, + fail_fast=False, + ) + + assert len(result.cases) == 2 + assert result.table_dir is None + assert {case.status for case in result.cases} == {tool.CaseStatus.planned} + assert not output_dir.exists() + + +def test_benchmark_preflight_rejects_missing_text_column(tmp_path: Path) -> None: + tool = 
load_tool("measurement_benchmark_tool_preflight_input", REPO_ROOT / "tools/measurement/run_benchmarks.py") + input_path = tmp_path / "input.csv" + pd.DataFrame({"body": ["Alice works at Acme"]}).to_csv(input_path, index=False) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-input-suite +workloads: + - id: biography + source: input.csv + text_column: text +configs: + - id: redact + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="workload 'biography' text_column 'text' not found"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_build_input_materializes_sliced_csv_workload(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_sliced_input", REPO_ROOT / "tools/measurement/run_benchmarks.py") + input_path = tmp_path / "input.csv" + pd.DataFrame({"id": ["a", "b", "c", "d"], "text": ["row-a", "row-b", "row-c", "row-d"]}).to_csv( + input_path, index=False + ) + workload = tool.WorkloadSpec( + id="slice", + source="input.csv", + text_column="text", + id_column="id", + row_offset=1, + row_limit=2, + ) + + anonymizer_input = tool.build_input( + workload, + tmp_path, + slice_dir=tmp_path / "slices", + case_id="slice__redact__r000", + ) + + assert anonymizer_input.text_column == "text" + assert anonymizer_input.id_column == "id" + assert Path(anonymizer_input.source) != input_path + sliced = pd.read_csv(anonymizer_input.source) + assert sliced.to_dict("records") == [ + {"id": "b", "text": "row-b"}, + {"id": "c", "text": "row-c"}, + ] + + +def test_benchmark_preflight_rejects_sliced_remote_workload(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_sliced_remote", REPO_ROOT / "tools/measurement/run_benchmarks.py") + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-remote-slice +workloads: + - id: remote + source: s3://bucket/input.csv + row_limit: 2 +configs: + - id: redact + replace: 
redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="row slicing requires a local workload source"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_rejects_bad_model_alias_references(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_preflight_models", REPO_ROOT / "tools/measurement/run_benchmarks.py") + _copy_biography_data(tmp_path) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-model-suite +model_configs: | + selected_models: + detection: + entity_detector: detector + entity_validator: [validator] + entity_augmenter: augmenter + replace: + replacement_generator: missing-replacer + model_configs: + - alias: detector + model: test/detector + - alias: validator + model: test/validator + - alias: augmenter + model: test/augmenter +workloads: + - id: biography + source: input.csv + text_column: biography +configs: + - id: substitute + replace: substitute +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="missing-replacer"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_rejects_missing_evaluate_model_alias(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_preflight_evaluate_models", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + _copy_biography_data(tmp_path) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-evaluate-model-suite +model_configs: | + selected_models: + detection: + entity_detector: detector + entity_validator: [validator] + entity_augmenter: augmenter + evaluate: + detection_validity_judge: missing-evaluator + model_configs: + - alias: detector + model: test/detector + - alias: validator + model: test/validator + - alias: augmenter + model: test/augmenter +workloads: + - id: biography + source: input.csv + text_column: biography +configs: + - 
id: redact-evaluate + replace: redact + evaluate: true +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="evaluate.detection_validity_judge='missing-evaluator'"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_example_suites_are_portable() -> None: + example_paths = sorted((REPO_ROOT / "tools/measurement/examples").glob("*.yaml")) + assert example_paths + + allowed_public_endpoints = {"https://integrate.api.nvidia.com/v1"} + machine_specific_fragments = ( + "/root/", + "/Users/", + "/stable-cache/", + "gpu-dev-pod", + "serve-svc", + ) + path_fields = {"source", "model_configs", "model_providers", "artifact_path"} + + def walk(value: Any) -> Iterator[tuple[str, Any]]: + if isinstance(value, dict): + for key, item in value.items(): + yield str(key), item + yield from walk(item) + elif isinstance(value, list): + for item in value: + yield from walk(item) + + for example_path in example_paths: + payload = yaml.safe_load(example_path.read_text(encoding="utf-8")) + assert isinstance(payload, dict) + + for key, value in walk(payload): + if isinstance(value, str): + assert not any(fragment in value for fragment in machine_specific_fragments), ( + f"{example_path} contains machine-specific value for {key}: {value}" + ) + if key in path_fields: + assert not Path(value).is_absolute(), f"{example_path} uses absolute path for {key}: {value}" + if key in {"endpoint", "gliner_endpoint"}: + assert value in allowed_public_endpoints, ( + f"{example_path} should use an approved portable endpoint for {key}: {value}" + ) + + +def test_benchmark_ci_dd_trace_options_match_runner_enum() -> None: + tool = load_tool("measurement_benchmark_tool_ci", REPO_ROOT / "tools/measurement/run_benchmarks.py") + workflow = yaml.safe_load((REPO_ROOT / ".github/workflows/benchmark-ci.yml").read_text(encoding="utf-8")) + on_section = workflow.get("on", workflow.get(True)) + + options = 
on_section["workflow_dispatch"]["inputs"]["dd_trace"]["options"] + + assert options == [mode.value for mode in tool.DDTraceMode] + + +def test_benchmark_preflight_rejects_bad_provider_config(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_preflight_providers", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + _copy_biography_data(tmp_path) + provider_path = tmp_path / "providers.yaml" + provider_path.write_text("not_providers: []\n", encoding="utf-8") + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: bad-provider-suite +model_providers: providers.yaml +workloads: + - id: biography + source: input.csv + text_column: biography +configs: + - id: redact + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + with pytest.raises(ValueError, match="providers"): + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_preflight_accepts_provider_config_path(tmp_path: Path) -> None: + tool = load_tool( + "measurement_benchmark_tool_preflight_provider_path", REPO_ROOT / "tools/measurement/run_benchmarks.py" + ) + _copy_biography_data(tmp_path) + provider_path = tmp_path / "providers.yaml" + provider_path.write_text( + """ +providers: + - name: test-provider + endpoint: https://example.com/v1 + provider_type: openai + api_key: TEST_API_KEY +""", + encoding="utf-8", + ) + spec_path = tmp_path / "suite.yaml" + spec_path.write_text( + """ +suite_id: provider-path-suite +model_providers: providers.yaml +workloads: + - id: biography + source: input.csv + text_column: biography +configs: + - id: redact + replace: redact +""", + encoding="utf-8", + ) + spec = tool.load_spec(spec_path) + + tool.preflight_suite(spec, spec_path=spec_path) + + +def test_benchmark_case_passes_dd_trace_config_to_measurement_session( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool("measurement_benchmark_tool_dd_trace", REPO_ROOT / 
"tools/measurement/run_benchmarks.py") + captured: list[Any] = [] + + @contextmanager + def fake_measurement_session(config: Any) -> Iterator[None]: + captured.append(config) + yield None + + class FakeAnonymizer: + def run(self, *, config: Any, data: Any) -> None: + assert config.replace is not None + assert data.text_column == "text" + + monkeypatch.setattr(tool, "configured_measurement_session", fake_measurement_session) + + spec = _minimal_benchmark_spec( + tool, + suite_id="trace-suite", + run_tags={"commit_sha": "abc123", "pipeline_id": "456"}, + ) + _write_text_input(tmp_path) + case = _minimal_benchmark_case(tool, suite_id="trace-suite") + trace_path = tmp_path / "traces" / "input__redact__r000.jsonl" + task_trace_path = tmp_path / "task-traces" / "input__redact__r000.jsonl" + + tool._execute_case( + FakeAnonymizer(), + spec.workloads[0], + spec.configs[0], + raw_path=tmp_path / "raw" / "input__redact__r000.jsonl", + trace_path=trace_path, + task_trace_path=task_trace_path, + case=case, + spec=spec, + base_dir=tmp_path, + dd_trace=tool.DDTraceMode.all_messages, + ) + + assert len(captured) == 1 + assert captured[0].dd_trace == "all_messages" + assert captured[0].dd_trace_path == trace_path + assert captured[0].dd_task_trace_path == task_trace_path + assert captured[0].streaming is True + assert captured[0].keep_records is False + assert captured[0].run_tags == { + "suite_id": "trace-suite", + "workload_id": "input", + "config_id": "redact", + "repetition": 0, + "case_id": "input__redact__r000", + "commit_sha": "abc123", + "pipeline_id": "456", + } + + +def test_benchmark_case_can_run_optional_evaluation( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + tool = load_tool("measurement_benchmark_tool_evaluate", REPO_ROOT / "tools/measurement/run_benchmarks.py") + from anonymizer.interface.results import AnonymizerResult + + calls: list[Any] = [] + run_result = AnonymizerResult( + dataframe=pd.DataFrame({"text": ["Alice works at Acme"]}), + 
trace_dataframe=pd.DataFrame({"text": ["Alice works at Acme"]}), + resolved_text_column="text", + failed_records=[], + replace_method=None, + ) + + @contextmanager + def fake_measurement_session(_config: Any) -> Iterator[None]: + yield None + + class FakeAnonymizer: + def run(self, *, config: Any, data: Any) -> object: + calls.append(("run", config.replace, data.text_column)) + return run_result + + def evaluate(self, result: object) -> object: + calls.append(("evaluate", result)) + return result + + monkeypatch.setattr(tool, "configured_measurement_session", fake_measurement_session) + + spec = _minimal_benchmark_spec( + tool, + suite_id="evaluate-suite", + configs=[tool.ConfigSpec(id="redact", replace="redact", evaluate=True)], + ) + _write_text_input(tmp_path) + case = _minimal_benchmark_case(tool, suite_id="evaluate-suite") + + tool._execute_case( + FakeAnonymizer(), + spec.workloads[0], + spec.configs[0], + raw_path=tmp_path / "raw" / "input__redact__r000.jsonl", + trace_path=None, + task_trace_path=None, + case=case, + spec=spec, + base_dir=tmp_path, + dd_trace=tool.DDTraceMode.none, + ) + + assert calls == [("run", tool.Redact(), "text"), ("evaluate", run_result)] + + +def test_benchmark_optional_evaluation_records_sanitized_judge_metrics(tmp_path: Path) -> None: + tool = load_tool("measurement_benchmark_tool_evaluate_metrics", REPO_ROOT / "tools/measurement/run_benchmarks.py") + from anonymizer.interface.results import AnonymizerResult + + dangerous_values = [ + "alice@example.com", + "bob@example.com", + "sk-secret-123", + "replacement-output-secret", + "nested-malformed-secret", + "raw judge prompt", + "raw judge response", + ] + run_result = AnonymizerResult( + dataframe=pd.DataFrame({"text": ["Alice has sk-secret-123"]}), + trace_dataframe=pd.DataFrame({"text": ["Alice has sk-secret-123"]}), + resolved_text_column="text", + failed_records=[], + replace_method=None, + ) + evaluated_public_columns = { + "text": ["Alice has sk-secret-123"], + 
"text_replaced": ["Avery has replacement-output-secret"], + "final_entities": [[{"value": "alice@example.com", "label": "email"}]], + "detection_valid": [False], + "detection_invalid_entities": [{"invalid_entities": [{"value": "alice@example.com", "label": "email"}]}], + "type_fidelity_valid": [False], + "type_fidelity_invalid_replacements": [ + {"invalid_replacements": [{"original": "alice@example.com", "synthetic": "bob@example.com"}]} + ], + "relational_consistency_valid": [False], + "relational_consistency_invalid_relations": [{"invalid_relations": [{"reasoning": "raw judge response"}]}], + "attribute_fidelity_valid": [False], + "attribute_fidelity_invalid_entities": ['[{"entity": "nested-malformed-secret"}'], + } + evaluated_result = AnonymizerResult( + dataframe=pd.DataFrame(evaluated_public_columns), + trace_dataframe=pd.DataFrame( + { + **evaluated_public_columns, + "_detection_judge": [ + { + "prompt": "raw judge prompt", + "response": "raw judge response", + "invalid_entities": [{"value": "alice@example.com"}], + } + ], + "_type_fidelity_judge": [ + {"invalid_replacements": [{"original": "alice@example.com", "synthetic": "bob@example.com"}]} + ], + } + ), + resolved_text_column="text", + failed_records=[], + replace_method=None, + ) + + class FakeAnonymizer: + def run(self, *, config: Any, data: Any) -> AnonymizerResult: + return run_result + + def evaluate(self, result: AnonymizerResult) -> AnonymizerResult: + assert result is run_result + return evaluated_result + + spec = _minimal_benchmark_spec( + tool, + suite_id="evaluate-suite", + configs=[ + tool.ConfigSpec( + id="substitute", + replace=tool.ReplaceSpec(strategy=tool.ReplaceKind.substitute), + evaluate=True, + ) + ], + ) + _write_text_input(tmp_path, "Alice has sk-secret-123") + case = _minimal_benchmark_case(tool, suite_id="evaluate-suite", config_id="substitute") + measurement_path = tmp_path / "raw" / "input__substitute__r000.jsonl" + + tool._execute_case( + FakeAnonymizer(), + 
spec.workloads[0], + spec.configs[0], + raw_path=measurement_path, + trace_path=None, + task_trace_path=None, + case=case, + spec=spec, + base_dir=tmp_path, + dd_trace=tool.DDTraceMode.none, + ) + + serialized = measurement_path.read_text(encoding="utf-8") + rows = [json.loads(line) for line in serialized.splitlines()] + evaluation_rows = [row for row in rows if row["record_type"] == "evaluation_record"] + + assert len(evaluation_rows) == 1 + assert { + "record_type": "evaluation_record", + "mode": "replace", + "strategy": "Substitute", + "row_index": 0, + "detection_valid": False, + "detection_invalid_entity_count": 1, + "type_fidelity_valid": False, + "type_fidelity_invalid_replacement_count": 1, + "relational_consistency_valid": False, + "relational_consistency_invalid_relation_count": 1, + "attribute_fidelity_valid": False, + "attribute_fidelity_invalid_entity_count": 0, + }.items() <= evaluation_rows[0].items() + forbidden_fields = { + "text", + "text_replaced", + "text_with_spans", + "final_entities", + "detection_invalid_entities", + "type_fidelity_invalid_replacements", + "relational_consistency_invalid_relations", + "attribute_fidelity_invalid_entities", + "_detection_judge", + "_type_fidelity_judge", + "_relational_consistency_judge", + "_attribute_fidelity_judge", + } + assert forbidden_fields.isdisjoint(evaluation_rows[0]) + for raw_value in dangerous_values: + assert raw_value not in serialized + + table_dir = tmp_path / "tables" + tool.export_measurement_tables(measurement_path, table_dir) + exported = pd.read_parquet(table_dir / "evaluation_record.parquet") + exported_text = str(exported.to_json(orient="records")) + + assert forbidden_fields.isdisjoint(exported.columns) + for raw_value in dangerous_values: + assert raw_value not in exported_text diff --git a/tools/measurement/AGENTS.md b/tools/measurement/AGENTS.md new file mode 100644 index 00000000..a2db7203 --- /dev/null +++ b/tools/measurement/AGENTS.md @@ -0,0 +1,46 @@ + + + +# Measurement tool 
agent notes + +This directory is for benchmark and analysis tooling around Anonymizer. Keep +product behavior in `src/anonymizer` and keep benchmark-only strategy switches +inside `tools/measurement`. + +## Boundaries + +- The measurement layer records facts about Anonymizer runs. It should not + decide production defaults. +- `run_benchmarks.py` owns local benchmark suite execution, preflight checks, + per-case raw shards, and measurement export. +- Direct and staged probe scripts are prompt/runtime experiments. Promote a + probe into `run_benchmarks.py` only after it has stable artifacts, analysis + fields, and regression coverage. +- Distributed DataDesigner execution belongs outside this directory. Detection + export APIs build configs for an external runtime; the measurement tools + should analyze the artifacts that runtime writes. +- Shared command-line concerns live in `measurement_tools/`: CLI logging, + output formats, table writing, and small numeric aggregations. Do not + redefine `LogFormat`, `ExportFormat`, bad-input logging, or model-row table + export in each script. +- Prefer explicit specs and functions over analyzer base classes. A script + should own its row models, parsing, and metric semantics; shared helpers + should own boring IO/aggregation policy. + +## Tests + +Prefer fixtures that look like tool inputs over large constructed tables inside +test functions. For analysis tools, checked-in fixture directories under +`tests/fixtures/measurement/` are easier to review than hundreds of inline +JSON-like rows. + +Keep tests focused on contracts: + +- input files accepted by the tool +- output table shape and key grouping fields +- safety gates and verdicts +- sensitive values excluded from sanitized analysis output +- preflight failures for user-actionable mistakes + +Avoid exhaustive assertions for every derived metric in one test. Add a focused +test when a metric has non-obvious behavior or has regressed before. 
diff --git a/tools/measurement/README.md b/tools/measurement/README.md new file mode 100644 index 00000000..f98f5360 --- /dev/null +++ b/tools/measurement/README.md @@ -0,0 +1,438 @@ + + + +# Measurement tools + +This directory contains developer tools for measuring Anonymizer runs and +exporting measurement JSONL to tables. Run the tools inside the project +environment, either with an activated venv or through `uv run`. + +Use these tools when you need evidence about cost, latency, reliability, or +anonymization quality. They are not product entry points. + +## Quick export to DataFrames or CSV + +Start here when you have a `measurements.jsonl` file and want to analyze it in +pandas, Polars, a spreadsheet, or another local tool. + +```bash +uv run python tools/measurement/export_measurements.py \ + benchmark-runs/suite/measurements.jsonl \ + --output benchmark-runs/suite/tables \ + --overwrite +``` + +By default, the exporter writes one Parquet table per measurement record type +plus `manifest.json`: + +- `run.parquet` +- `stage.parquet` +- `record.parquet` +- `evaluation_record.parquet` when replace judge evaluation is enabled +- `ndd_workflow.parquet` when DataDesigner adapter records are present +- `model_workflow.parquet` when direct model workflow records are present + +Use CSV or JSONL when those are easier to inspect: + +```bash +uv run python tools/measurement/export_measurements.py \ + benchmark-runs/suite/measurements.jsonl \ + --output benchmark-runs/suite/tables-csv \ + --format csv \ + --overwrite +``` + +Then load the tables directly: + +```python +import pandas as pd + +records = pd.read_parquet("benchmark-runs/suite/tables/record.parquet") +stages = pd.read_parquet("benchmark-runs/suite/tables/stage.parquet") +ndd = pd.read_parquet("benchmark-runs/suite/tables/ndd_workflow.parquet") +``` + +You can also read the raw log, but the exporter is the better default because +it splits records by `record_type` and normalizes nested fields into columns. 
+ +```python +import pandas as pd + +raw = pd.read_json("benchmark-runs/suite/measurements.jsonl", lines=True) +``` + +## System overview + +The measurement system has three layers: + +- Instrumentation in Anonymizer emits JSONL records for runs, stages, + DataDesigner workflows, direct model workflows, per-record safety metrics, + and optional sanitized replace-judge evaluation metrics. +- Benchmark runners create repeatable workloads and write those JSONL records + plus optional sidecars such as detection artifacts and DataDesigner traces. +- Analysis tools convert raw run artifacts into case, group, and model tables. + +External/distributed execution is a separate boundary. Detection export APIs are +responsible for building DataDesigner configs that an external runtime can +execute. The tools here should consume the resulting measurement JSONL, +detection artifacts, and trace sidecars; they should not own SLURM +orchestration or distributed DataDesigner execution. + +## Tool map + +| Task | Tool | +| --- | --- | +| Export raw measurement JSONL to tables | `export_measurements.py` | +| Run repeatable Anonymizer suites | `run_benchmarks.py` | +| Inspect detection artifact sidecars | `analyze_detection_artifacts.py` | +| Analyze benchmark output directories | `analyze_benchmark_output.py` | + +Most workflows start with `run_benchmarks.py`, then either export the raw +measurement log with `export_measurements.py` or summarize the benchmark output +directory with `analyze_benchmark_output.py`. + +## Implementation shape + +The scripts keep workload-specific row models and metric logic local, but share +boring command and export policy through `measurement_tools/`: + +- `measurement_tools.cli`: `LogFormat`, logging setup, and structured + bad-input errors. +- `measurement_tools.tables`: `ExportFormat`, model-row table specs, manifest + writing, and CSV/Parquet/JSONL output. +- `measurement_tools.stats`: small numeric helpers used by analysis groupers. 
+ +This is intentionally composition-based. New analysis tools should declare +their own row models and call the shared helpers rather than inheriting from a +common analyzer base class. + +## Benchmark runner + +`run_benchmarks.py` runs repeatable Anonymizer workloads and writes the same +measurement JSONL format, one raw file per benchmark case plus a combined +`measurements.jsonl`. + +```bash +uv run python tools/measurement/run_benchmarks.py suite.yaml --output benchmark-runs/suite +uv run python tools/measurement/run_benchmarks.py suite.yaml --dry-run --json +uv run python tools/measurement/run_benchmarks.py suite.yaml \ + --output benchmark-runs/suite \ + --dd-trace last_message +uv run python tools/measurement/run_benchmarks.py suite.yaml \ + --output benchmark-runs/suite \ + --dd-task-trace +``` + +The repo-data smoke suite can be run with DataDesigner traces enabled: + +```bash +bash tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh +``` + +The script writes to `/tmp/anonymizer-repo-data-smoke-dd-traces` by default. +Pass a different output directory as the first argument, or set +`DD_TRACE_MODE=all_messages` when full chat history is needed: + +```bash +DD_TRACE_MODE=all_messages \ + bash tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh \ + /tmp/anonymizer-repo-data-smoke-dd-traces-full +``` + +## Benchmark CI + +`.github/workflows/benchmark-ci.yml` runs the same benchmark runner from a +manual GitHub Actions dispatch. It targets the self-hosted +`anonymizer-evals` runner, checks out the requested ref, installs the project +environment, runs a suite, appends a short case summary to the GitHub step +summary, and uploads the full output directory as a workflow artifact. + +The job is intentionally manual. It runs only through `workflow_dispatch`; it +does not run on `push`, `pull_request`, `schedule`, or the default PR CI path. 
+GitHub exposes manual dispatch only after the workflow file exists on the +repository default branch. After that, launch it from the Actions UI, GitHub +CLI, or API. + +The default suite is `tools/measurement/examples/repo-data-smoke.yaml`. Dispatch +inputs let operators choose the ref, suite path, output directory, +DataDesigner message trace mode, sanitized scheduler task traces, and +fail-fast behavior. The workflow requires the repository secret +`NVIDIA_API_KEY` because the default model configuration uses NVIDIA-hosted +models. + +The `ref` input defaults to `main`. To benchmark a PR or experiment branch, set +`ref` to that branch name or commit SHA. The workflow checks out that ref and +uses the benchmark runner and suite files from the checkout, so the selected ref +must contain `tools/measurement/run_benchmarks.py` and the requested suite path. + +Benchmark suites are YAML files. Alongside suite-level settings such as +`suite_id`, `model_configs`, `model_providers`, and `run_tags`, each suite has +three main parts: + +- `workloads`: input datasets and text-column metadata. +- `configs`: Anonymizer replace or rewrite configurations. +- `matrix`: optional workload/config pairs and repetition counts. When omitted, + every workload is crossed with every config once.
+ +Example: + +```yaml +suite_id: biography-smoke +model_configs: ./model-configs.yaml +model_providers: ./providers.yaml +run_tags: + anonymizer_ref: main + commit_sha: abc123 + pipeline_id: "456" +case_retries: 1 +case_retry_backoff_sec: 10 +workloads: + - id: biographies + source: ./data/biographies.csv + text_column: text + row_limit: 25 + - id: support + source: ./data/support.csv + text_column: body + id_column: ticket_id + row_offset: 100 + row_limit: 50 +configs: + - id: redact-default + replace: redact + evaluate: true + - id: hash-agent-labels + detect: + entity_labels: [person, email, api_key, password] + replace: + strategy: hash + digest_length: 12 + - id: rewrite-low-risk + rewrite: + risk_tolerance: low + max_repair_iterations: 1 +matrix: + - workload: biographies + config: redact-default + repetitions: 3 + - workload: support + config: hash-agent-labels +``` + +Use `row_limit` and `row_offset` to create cheap, repeatable slices of a local +CSV or Parquet workload. The runner materializes a per-case sliced input under +`raw/inputs/` before calling Anonymizer, so each case keeps a stable input file +even when the matrix has multiple configs or repetitions. Slicing is rejected +for URL-like sources because the runner cannot safely materialize a local +subset without downloading the whole dataset first. + +Relative paths in suite files are resolved from the suite file's directory. +The runner refuses to write into a non-empty output directory unless +`--overwrite` is set. By default it also exports Parquet tables into `tables/`; +pass `--no-export` when only raw measurement JSONL is needed. + +Set `model_configs` and `model_providers` explicitly in checked-in or CI suites. +Relying on Anonymizer defaults makes a run depend on the caller's installed +defaults and provider environment. In provider YAML, put environment variable +names such as `NVIDIA_API_KEY` in `api_key`; do not commit raw keys. 
The bundled +`repo-data-smoke.yaml` follows this pattern with adjacent model/provider files. + +Use `run_tags` for stable suite-level metadata copied into every measurement +record, such as source refs, commit SHAs, CI pipeline IDs, topology labels, or +benchmark-suite revisions. The runner reserves `suite_id`, `workload_id`, +`config_id`, `repetition`, and `case_id` for its own case identity tags. + +Set `evaluate: true` on a replace config when the benchmark should run +`Anonymizer.evaluate()` after `run()` and capture the LLM-as-judge work in the +same case. This is intentionally replace-only for now; rewrite runs already +perform their internal evaluation/repair loop during `run()`. + +When evaluation is enabled, the safe measurement log includes +`evaluation_record` rows with judge verdict booleans and invalid-item counts. +It does not persist the evaluated result dataframe or trace dataframe. Those +dataframes can contain original text, entity values, replacement values, raw +judge outputs, prompts, and model responses. + +Before starting a real run, the benchmark runner performs cheap preflight +checks: suite/config parsing, local dataset existence, CSV/Parquet text-column +metadata, provider YAML shape, and active model-alias references. `--dry-run` +runs those same checks, expands the planned matrix, and skips output-dir writes +and model work. + +Use `case_retries` and `case_retry_backoff_sec` for long-running suites on +shared model endpoints. Retries are disabled by default. When enabled, a failed +case is retried with the same `case_id` and output paths; the final case still +records `attempt_count` and `attempt_errors` in `summary.json`. With +`--fail-fast`, the suite stops at the first failed case and does not retry it. + +## DataDesigner traces + +For debugging DataDesigner calls, pass `--dd-trace last_message` or +`--dd-trace all_messages`. Trace records are written separately from sanitized +measurements, under `traces/{case_id}.jsonl` by default.
Use `--trace-dir` to +choose another directory. `last_message` stores only the final prompt message +for each DataDesigner model call; `all_messages` stores the full message list. + +DataDesigner traces may contain raw input text, prompts, model outputs, entity +values, replacement values, secrets, and PII. Treat them as debug artifacts: +keep them out of shared benchmark bundles unless they have been reviewed or +redacted. + +Anonymizer requests standard LLM-column traces through DataDesigner native LLM +column trace side effects. That covers `LLMTextColumnConfig` and +`LLMStructuredColumnConfig`. Model-backed `CustomColumnConfig` generator +functions are traced through a temporary Anonymizer shim that instruments the +per-run DataDesigner model registry and returned model facades. This is a +brittle bridge over private DataDesigner internals until DataDesigner exposes a +public model-call trace sink. + +Safe measurement output includes a `dd_trace_coverage` record with native, +private-facade, and unsupported column counts so trace-enabled runs can detect +which path covered each workflow. + +## DataDesigner scheduler task traces + +Pass `--dd-task-trace` to collect sanitized DataDesigner async scheduler task +timing records. The benchmark runner writes one sidecar per case under +`task-traces/{case_id}.jsonl` by default; use `--task-trace-dir` to choose +another directory. + +Task trace records are separate from raw message traces. They include scheduler +metadata such as workflow name, column, row group, row index, task type, status, +relative dispatch/slot-acquired/completion offsets, queue wait time, execution +time, total time, and whether an error was present. They intentionally do not +store raw DataDesigner error strings because those can contain prompts, outputs, +or source values. + +Offsets are relative to the earliest positive `dispatched_at` timestamp in each +DataDesigner workflow trace batch written into the case sidecar.
They are meant +for timeline analysis without storing host-specific wall-clock timestamps. + +```bash +uv run python tools/measurement/run_benchmarks.py \ + suite.yaml \ + --output benchmark-runs/suite \ + --dd-task-trace +``` + +## Benchmark analysis + +`analyze_benchmark_output.py` joins `measurements.jsonl`, optional +DataDesigner traces, and detection artifact sidecars into richer case/group +tables: + +```bash +uv run python tools/measurement/analyze_benchmark_output.py \ + benchmark-runs/suite-id \ + --output benchmark-runs/suite-id/analysis \ + --format csv +``` + +Important outputs: + +- `case_analysis.*`: one row per benchmark case. +- `group_analysis.*`: median and aggregate metrics grouped by workload/config. +- `model_usage.*`: one row per measured model usage entry. +- `model_usage_group_analysis.*`: model usage rolled up by workflow/model. + +Use `--detection-artifacts` to provide an explicit detection artifact JSONL +sidecar. Otherwise, the analyzer reads `detection-artifacts.jsonl` in the +benchmark directory when present. + +## Pandas patterns + +Analysis tables are regular CSV/Parquet files. A typical local workflow: + +```python +import pandas as pd + +cases = pd.read_parquet("benchmark-runs/suite/analysis/case_analysis.parquet") +groups = pd.read_parquet("benchmark-runs/suite/analysis/group_analysis.parquet") + +cols = [ + "workload_id", + "config_id", + "median_pipeline_elapsed_sec", + "median_observed_total_requests", + "median_observed_total_tokens", + "median_artifact_final_entity_signature_count", +] +print(groups[cols].sort_values(["workload_id", "median_pipeline_elapsed_sec"])) + +failures = cases[ + (cases["case_failed"]) | + (cases["observed_failed_requests"] > 0) | + (cases["dd_trace_error_count"] > 0) +] +print(failures[["case_id", "config_id", "observed_failed_requests", "dd_trace_error_count"]]) +``` + +## Metric interpretation + +Use metrics as signals, not as a single score. 
+ +Latency and throughput: + +- `elapsed_sec`: wall time for a measured stage or workflow. +- `pipeline_elapsed_sec`: end-to-end Anonymizer wall time for a case. +- `records_per_pipeline_sec`: completed input records per pipeline second. +- `input_text_tokens_per_pipeline_sec`: input text tokens processed per + pipeline second. + +Model work: + +- `observed_total_requests`: measured model requests from DataDesigner or direct + model workflow records. +- `observed_total_tokens`: measured input plus output tokens. +- `observed_failed_requests`: provider-level failed requests. +- `observed_bridge_fallback_requests`: sync-client fallback requests recorded + from DataDesigner traces. +- `observed_non_bridge_failed_requests`: failed requests after subtracting + sync-client bridge fallbacks. Prefer this field when judging endpoint + reliability from trace-enabled runs. + +Detection artifacts: + +- `seed_entity_count`: detector or direct-seed candidate count before + validation. +- `seed_validation_candidate_count`: candidates sent to validation. +- `estimated_seed_validation_chunk_count`: estimated validator chunks from the + active validation chunk size. +- `augmented_entity_count`: augmenter suggestions. +- `augmented_new_final_value_count`: augmenter suggestions that add values not + already present in the seed/final set. +- `artifact_final_detector_entity_count` and + `artifact_final_augmenter_entity_count`: final entity source counts derived + from detection artifact sidecars. +- `artifact_final_entity_signature_count` and + `artifact_final_entity_signature_hashes`: opaque final-span signatures derived + from detection artifacts. These do not include raw entity values. + +Safety and replacement: + +- `original_value_leak_count`: count of protected original values still present + in replaced output. +- `replacement_missing_final_entity_count`: final entity occurrences whose + original value has no replacement-map entry. 
+- `replacement_missing_final_value_count`: unique final entity values with no + replacement-map entry. +- `replacement_synthetic_original_collision_count`: final entity occurrences + whose original value was reused as a synthetic replacement value elsewhere in + the same record. + +Replace judge evaluation: + +- `detection_valid`, `type_fidelity_valid`, + `relational_consistency_valid`, and `attribute_fidelity_valid`: per-record + judge verdicts when `evaluate: true` is enabled. +- `detection_invalid_entity_count`, + `type_fidelity_invalid_replacement_count`, + `relational_consistency_invalid_relation_count`, and + `attribute_fidelity_invalid_entity_count`: counts of invalid judge findings. + These fields count structures returned by the judges but do not include raw + values, replacement strings, or judge reasoning text. +- `case_analysis` also includes per-case rollups for each judge family: + `{family}_judged_record_count`, `{family}_valid_record_count`, + `{family}_valid_rate`, and the corresponding invalid-count field. +- `group_analysis` includes grouped micro-rate rollups: + `sum_{family}_judged_record_count`, `sum_{family}_valid_record_count`, + `micro_{family}_valid_rate`, and `sum_{invalid_count_field}`. These rates are + computed from summed counts, not medians of case-level rates. diff --git a/tools/measurement/analyze_benchmark_output.py b/tools/measurement/analyze_benchmark_output.py new file mode 100644 index 00000000..434cef6f --- /dev/null +++ b/tools/measurement/analyze_benchmark_output.py @@ -0,0 +1,1552 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Analyze joined benchmark measurements and detection artifact sidecars. 
+ +Usage: + uv run python tools/measurement/analyze_benchmark_output.py benchmark-runs/suite-id + uv run python tools/measurement/analyze_benchmark_output.py benchmark-runs/suite-id --output analysis + uv run python tools/measurement/analyze_benchmark_output.py benchmark-runs/suite-id --detection-artifacts current.jsonl + uv run python tools/measurement/analyze_benchmark_output.py benchmark-runs/suite-id --json +""" + +from __future__ import annotations + +import json +import logging +import math +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Annotated, Any, cast + +import cyclopts +import pandas as pd +from measurement_tools.cli import LogFormat, configure_logging, log_bad_input +from measurement_tools.stats import median_or_none as _median_or_none +from measurement_tools.stats import none_if_nan as _none_if_nan +from measurement_tools.stats import sum_int_or_zero as _sum_int_or_zero +from measurement_tools.stats import sum_or_none as _sum_or_none +from measurement_tools.stats import sum_or_zero as _sum_or_zero +from measurement_tools.tables import AnalysisExportResult, ExportFormat, ModelTableSpec +from measurement_tools.tables import write_analysis_tables as _write_analysis_table_specs +from pydantic import BaseModel, Field, computed_field + +app = cyclopts.App(help=__doc__) +logger = logging.getLogger("measurement.benchmark_output") + +_SYNC_CLIENT_UNAVAILABLE_ERROR = "SyncClientUnavailableError" +_SIGNATURE_DETAIL_FIELDS = { + "label", + "source", + "row_index", + "start_position", + "end_position", + "value_length", +} + + +@dataclass(frozen=True) +class _EvaluationRollup: + prefix: str + valid_column: str + invalid_count_column: str + + +_EVALUATION_ROLLUPS = ( + _EvaluationRollup("detection", "detection_valid", "detection_invalid_entity_count"), + _EvaluationRollup("type_fidelity", "type_fidelity_valid", "type_fidelity_invalid_replacement_count"), + _EvaluationRollup( + "relational_consistency", + 
"relational_consistency_valid", + "relational_consistency_invalid_relation_count", + ), + _EvaluationRollup("attribute_fidelity", "attribute_fidelity_valid", "attribute_fidelity_invalid_entity_count"), +) + + +class CaseAnalysisRow(BaseModel): + suite_id: str | None = None + workload_id: str | None = None + workload_category: str | None = None + config_id: str | None = None + experimental_detection_strategy: str | None = None + experimental_replacement_strategy: str | None = None + dd_parser_compat: str | None = None + entity_label_set_id: str | None = None + entity_label_count: int | None = None + gliner_threshold: float | None = None + repetition: int | None = None + case_id: str + run_id: str + case_failed: bool = False + error_stage_count: int = 0 + error_ndd_workflow_count: int = 0 + error_model_workflow_count: int = 0 + pipeline_elapsed_sec: float | None = None + ndd_workflow_count: int = 0 + ndd_elapsed_sec_total: float = 0.0 + observed_total_requests: int = 0 + observed_successful_requests: int = 0 + observed_input_tokens: int = 0 + observed_output_tokens: int = 0 + observed_total_tokens: int = 0 + observed_failed_requests: int = 0 + observed_failed_request_rate: float | None = None + dd_trace_record_count: int = 0 + dd_trace_error_count: int = 0 + dd_trace_sync_client_unavailable_count: int = 0 + observed_bridge_fallback_requests: int | None = None + observed_non_bridge_total_requests: int | None = None + observed_non_bridge_failed_requests: int | None = None + observed_non_bridge_failed_request_rate: float | None = None + record_count: int = 0 + input_text_tokens_total: int | None = None + records_per_pipeline_sec: float | None = None + records_per_ndd_sec: float | None = None + input_text_tokens_per_pipeline_sec: float | None = None + input_text_tokens_per_ndd_sec: float | None = None + topology_endpoint_count: float | None = None + topology_gpu_count: float | None = None + topology_tensor_parallelism: float | None = None + topology_shard_count: float | 
None = None + input_text_tokens_per_endpoint_sec: float | None = None + input_text_tokens_per_gpu_sec: float | None = None + final_entity_count: float | None = None + empty_detection_count: int = 0 + empty_detection_rate: float | None = None + empty_detection_with_ground_truth_count: int = 0 + empty_detection_with_ground_truth_rate: float | None = None + ground_truth_record_count: int = 0 + ground_truth_entity_count: float | None = None + entity_true_positive_count: float | None = None + entity_false_positive_count: float | None = None + entity_false_negative_count: float | None = None + entity_precision: float | None = None + entity_recall: float | None = None + entity_f1: float | None = None + entity_relaxed_gt_found_count: float | None = None + entity_relaxed_detected_tp_count: float | None = None + entity_relaxed_label_compatible_gt_found_count: float | None = None + entity_relaxed_label_compatible_detected_tp_count: float | None = None + entity_relaxed_precision: float | None = None + entity_relaxed_recall: float | None = None + entity_relaxed_f1: float | None = None + entity_relaxed_label_compatible_precision: float | None = None + entity_relaxed_label_compatible_recall: float | None = None + entity_relaxed_label_compatible_f1: float | None = None + replacement_count: float | None = None + replacement_missing_final_entity_count: float | None = None + replacement_missing_final_entity_label_counts: dict[str, int] = Field(default_factory=dict) + replacement_missing_final_value_count: float | None = None + replacement_synthetic_original_collision_count: float | None = None + replacement_synthetic_original_collision_label_counts: dict[str, int] = Field(default_factory=dict) + replacement_synthetic_original_collision_value_count: float | None = None + original_value_leak_count: float | None = None + original_value_leak_record_count: int = 0 + original_value_leak_label_counts: dict[str, int] = Field(default_factory=dict) + detection_judged_record_count: int = 0 + 
detection_valid_record_count: int = 0 + detection_valid_rate: float | None = None + detection_invalid_entity_count: int = 0 + type_fidelity_judged_record_count: int = 0 + type_fidelity_valid_record_count: int = 0 + type_fidelity_valid_rate: float | None = None + type_fidelity_invalid_replacement_count: int = 0 + relational_consistency_judged_record_count: int = 0 + relational_consistency_valid_record_count: int = 0 + relational_consistency_valid_rate: float | None = None + relational_consistency_invalid_relation_count: int = 0 + attribute_fidelity_judged_record_count: int = 0 + attribute_fidelity_valid_record_count: int = 0 + attribute_fidelity_valid_rate: float | None = None + attribute_fidelity_invalid_entity_count: int = 0 + validation_max_entities_per_call: int | None = None + detection_artifact_rows: int = 0 + seed_entity_count: float | None = None + seed_validation_candidate_count: float | None = None + estimated_seed_validation_chunk_count: float | None = None + augmented_entity_count: float | None = None + augmented_new_final_value_count: float | None = None + artifact_final_entity_count: float | None = None + artifact_final_detector_entity_count: float | None = None + artifact_final_augmenter_entity_count: float | None = None + artifact_final_entity_signature_count: float | None = None + artifact_final_entity_signature_hashes: list[str] = Field(default_factory=list) + artifact_final_entity_signature_labels: dict[str, str] = Field(default_factory=dict) + artifact_final_entity_signature_details: dict[str, dict[str, Any]] = Field(default_factory=dict) + + +class GroupAnalysisRow(BaseModel): + workload_id: str | None = None + workload_category: str | None = None + config_id: str | None = None + experimental_detection_strategy: str | None = None + experimental_replacement_strategy: str | None = None + entity_label_set_id: str | None = None + entity_label_count: int | None = None + gliner_threshold: float | None = None + case_count: int + failed_case_count: int 
= 0 + failed_case_rate: float | None = None + error_stage_count: int = 0 + error_ndd_workflow_count: int = 0 + error_model_workflow_count: int = 0 + median_pipeline_elapsed_sec: float | None = None + median_ndd_elapsed_sec_total: float | None = None + median_observed_total_requests: float | None = None + median_observed_successful_requests: float | None = None + median_observed_input_tokens: float | None = None + median_observed_output_tokens: float | None = None + median_observed_total_tokens: float | None = None + median_observed_failed_requests: float | None = None + median_observed_failed_request_rate: float | None = None + median_observed_bridge_fallback_requests: float | None = None + median_observed_non_bridge_total_requests: float | None = None + median_observed_non_bridge_failed_requests: float | None = None + median_observed_non_bridge_failed_request_rate: float | None = None + total_record_count: int = 0 + median_record_count: float | None = None + total_input_text_tokens: int | None = None + median_input_text_tokens_total: float | None = None + median_records_per_pipeline_sec: float | None = None + median_records_per_ndd_sec: float | None = None + median_input_text_tokens_per_pipeline_sec: float | None = None + median_input_text_tokens_per_ndd_sec: float | None = None + median_topology_endpoint_count: float | None = None + median_topology_gpu_count: float | None = None + median_topology_tensor_parallelism: float | None = None + median_topology_shard_count: float | None = None + median_input_text_tokens_per_endpoint_sec: float | None = None + median_input_text_tokens_per_gpu_sec: float | None = None + median_final_entity_count: float | None = None + total_empty_detection_count: int = 0 + empty_detection_rate: float | None = None + total_empty_detection_with_ground_truth_count: int = 0 + empty_detection_with_ground_truth_rate: float | None = None + total_ground_truth_record_count: int = 0 + sum_ground_truth_entity_count: float | None = None + 
sum_entity_true_positive_count: float | None = None + sum_entity_false_positive_count: float | None = None + sum_entity_false_negative_count: float | None = None + micro_entity_precision: float | None = None + micro_entity_recall: float | None = None + micro_entity_f1: float | None = None + sum_entity_relaxed_gt_found_count: float | None = None + sum_entity_relaxed_detected_tp_count: float | None = None + sum_entity_relaxed_label_compatible_gt_found_count: float | None = None + sum_entity_relaxed_label_compatible_detected_tp_count: float | None = None + micro_entity_relaxed_precision: float | None = None + micro_entity_relaxed_recall: float | None = None + micro_entity_relaxed_f1: float | None = None + micro_entity_relaxed_label_compatible_precision: float | None = None + micro_entity_relaxed_label_compatible_recall: float | None = None + micro_entity_relaxed_label_compatible_f1: float | None = None + median_entity_relaxed_f1: float | None = None + median_entity_relaxed_label_compatible_f1: float | None = None + median_replacement_missing_final_entity_count: float | None = None + median_replacement_missing_final_value_count: float | None = None + replacement_missing_final_entity_label_counts: dict[str, int] = Field(default_factory=dict) + median_replacement_synthetic_original_collision_count: float | None = None + median_replacement_synthetic_original_collision_value_count: float | None = None + replacement_synthetic_original_collision_label_counts: dict[str, int] = Field(default_factory=dict) + sum_original_value_leak_count: float | None = None + leaking_case_count: int = 0 + median_original_value_leak_count: float | None = None + sum_detection_judged_record_count: int = 0 + sum_detection_valid_record_count: int = 0 + micro_detection_valid_rate: float | None = None + sum_detection_invalid_entity_count: int = 0 + sum_type_fidelity_judged_record_count: int = 0 + sum_type_fidelity_valid_record_count: int = 0 + micro_type_fidelity_valid_rate: float | None = None + 
# One row of the model usage table: the request and token counters measured
# for a single model within a single benchmark case.
class ModelUsageAnalysisRow(BaseModel):
    # Run identity. Everything except case_id and run_id is optional.
    suite_id: str | None = None
    workload_id: str | None = None
    config_id: str | None = None
    experimental_detection_strategy: str | None = None
    experimental_replacement_strategy: str | None = None
    dd_parser_compat: str | None = None
    repetition: int | None = None
    case_id: str
    run_id: str
    # Which workflow and model the counters belong to; model_name is required.
    workflow_name: str | None = None
    model_alias: str | None = None
    model_name: str
    model_provider_name: str | None = None
    ndd_elapsed_sec: float | None = None
    # Request and token counters; integer counters default to 0.
    observed_total_requests: int = 0
    observed_successful_requests: int = 0
    observed_failed_requests: int = 0
    observed_input_tokens: int = 0
    observed_output_tokens: int = 0
    observed_total_tokens: int = 0
    # None (rather than 0) is the default for the two values below.
    observed_reasoning_tokens: int | None = None
    observed_failed_request_rate: float | None = None
# Top-level analysis result: the four row tables plus a computed row count
# for each of them.
class BenchmarkOutputAnalysis(BaseModel):
    # Benchmark directory, stored as a string.
    benchmark_dir: str
    # Path of the detection artifact sidecar; None is the default.
    detection_artifacts_path: str | None = None
    # Row tables; each defaults to an empty list.
    cases: list[CaseAnalysisRow] = Field(default_factory=list)
    groups: list[GroupAnalysisRow] = Field(default_factory=list)
    model_usage: list[ModelUsageAnalysisRow] = Field(default_factory=list)
    model_usage_groups: list[ModelUsageGroupAnalysisRow] = Field(default_factory=list)

    # Number of entries in `cases`.
    @computed_field
    @property
    def case_count(self) -> int:
        return len(self.cases)

    # Number of entries in `groups`.
    @computed_field
    @property
    def group_count(self) -> int:
        return len(self.groups)

    # Number of entries in `model_usage`.
    @computed_field
    @property
    def model_usage_count(self) -> int:
        return len(self.model_usage)

    # Number of entries in `model_usage_groups`.
    @computed_field
    @property
    def model_usage_group_count(self) -> int:
        return len(self.model_usage_groups)
def read_jsonl_table(path: Path, *, required: bool) -> pd.DataFrame:
    """Load a JSONL file into a flat table with dotted column names.

    Nested objects are flattened by ``pd.json_normalize`` with ``.`` as the
    separator, so ``{"run_tags": {"case_id": "x"}}`` yields a
    ``run_tags.case_id`` column.

    Args:
        path: JSONL file to read.
        required: When true, a missing file raises; when false, a missing
            file yields an empty DataFrame.

    Returns:
        The flattened table, or an empty DataFrame when the file is missing
        (and not required) or contains no rows.

    Raises:
        ValueError: If ``path`` is a directory, or if it does not exist while
            ``required`` is true.
    """
    if not path.exists():
        if required:
            raise ValueError(f"input path does not exist: {path}")
        return pd.DataFrame()
    if path.is_dir():
        raise ValueError(f"input path is a directory: {path}")
    # Turn off pandas' dtype and date inference so top-level values keep the
    # form they have in the file (an identifier such as "007" stays a string
    # instead of becoming the integer 7). read_trace_summary_table parses its
    # records with json.loads, so both readers now agree on join-key values.
    raw = pd.read_json(path, lines=True, dtype=False, convert_dates=False)
    if raw.empty:
        return raw
    return pd.json_normalize(raw.to_dict("records"), sep=".")
"run_tags.experimental_detection_strategy": run_tags.get("experimental_detection_strategy"), + "run_tags.experimental_replacement_strategy": run_tags.get("experimental_replacement_strategy"), + "run_tags.dd_parser_compat": run_tags.get("dd_parser_compat"), + "run_tags.repetition": run_tags.get("repetition"), + "workflow_name": record.get("workflow_name"), + "model_alias": record.get("model_alias"), + "status": record.get("status"), + "error_type": record.get("error_type"), + "is_async": record.get("is_async"), + } + ) + return pd.DataFrame(rows) + + +def _case_ids(*frames: pd.DataFrame) -> list[str]: + values: set[str] = set() + for dataframe in frames: + for column in ("run_tags.case_id", "case_id", "run_id"): + if column in dataframe.columns: + values.update(str(value) for value in dataframe[column].dropna().tolist()) + return sorted(values) + + +def _build_case_row( + case_id: str, + measurements: pd.DataFrame, + artifacts: pd.DataFrame, + traces: pd.DataFrame, +) -> CaseAnalysisRow: + measurement_rows = _rows_for_case(measurements, case_id) + artifact_rows = _rows_for_case(artifacts, case_id) + trace_rows = _rows_for_case(traces, case_id) + record_rows = _records_of_type(measurement_rows, "record") + evaluation_rows = _records_of_type(measurement_rows, "evaluation_record") + ndd_rows = _records_of_type(measurement_rows, "ndd_workflow") + model_rows = _model_workflow_rows(measurement_rows) + stage_rows = _records_of_type(measurement_rows, "stage") + pipeline_rows = _pipeline_stage_rows(measurement_rows) + validation_max_entities_per_call = _first_int([measurement_rows], ["detect.validation_max_entities_per_call"]) + request_metrics = _case_request_metrics(model_rows) + pipeline_elapsed_sec = _sum_or_none(pipeline_rows, "elapsed_sec") + ndd_elapsed_sec_total = _sum_or_zero(ndd_rows, "elapsed_sec") + record_count = len(record_rows) + input_text_tokens_total = _sum_int_or_none(record_rows, "text_length_tokens") + records_per_pipeline_sec = _safe_rate(record_count, 
pipeline_elapsed_sec) + records_per_ndd_sec = _safe_rate(record_count, ndd_elapsed_sec_total) + input_text_tokens_per_pipeline_sec = _safe_rate(input_text_tokens_total, pipeline_elapsed_sec) + input_text_tokens_per_ndd_sec = _safe_rate(input_text_tokens_total, ndd_elapsed_sec_total) + final_entity_count = _coalesce_number( + _sum_or_none(record_rows, "final_entity_count"), + _sum_or_none(artifact_rows, "final_entity_count"), + ) + return CaseAnalysisRow( + suite_id=_first_value([measurement_rows, artifact_rows, trace_rows], ["run_tags.suite_id", "suite_id"]), + workload_id=_first_value( + [measurement_rows, artifact_rows, trace_rows], ["run_tags.workload_id", "workload_id"] + ), + workload_category=_first_value( + [measurement_rows, artifact_rows, trace_rows], + ["run_tags.workload_category", "run_tags.dataset_category", "workload_category", "dataset_category"], + ), + config_id=_first_value([measurement_rows, artifact_rows, trace_rows], ["run_tags.config_id", "config_id"]), + experimental_detection_strategy=_first_value([measurement_rows], ["run_tags.experimental_detection_strategy"]), + experimental_replacement_strategy=_first_value( + [measurement_rows, trace_rows], + ["run_tags.experimental_replacement_strategy"], + ), + dd_parser_compat=_first_value([measurement_rows], ["run_tags.dd_parser_compat"]), + entity_label_set_id=_first_value( + [measurement_rows], + [ + "run_tags.entity_label_set_id", + "run_tags.entity_label_set", + "run_tags.label_set", + "detect.entity_label_source", + ], + ), + entity_label_count=_first_int([measurement_rows], ["run_tags.entity_label_count", "detect.entity_label_count"]), + gliner_threshold=_first_float([measurement_rows], ["run_tags.gliner_threshold", "detect.gliner_threshold"]), + repetition=_first_int([measurement_rows, artifact_rows, trace_rows], ["run_tags.repetition", "repetition"]), + case_id=case_id, + run_id=_first_value([measurement_rows, artifact_rows, trace_rows], ["run_id"]) or case_id, + 
**_case_failure_metrics(stage_rows=stage_rows, ndd_rows=ndd_rows, model_rows=model_rows), + pipeline_elapsed_sec=pipeline_elapsed_sec, + ndd_workflow_count=len(ndd_rows), + ndd_elapsed_sec_total=ndd_elapsed_sec_total, + **request_metrics, + **_case_trace_metrics(trace_rows, request_metrics=request_metrics), + record_count=record_count, + input_text_tokens_total=input_text_tokens_total, + records_per_pipeline_sec=records_per_pipeline_sec, + records_per_ndd_sec=records_per_ndd_sec, + input_text_tokens_per_pipeline_sec=input_text_tokens_per_pipeline_sec, + input_text_tokens_per_ndd_sec=input_text_tokens_per_ndd_sec, + **_case_topology_metrics( + measurement_rows, + input_text_tokens_per_pipeline_sec=input_text_tokens_per_pipeline_sec, + ), + final_entity_count=final_entity_count, + **_case_empty_detection_metrics(record_rows, record_count=record_count), + **_case_ground_truth_metrics(record_rows, final_entity_count=final_entity_count), + replacement_count=_sum_or_none(record_rows, "replacement_count"), + replacement_missing_final_entity_count=_sum_or_none(record_rows, "replacement_missing_final_entity_count"), + replacement_missing_final_entity_label_counts=_sum_prefixed_ints( + record_rows, + "replacement_missing_final_entity_label_counts.", + ), + replacement_missing_final_value_count=_sum_or_none(record_rows, "replacement_missing_final_value_count"), + replacement_synthetic_original_collision_count=_sum_or_none( + record_rows, + "replacement_synthetic_original_collision_count", + ), + replacement_synthetic_original_collision_label_counts=_sum_prefixed_ints( + record_rows, + "replacement_synthetic_original_collision_label_counts.", + ), + replacement_synthetic_original_collision_value_count=_sum_or_none( + record_rows, + "replacement_synthetic_original_collision_value_count", + ), + original_value_leak_count=_sum_or_none(record_rows, "original_value_leak_count"), + original_value_leak_record_count=_positive_count(record_rows, "original_value_leak_count"), + 
def _case_request_metrics(model_rows: pd.DataFrame) -> dict[str, int | float | None]:
    """Total the observed request/token columns of ``model_rows``.

    Each counter is the ``_sum_or_zero`` total of its column converted with
    ``int``; ``observed_failed_request_rate`` comes from
    ``_request_failure_rate`` applied to the failed and total request counts.
    """
    # Listed in the order the keys must appear in the returned mapping.
    counter_columns = (
        "observed_total_requests",
        "observed_successful_requests",
        "observed_input_tokens",
        "observed_output_tokens",
        "observed_total_tokens",
        "observed_failed_requests",
    )
    counters: dict[str, int] = {name: int(_sum_or_zero(model_rows, name)) for name in counter_columns}
    failure_rate = _request_failure_rate(
        failed=counters["observed_failed_requests"],
        total=counters["observed_total_requests"],
    )
    return {**counters, "observed_failed_request_rate": failure_rate}
def _case_failure_metrics(
    *,
    stage_rows: pd.DataFrame,
    ndd_rows: pd.DataFrame,
    model_rows: pd.DataFrame,
) -> dict[str, bool | int]:
    """Count error rows per record family and derive the case failure flag.

    ``case_failed`` is true when ``_error_status_count`` reports at least one
    error row among the stage, NDD workflow, or model workflow rows.
    """
    error_counts = {
        "error_stage_count": _error_status_count(stage_rows),
        "error_ndd_workflow_count": _error_status_count(ndd_rows),
        "error_model_workflow_count": _error_status_count(model_rows),
    }
    any_errors = any(count > 0 for count in error_counts.values())
    return {"case_failed": any_errors, **error_counts}
def _case_empty_detection_metrics(record_rows: pd.DataFrame, *, record_count: int) -> dict[str, int | float | None]:
    """Build the empty-detection counts and rates for one case.

    Counts come from ``_zero_count``, ``_non_null_count`` and
    ``_zero_with_positive_count`` over the ``final_entity_count`` and
    ``ground_truth_entity_count`` columns; rates are ``_safe_ratio`` of a count
    over ``record_count`` or over the ground-truth record count.
    """
    empty_total = _zero_count(record_rows, "final_entity_count")
    gt_records = _non_null_count(record_rows, "ground_truth_entity_count")
    empty_with_gt = _zero_with_positive_count(
        record_rows,
        zero_column="final_entity_count",
        positive_column="ground_truth_entity_count",
    )

    metrics: dict[str, int | float | None] = {}
    metrics["empty_detection_count"] = empty_total
    metrics["empty_detection_rate"] = _safe_ratio(empty_total, record_count)
    metrics["empty_detection_with_ground_truth_count"] = empty_with_gt
    metrics["empty_detection_with_ground_truth_rate"] = _safe_ratio(empty_with_gt, gt_records)
    metrics["ground_truth_record_count"] = gt_records
    return metrics
relaxed_gt_found = _sum_or_none(record_rows, "entity_relaxed_gt_found_count") + relaxed_detected_tp = _sum_or_none(record_rows, "entity_relaxed_detected_tp_count") + label_compatible_gt_found = _sum_or_none(record_rows, "entity_relaxed_label_compatible_gt_found_count") + label_compatible_detected_tp = _sum_or_none( + record_rows, + "entity_relaxed_label_compatible_detected_tp_count", + ) + strict_precision = _safe_ratio(true_positive, _sum_optional_numbers(true_positive, false_positive)) + strict_recall = _safe_ratio(true_positive, _sum_optional_numbers(true_positive, false_negative)) + relaxed_precision = _safe_ratio(relaxed_detected_tp, final_entity_count) + relaxed_recall = _safe_ratio(relaxed_gt_found, ground_truth_entity_count) + label_compatible_precision = _safe_ratio(label_compatible_detected_tp, final_entity_count) + label_compatible_recall = _safe_ratio(label_compatible_gt_found, ground_truth_entity_count) + return { + "ground_truth_entity_count": ground_truth_entity_count, + "entity_true_positive_count": true_positive, + "entity_false_positive_count": false_positive, + "entity_false_negative_count": false_negative, + "entity_precision": strict_precision, + "entity_recall": strict_recall, + "entity_f1": _f1(strict_precision, strict_recall), + "entity_relaxed_gt_found_count": relaxed_gt_found, + "entity_relaxed_detected_tp_count": relaxed_detected_tp, + "entity_relaxed_label_compatible_gt_found_count": label_compatible_gt_found, + "entity_relaxed_label_compatible_detected_tp_count": label_compatible_detected_tp, + "entity_relaxed_precision": relaxed_precision, + "entity_relaxed_recall": relaxed_recall, + "entity_relaxed_f1": _f1(relaxed_precision, relaxed_recall), + "entity_relaxed_label_compatible_precision": label_compatible_precision, + "entity_relaxed_label_compatible_recall": label_compatible_recall, + "entity_relaxed_label_compatible_f1": _f1(label_compatible_precision, label_compatible_recall), + } + + +def _error_status_count(rows: pd.DataFrame) -> 
int: + if "status" not in rows.columns: + return 0 + statuses = rows["status"].astype(str).str.lower() + return int(statuses.isin({"error", "failed"}).sum()) + + +def _case_evaluation_metrics(evaluation_rows: pd.DataFrame) -> dict[str, int | float | None]: + metrics: dict[str, int | float | None] = {} + for rollup in _EVALUATION_ROLLUPS: + judged_count, valid_count = _evaluation_judged_and_valid_counts(evaluation_rows, rollup.valid_column) + metrics[f"{rollup.prefix}_judged_record_count"] = judged_count + metrics[f"{rollup.prefix}_valid_record_count"] = valid_count + metrics[f"{rollup.prefix}_valid_rate"] = _safe_ratio(valid_count, judged_count) + metrics[rollup.invalid_count_column] = _sum_int_or_zero(evaluation_rows, rollup.invalid_count_column) + return metrics + + +def _evaluation_judged_and_valid_counts(evaluation_rows: pd.DataFrame, valid_column: str) -> tuple[int, int]: + if valid_column not in evaluation_rows.columns: + return 0, 0 + verdicts = [_optional_bool(value) for value in evaluation_rows[valid_column].tolist()] + judged_count = sum(verdict is not None for verdict in verdicts) + valid_count = sum(verdict is True for verdict in verdicts) + return judged_count, valid_count + + +def _optional_bool(value: object) -> bool | None: + if value is None or pd.isna(value): + return None + if isinstance(value, bool): + return value + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"true", "1", "yes"}: + return True + if normalized in {"false", "0", "no"}: + return False + return None + if isinstance(value, int | float): + return bool(value) + return None + + +def _case_artifact_metrics( + artifact_rows: pd.DataFrame, + *, + validation_max_entities_per_call: int | None, +) -> dict[str, int | float | list[str] | dict[str, str] | None]: + signature_hashes = _artifact_signature_hashes(artifact_rows) + return { + "detection_artifact_rows": len(artifact_rows), + "seed_entity_count": _sum_or_none(artifact_rows, "seed_entity_count"), 
+ "seed_validation_candidate_count": _sum_or_none(artifact_rows, "seed_validation_candidate_count"), + "estimated_seed_validation_chunk_count": _estimated_validation_chunk_count( + artifact_rows, + validation_max_entities_per_call=validation_max_entities_per_call, + ), + "augmented_entity_count": _sum_or_none(artifact_rows, "augmented_entity_count"), + "augmented_new_final_value_count": _sum_or_none(artifact_rows, "augmented_new_final_value_count"), + "artifact_final_entity_count": _sum_or_none(artifact_rows, "final_entity_count"), + "artifact_final_detector_entity_count": _sum_or_none(artifact_rows, "final_source_counts.detector"), + "artifact_final_augmenter_entity_count": _sum_or_none(artifact_rows, "final_source_counts.augmenter"), + "artifact_final_entity_signature_count": _signature_count(artifact_rows, signature_hashes=signature_hashes), + "artifact_final_entity_signature_hashes": signature_hashes, + "artifact_final_entity_signature_labels": _artifact_signature_labels(artifact_rows), + "artifact_final_entity_signature_details": _artifact_signature_details(artifact_rows), + } + + +def _rows_for_case(dataframe: pd.DataFrame, case_id: str) -> pd.DataFrame: + if dataframe.empty: + return dataframe + masks = [ + dataframe[column].astype(str) == case_id + for column in ("run_tags.case_id", "case_id", "run_id") + if column in dataframe.columns + ] + if not masks: + return dataframe.iloc[0:0] + mask = masks[0] + for next_mask in masks[1:]: + mask = mask | next_mask + return dataframe[mask] + + +def _records_of_type(dataframe: pd.DataFrame, record_type: str) -> pd.DataFrame: + if "record_type" not in dataframe.columns: + return dataframe.iloc[0:0] + return dataframe[dataframe["record_type"] == record_type] + + +def _records_of_types(dataframe: pd.DataFrame, record_types: set[str]) -> pd.DataFrame: + if "record_type" not in dataframe.columns: + return dataframe.iloc[0:0] + return dataframe[dataframe["record_type"].isin(record_types)] + + +def 
_model_workflow_rows(dataframe: pd.DataFrame) -> pd.DataFrame: + return _records_of_types(dataframe, {"ndd_workflow", "model_workflow"}) + + +def _pipeline_stage_rows(dataframe: pd.DataFrame) -> pd.DataFrame: + stages = _records_of_type(dataframe, "stage") + if "stage" not in stages.columns: + return stages.iloc[0:0] + return stages[stages["stage"] == "Anonymizer._run_internal"] + + +_MODEL_USAGE_SUFFIXES = { + ".request_usage.total_requests": "observed_total_requests", + ".request_usage.successful_requests": "observed_successful_requests", + ".request_usage.failed_requests": "observed_failed_requests", + ".token_usage.input_tokens": "observed_input_tokens", + ".token_usage.output_tokens": "observed_output_tokens", + ".token_usage.total_tokens": "observed_total_tokens", + ".token_usage.reasoning_tokens": "observed_reasoning_tokens", +} + +_MODEL_USAGE_METADATA_SUFFIXES = { + ".model_alias": "model_alias", + ".model_name": "model_name", + ".model_provider_name": "model_provider_name", +} + + +def build_model_usage_rows(measurements: pd.DataFrame) -> list[ModelUsageAnalysisRow]: + model_rows = _model_workflow_rows(measurements) + if model_rows.empty: + return [] + model_usage_keys = _model_usage_keys(model_rows.columns) + rows: list[ModelUsageAnalysisRow] = [] + for _, measurement in model_rows.iterrows(): + data = measurement.to_dict() + case_id = _string_from_row(data, ["run_tags.case_id", "run_id"]) + run_id = _string_from_row(data, ["run_id", "run_tags.case_id"]) + if case_id is None or run_id is None: + continue + for model_usage_key in model_usage_keys: + usage = _model_usage_metrics(data, model_usage_key) + if not _has_observed_model_usage(usage): + continue + metadata = _model_usage_metadata(data, model_usage_key) + rows.append( + ModelUsageAnalysisRow( + suite_id=_string_from_row(data, ["run_tags.suite_id"]), + workload_id=_string_from_row(data, ["run_tags.workload_id"]), + config_id=_string_from_row(data, ["run_tags.config_id"]), + 
experimental_detection_strategy=_string_from_row( + data, ["run_tags.experimental_detection_strategy"] + ), + experimental_replacement_strategy=_string_from_row( + data, ["run_tags.experimental_replacement_strategy"] + ), + dd_parser_compat=_string_from_row(data, ["run_tags.dd_parser_compat"]), + repetition=_int_from_row(data, ["run_tags.repetition"]), + case_id=case_id, + run_id=run_id, + workflow_name=_string_from_row(data, ["workflow_name"]), + model_alias=metadata.get("model_alias"), + model_name=metadata.get("model_name") or model_usage_key, + model_provider_name=metadata.get("model_provider_name"), + ndd_elapsed_sec=_float_from_row(data, ["elapsed_sec"]), + **usage, + ) + ) + return rows + + +def _model_usage_keys(columns: pd.Index) -> list[str]: + keys: set[str] = set() + for column in columns: + parsed = _model_usage_column_parts(str(column)) + if parsed is not None: + keys.add(parsed[0]) + return sorted(keys) + + +def _model_usage_column_parts(column: str) -> tuple[str, str] | None: + prefix = "model_usage." 
+ if not column.startswith(prefix): + return None + for suffix, metric in {**_MODEL_USAGE_SUFFIXES, **_MODEL_USAGE_METADATA_SUFFIXES}.items(): + if column.endswith(suffix): + return column[len(prefix) : -len(suffix)], metric + return None + + +def _model_usage_metrics(data: dict[str, Any], model_usage_key: str) -> dict[str, int | float | None]: + values: dict[str, int | float | None] = { + "observed_total_requests": 0, + "observed_successful_requests": 0, + "observed_failed_requests": 0, + "observed_input_tokens": 0, + "observed_output_tokens": 0, + "observed_total_tokens": 0, + "observed_reasoning_tokens": None, + "observed_failed_request_rate": None, + } + for suffix, metric in _MODEL_USAGE_SUFFIXES.items(): + value = data.get(f"model_usage.{model_usage_key}{suffix}") + if value is None or pd.isna(value): + continue + values[metric] = _coerce_int(value) + values["observed_failed_request_rate"] = _request_failure_rate( + failed=values["observed_failed_requests"], + total=values["observed_total_requests"], + ) + return values + + +def _model_usage_metadata(data: dict[str, Any], model_usage_key: str) -> dict[str, str | None]: + values: dict[str, str | None] = { + "model_alias": None, + "model_name": None, + "model_provider_name": None, + } + for suffix, field_name in _MODEL_USAGE_METADATA_SUFFIXES.items(): + value = data.get(f"model_usage.{model_usage_key}{suffix}") + if value is None or pd.isna(value): + continue + values[field_name] = str(value) + return values + + +def _has_observed_model_usage(usage: dict[str, int | float | None]) -> bool: + return any(value not in (None, 0) for value in usage.values()) + + +def _string_from_row(data: dict[str, Any], columns: list[str]) -> str | None: + for column in columns: + value = data.get(column) + if value is not None and not pd.isna(value): + return str(value) + return None + + +def _int_from_row(data: dict[str, Any], columns: list[str]) -> int | None: + value = _string_from_row(data, columns) + return int(float(value)) 
if value is not None else None + + +def _float_from_row(data: dict[str, Any], columns: list[str]) -> float | None: + value = _string_from_row(data, columns) + return float(value) if value is not None else None + + +def _coerce_int(value: Any) -> int: + return int(float(value)) + + +def _first_value(frames: list[pd.DataFrame], columns: list[str]) -> str | None: + for frame in frames: + for column in columns: + if column not in frame.columns: + continue + values = frame[column].dropna() + if not values.empty: + return str(values.iloc[0]) + return None + + +def _first_int(frames: list[pd.DataFrame], columns: list[str]) -> int | None: + value = _first_value(frames, columns) + return int(float(value)) if value is not None else None + + +def _first_float(frames: list[pd.DataFrame], columns: list[str]) -> float | None: + value = _first_value(frames, columns) + return float(value) if value is not None else None + + +def _coalesce_number(*values: float | None) -> float | None: + for value in values: + if value is not None: + return value + return None + + +def _artifact_signature_hashes(artifact_rows: pd.DataFrame) -> list[str]: + if "final_entity_signature_hashes" not in artifact_rows.columns: + return [] + values: set[str] = set() + for raw in artifact_rows["final_entity_signature_hashes"].dropna(): + values.update(_coerce_string_list(raw)) + return sorted(values) + + +def _artifact_signature_labels(artifact_rows: pd.DataFrame) -> dict[str, str]: + labels: dict[str, str] = {} + if "final_entity_signature_labels" in artifact_rows.columns: + for raw in artifact_rows["final_entity_signature_labels"].dropna(): + labels.update(_coerce_string_dict(raw)) + for column in artifact_rows.columns: + prefix = "final_entity_signature_labels." 
+ if not column.startswith(prefix): + continue + signature_hash = column.removeprefix(prefix) + for value in artifact_rows[column].dropna(): + labels[signature_hash] = str(value) + return dict(sorted(labels.items())) + + +def _artifact_signature_details(artifact_rows: pd.DataFrame) -> dict[str, dict[str, Any]]: + details: dict[str, dict[str, Any]] = {} + if "final_entity_signature_details" in artifact_rows.columns: + for raw in artifact_rows["final_entity_signature_details"].dropna(): + details.update(_coerce_detail_map(raw)) + prefix = "final_entity_signature_details." + for column in artifact_rows.columns: + if not column.startswith(prefix): + continue + remainder = column.removeprefix(prefix) + signature_hash, _, field = remainder.partition(".") + if not signature_hash or not field: + continue + if field not in _SIGNATURE_DETAIL_FIELDS: + continue + for value in artifact_rows[column].dropna(): + details.setdefault(signature_hash, {})[field] = _json_scalar(value) + return dict(sorted(details.items())) + + +def _coerce_detail_map(raw: object) -> dict[str, dict[str, Any]]: + if isinstance(raw, str): + try: + raw = json.loads(raw) + except json.JSONDecodeError: + return {} + if not isinstance(raw, dict): + return {} + details: dict[str, dict[str, Any]] = {} + for signature_hash, value in raw.items(): + if isinstance(value, dict): + details[str(signature_hash)] = { + str(key): _json_scalar(item) for key, item in value.items() if str(key) in _SIGNATURE_DETAIL_FIELDS + } + return details + + +def _json_scalar(value: object) -> Any: + if hasattr(value, "item"): + try: + return value.item() + except ValueError: + return value + return value + + +def _coerce_string_list(raw: object) -> list[str]: + if isinstance(raw, list): + return [str(item) for item in raw] + return [] + + +def _coerce_string_dict(raw: object) -> dict[str, str]: + if isinstance(raw, dict): + return {str(key): str(value) for key, value in raw.items()} + return {} + + +def _signature_count(artifact_rows: 
pd.DataFrame, *, signature_hashes: list[str]) -> float | None: + if signature_hashes: + return float(len(signature_hashes)) + return _sum_or_none(artifact_rows, "final_entity_signature_count") + + +def _positive_count(dataframe: pd.DataFrame, column: str) -> int: + if column not in dataframe.columns: + return 0 + values = pd.to_numeric(dataframe[column], errors="coerce").fillna(0) + return int((values > 0).sum()) + + +def _zero_count(dataframe: pd.DataFrame, column: str) -> int: + if column not in dataframe.columns: + return 0 + values = pd.to_numeric(dataframe[column], errors="coerce").dropna() + return int((values == 0).sum()) + + +def _non_null_count(dataframe: pd.DataFrame, column: str) -> int: + if column not in dataframe.columns: + return 0 + return int(pd.to_numeric(dataframe[column], errors="coerce").notna().sum()) + + +def _zero_with_positive_count(dataframe: pd.DataFrame, *, zero_column: str, positive_column: str) -> int: + if zero_column not in dataframe.columns or positive_column not in dataframe.columns: + return 0 + zero_values = pd.to_numeric(dataframe[zero_column], errors="coerce") + positive_values = pd.to_numeric(dataframe[positive_column], errors="coerce") + return int(((zero_values == 0) & (positive_values > 0)).sum()) + + +def _sum_prefixed_ints(dataframe: pd.DataFrame, prefix: str) -> dict[str, int]: + totals: dict[str, int] = {} + base_column = prefix.removesuffix(".") + if base_column in dataframe.columns: + for value in dataframe[base_column].dropna().tolist(): + for key, count in _coerce_count_mapping(value).items(): + totals[key] = totals.get(key, 0) + count + for column in sorted(col for col in dataframe.columns if col.startswith(prefix)): + value = _sum_int_or_zero(dataframe, column) + if value: + totals[column.removeprefix(prefix)] = value + return totals + + +def _coerce_count_mapping(value: object) -> dict[str, int]: + payload = value + if isinstance(value, str): + try: + payload = json.loads(value) + except json.JSONDecodeError: + 
return {} + if not isinstance(payload, dict): + return {} + counts: dict[str, int] = {} + for key, count in payload.items(): + numeric = pd.to_numeric(pd.Series([count]), errors="coerce").dropna() + if not numeric.empty and numeric.iloc[0]: + counts[str(key)] = int(numeric.iloc[0]) + return counts + + +def build_group_rows(cases: list[CaseAnalysisRow]) -> list[GroupAnalysisRow]: + if not cases: + return [] + table = pd.DataFrame([case.model_dump() for case in cases]) + rows: list[GroupAnalysisRow] = [] + group_columns = [ + "workload_id", + "workload_category", + "config_id", + "experimental_detection_strategy", + "experimental_replacement_strategy", + "entity_label_set_id", + "entity_label_count", + "gliner_threshold", + ] + for keys, group in table.groupby(group_columns, dropna=False): + rows.append(_build_group_row(keys, group)) + return rows + + +def build_model_usage_group_rows(model_usage: list[ModelUsageAnalysisRow]) -> list[ModelUsageGroupAnalysisRow]: + if not model_usage: + return [] + table = pd.DataFrame([row.model_dump() for row in model_usage]) + rows: list[ModelUsageGroupAnalysisRow] = [] + group_columns = [ + "workload_id", + "config_id", + "experimental_detection_strategy", + "experimental_replacement_strategy", + "dd_parser_compat", + "workflow_name", + "model_alias", + "model_name", + "model_provider_name", + ] + for keys, group in table.groupby(group_columns, dropna=False): + rows.append(_build_model_usage_group_row(keys, group)) + return rows + + +def _build_model_usage_group_row(keys: tuple[Any, ...], group: pd.DataFrame) -> ModelUsageGroupAnalysisRow: + ( + workload_id, + config_id, + detection_strategy, + replacement_strategy, + dd_parser_compat, + workflow_name, + model_alias, + model_name, + provider_name, + ) = keys + reasoning_sum = _sum_int_or_none(group, "observed_reasoning_tokens") + total_requests = _sum_int_or_zero(group, "observed_total_requests") + failed_requests = _sum_int_or_zero(group, "observed_failed_requests") + return 
ModelUsageGroupAnalysisRow( + workload_id=_none_if_nan(workload_id), + config_id=_none_if_nan(config_id), + experimental_detection_strategy=_none_if_nan(detection_strategy), + experimental_replacement_strategy=_none_if_nan(replacement_strategy), + dd_parser_compat=_none_if_nan(dd_parser_compat), + workflow_name=_none_if_nan(workflow_name), + model_alias=_none_if_nan(model_alias), + model_name=str(model_name), + model_provider_name=_none_if_nan(provider_name), + case_count=int(group["case_id"].nunique()), + workflow_count=len(group), + sum_observed_total_requests=total_requests, + sum_observed_successful_requests=_sum_int_or_zero(group, "observed_successful_requests"), + sum_observed_failed_requests=failed_requests, + sum_observed_input_tokens=_sum_int_or_zero(group, "observed_input_tokens"), + sum_observed_output_tokens=_sum_int_or_zero(group, "observed_output_tokens"), + sum_observed_total_tokens=_sum_int_or_zero(group, "observed_total_tokens"), + sum_observed_reasoning_tokens=reasoning_sum, + observed_failed_request_rate=_request_failure_rate(failed=failed_requests, total=total_requests), + median_observed_total_requests=_median_or_none(group, "observed_total_requests"), + median_observed_failed_requests=_median_or_none(group, "observed_failed_requests"), + median_observed_total_tokens=_median_or_none(group, "observed_total_tokens"), + ) + + +def _build_group_row(keys: tuple[Any, ...], group: pd.DataFrame) -> GroupAnalysisRow: + ( + workload_id, + workload_category, + config_id, + detection_strategy, + replacement_strategy, + entity_label_set_id, + entity_label_count, + gliner_threshold, + ) = keys + case_count = int(group["case_id"].nunique()) + failed_case_count = _sum_bool_or_zero(group, "case_failed") + total_record_count = _sum_int_or_zero(group, "record_count") + total_input_text_tokens = _sum_int_or_none(group, "input_text_tokens_total") + total_empty_detection_count = _sum_int_or_zero(group, "empty_detection_count") + total_ground_truth_record_count = 
_sum_int_or_zero(group, "ground_truth_record_count") + total_empty_detection_with_gt_count = _sum_int_or_zero(group, "empty_detection_with_ground_truth_count") + final_entity_count = _sum_or_none(group, "final_entity_count") + ground_truth_entity_count = _sum_or_none(group, "ground_truth_entity_count") + true_positive = _sum_or_none(group, "entity_true_positive_count") + false_positive = _sum_or_none(group, "entity_false_positive_count") + false_negative = _sum_or_none(group, "entity_false_negative_count") + strict_precision = _safe_ratio(true_positive, _sum_optional_numbers(true_positive, false_positive)) + strict_recall = _safe_ratio(true_positive, _sum_optional_numbers(true_positive, false_negative)) + relaxed_gt_found = _sum_or_none(group, "entity_relaxed_gt_found_count") + relaxed_detected_tp = _sum_or_none(group, "entity_relaxed_detected_tp_count") + label_compatible_gt_found = _sum_or_none(group, "entity_relaxed_label_compatible_gt_found_count") + label_compatible_detected_tp = _sum_or_none(group, "entity_relaxed_label_compatible_detected_tp_count") + relaxed_precision = _safe_ratio(relaxed_detected_tp, final_entity_count) + relaxed_recall = _safe_ratio(relaxed_gt_found, ground_truth_entity_count) + label_compatible_precision = _safe_ratio(label_compatible_detected_tp, final_entity_count) + label_compatible_recall = _safe_ratio(label_compatible_gt_found, ground_truth_entity_count) + evaluation_metrics = _group_evaluation_metrics(group) + return GroupAnalysisRow( + workload_id=_none_if_nan(workload_id), + workload_category=_none_if_nan(workload_category), + config_id=_none_if_nan(config_id), + experimental_detection_strategy=_none_if_nan(detection_strategy), + experimental_replacement_strategy=_none_if_nan(replacement_strategy), + entity_label_set_id=_none_if_nan(entity_label_set_id), + entity_label_count=_int_if_not_nan(entity_label_count), + gliner_threshold=_float_if_not_nan(gliner_threshold), + case_count=case_count, + failed_case_count=failed_case_count, 
+ failed_case_rate=_request_failure_rate(failed=failed_case_count, total=case_count), + error_stage_count=_sum_int_or_zero(group, "error_stage_count"), + error_ndd_workflow_count=_sum_int_or_zero(group, "error_ndd_workflow_count"), + error_model_workflow_count=_sum_int_or_zero(group, "error_model_workflow_count"), + median_pipeline_elapsed_sec=_median_or_none(group, "pipeline_elapsed_sec"), + median_ndd_elapsed_sec_total=_median_or_none(group, "ndd_elapsed_sec_total"), + median_observed_total_requests=_median_or_none(group, "observed_total_requests"), + median_observed_successful_requests=_median_or_none(group, "observed_successful_requests"), + median_observed_input_tokens=_median_or_none(group, "observed_input_tokens"), + median_observed_output_tokens=_median_or_none(group, "observed_output_tokens"), + median_observed_total_tokens=_median_or_none(group, "observed_total_tokens"), + median_observed_failed_requests=_median_or_none(group, "observed_failed_requests"), + median_observed_failed_request_rate=_median_or_none(group, "observed_failed_request_rate"), + median_observed_bridge_fallback_requests=_median_or_none(group, "observed_bridge_fallback_requests"), + median_observed_non_bridge_total_requests=_median_or_none(group, "observed_non_bridge_total_requests"), + median_observed_non_bridge_failed_requests=_median_or_none(group, "observed_non_bridge_failed_requests"), + median_observed_non_bridge_failed_request_rate=_median_or_none( + group, + "observed_non_bridge_failed_request_rate", + ), + total_record_count=total_record_count, + median_record_count=_median_or_none(group, "record_count"), + total_input_text_tokens=total_input_text_tokens, + median_input_text_tokens_total=_median_or_none(group, "input_text_tokens_total"), + median_records_per_pipeline_sec=_median_or_none(group, "records_per_pipeline_sec"), + median_records_per_ndd_sec=_median_or_none(group, "records_per_ndd_sec"), + median_input_text_tokens_per_pipeline_sec=_median_or_none(group, 
"input_text_tokens_per_pipeline_sec"), + median_input_text_tokens_per_ndd_sec=_median_or_none(group, "input_text_tokens_per_ndd_sec"), + median_topology_endpoint_count=_median_or_none(group, "topology_endpoint_count"), + median_topology_gpu_count=_median_or_none(group, "topology_gpu_count"), + median_topology_tensor_parallelism=_median_or_none(group, "topology_tensor_parallelism"), + median_topology_shard_count=_median_or_none(group, "topology_shard_count"), + median_input_text_tokens_per_endpoint_sec=_median_or_none(group, "input_text_tokens_per_endpoint_sec"), + median_input_text_tokens_per_gpu_sec=_median_or_none(group, "input_text_tokens_per_gpu_sec"), + median_final_entity_count=_median_or_none(group, "final_entity_count"), + total_empty_detection_count=total_empty_detection_count, + empty_detection_rate=_safe_ratio(total_empty_detection_count, total_record_count), + total_empty_detection_with_ground_truth_count=total_empty_detection_with_gt_count, + empty_detection_with_ground_truth_rate=_safe_ratio( + total_empty_detection_with_gt_count, + total_ground_truth_record_count, + ), + total_ground_truth_record_count=total_ground_truth_record_count, + sum_ground_truth_entity_count=ground_truth_entity_count, + sum_entity_true_positive_count=true_positive, + sum_entity_false_positive_count=false_positive, + sum_entity_false_negative_count=false_negative, + micro_entity_precision=strict_precision, + micro_entity_recall=strict_recall, + micro_entity_f1=_f1(strict_precision, strict_recall), + sum_entity_relaxed_gt_found_count=relaxed_gt_found, + sum_entity_relaxed_detected_tp_count=relaxed_detected_tp, + sum_entity_relaxed_label_compatible_gt_found_count=label_compatible_gt_found, + sum_entity_relaxed_label_compatible_detected_tp_count=label_compatible_detected_tp, + micro_entity_relaxed_precision=relaxed_precision, + micro_entity_relaxed_recall=relaxed_recall, + micro_entity_relaxed_f1=_f1(relaxed_precision, relaxed_recall), + 
micro_entity_relaxed_label_compatible_precision=label_compatible_precision, + micro_entity_relaxed_label_compatible_recall=label_compatible_recall, + micro_entity_relaxed_label_compatible_f1=_f1(label_compatible_precision, label_compatible_recall), + median_entity_relaxed_f1=_median_or_none(group, "entity_relaxed_f1"), + median_entity_relaxed_label_compatible_f1=_median_or_none( + group, + "entity_relaxed_label_compatible_f1", + ), + median_replacement_missing_final_entity_count=_median_or_none( + group, + "replacement_missing_final_entity_count", + ), + median_replacement_missing_final_value_count=_median_or_none(group, "replacement_missing_final_value_count"), + replacement_missing_final_entity_label_counts=_sum_prefixed_ints( + group, + "replacement_missing_final_entity_label_counts.", + ), + median_replacement_synthetic_original_collision_count=_median_or_none( + group, + "replacement_synthetic_original_collision_count", + ), + median_replacement_synthetic_original_collision_value_count=_median_or_none( + group, + "replacement_synthetic_original_collision_value_count", + ), + replacement_synthetic_original_collision_label_counts=_sum_prefixed_ints( + group, + "replacement_synthetic_original_collision_label_counts.", + ), + sum_original_value_leak_count=_sum_or_none(group, "original_value_leak_count"), + leaking_case_count=_positive_count(group, "original_value_leak_count"), + median_original_value_leak_count=_median_or_none(group, "original_value_leak_count"), + **evaluation_metrics, + median_seed_entity_count=_median_or_none(group, "seed_entity_count"), + median_seed_validation_candidate_count=_median_or_none(group, "seed_validation_candidate_count"), + median_estimated_seed_validation_chunk_count=_median_or_none(group, "estimated_seed_validation_chunk_count"), + median_augmented_entity_count=_median_or_none(group, "augmented_entity_count"), + median_augmented_new_final_value_count=_median_or_none(group, "augmented_new_final_value_count"), + 
median_artifact_final_entity_count=_median_or_none(group, "artifact_final_entity_count"), + median_artifact_final_detector_entity_count=_median_or_none(group, "artifact_final_detector_entity_count"), + median_artifact_final_augmenter_entity_count=_median_or_none(group, "artifact_final_augmenter_entity_count"), + median_artifact_final_entity_signature_count=_median_or_none(group, "artifact_final_entity_signature_count"), + ) + + +def _int_if_not_nan(value: object) -> int | None: + if pd.isna(value): + return None + return int(float(cast(Any, value))) + + +def _float_if_not_nan(value: object) -> float | None: + if pd.isna(value): + return None + return float(cast(Any, value)) + + +def _sum_bool_or_zero(dataframe: pd.DataFrame, column: str) -> int: + if column not in dataframe.columns: + return 0 + return int(dataframe[column].fillna(False).astype(bool).sum()) + + +def _group_evaluation_metrics(group: pd.DataFrame) -> dict[str, int | float | None]: + metrics: dict[str, int | float | None] = {} + for rollup in _EVALUATION_ROLLUPS: + judged_count = _sum_int_or_zero(group, f"{rollup.prefix}_judged_record_count") + valid_count = _sum_int_or_zero(group, f"{rollup.prefix}_valid_record_count") + metrics[f"sum_{rollup.prefix}_judged_record_count"] = judged_count + metrics[f"sum_{rollup.prefix}_valid_record_count"] = valid_count + metrics[f"micro_{rollup.prefix}_valid_rate"] = _safe_ratio(valid_count, judged_count) + metrics[f"sum_{rollup.invalid_count_column}"] = _sum_int_or_zero(group, rollup.invalid_count_column) + return metrics + + +def _sum_int_or_none(dataframe: pd.DataFrame, column: str) -> int | None: + value = _sum_or_none(dataframe, column) + return int(value) if value is not None else None + + +def _request_failure_rate(*, failed: object, total: object) -> float | None: + failed_value = _optional_number(failed) + total_value = _optional_number(total) + if failed_value is None or total_value is None or total_value <= 0: + return None + return failed_value / 
total_value + + +def _safe_rate(numerator: object, elapsed_sec: object) -> float | None: + numerator_value = _optional_number(numerator) + elapsed_value = _optional_number(elapsed_sec) + if numerator_value is None or elapsed_value is None or elapsed_value <= 0: + return None + return numerator_value / elapsed_value + + +def _safe_ratio(numerator: object, denominator: object) -> float | None: + numerator_value = _optional_number(numerator) + denominator_value = _optional_number(denominator) + if numerator_value is None or denominator_value is None or denominator_value <= 0: + return None + return numerator_value / denominator_value + + +def _sum_optional_numbers(*values: object) -> float | None: + numeric_values = [_optional_number(value) for value in values] + if any(value is None for value in numeric_values): + return None + return sum(cast(float, value) for value in numeric_values) + + +def _f1(precision: float | None, recall: float | None) -> float | None: + if precision is None or recall is None or precision + recall == 0: + return None + return 2 * precision * recall / (precision + recall) + + +def _optional_number(value: object) -> float | None: + if value is None or pd.isna(value): + return None + return float(value) + + +def _estimated_validation_chunk_count( + artifact_rows: pd.DataFrame, + *, + validation_max_entities_per_call: int | None, +) -> float | None: + if validation_max_entities_per_call is None or validation_max_entities_per_call <= 0: + return None + if "seed_validation_candidate_count" not in artifact_rows.columns: + return None + counts = pd.to_numeric(artifact_rows["seed_validation_candidate_count"], errors="coerce").dropna() + if counts.empty: + return None + return float(sum(math.ceil(count / validation_max_entities_per_call) for count in counts if count > 0)) + + +def write_analysis_tables( + result: BenchmarkOutputAnalysis, + output_dir: Path, + export_format: ExportFormat, +) -> AnalysisExportResult: + return 
def render_result(result: BenchmarkOutputAnalysis, *, json_output: bool) -> str:
    """Render the analysis as indented JSON or as a plain-text summary.

    The text form is one header line followed by one ``- <label>: key=value, ...``
    line per group, where the label joins workload, config and the two
    experimental strategies with ``/``.
    """
    if json_output:
        return result.model_dump_json(indent=2)
    header = (
        f"Analyzed {result.case_count} case(s) across {result.group_count} group(s); "
        f"model rows={result.model_usage_count}"
    )
    rendered = [header]
    for group in result.groups:
        label = "/".join(
            f"{part}"
            for part in (
                group.workload_id,
                group.config_id,
                group.experimental_detection_strategy,
                group.experimental_replacement_strategy,
            )
        )
        fields = (
            ("cases", group.case_count),
            ("median_entities", group.median_final_entity_count),
            ("failed_cases", f"{group.failed_case_count}/{group.case_count}"),
            ("median_requests", group.median_observed_total_requests),
            ("median_tokens", group.median_observed_total_tokens),
            ("median_input_tok_s", group.median_input_text_tokens_per_pipeline_sec),
            ("micro_relaxed_f1", group.micro_entity_relaxed_f1),
            ("empty_with_gt", group.total_empty_detection_with_ground_truth_count),
            ("median_failed_request_rate", group.median_observed_failed_request_rate),
            ("median_aug_new_final", group.median_augmented_new_final_value_count),
        )
        summary = ", ".join(f"{name}={value}" for name, value in fields)
        rendered.append(f"- {label}: {summary}")
    return "\n".join(rendered)
LogFormat.plain, +) -> None: + configure_logging(log_format) + try: + result = analyze_benchmark_output(benchmark_dir, detection_artifacts=detection_artifacts) + if output is not None: + write_analysis_tables(result, output, format) + except ValueError as exc: + log_bad_input(logger, str(exc)) + raise SystemExit(125) from exc + sys.stdout.write(render_result(result, json_output=json_output) + "\n") + + +if __name__ == "__main__": + app() diff --git a/tools/measurement/analyze_detection_artifacts.py b/tools/measurement/analyze_detection_artifacts.py new file mode 100644 index 00000000..2d340c55 --- /dev/null +++ b/tools/measurement/analyze_detection_artifacts.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Analyze detection artifacts for augmentation contribution and label-shape risks. + +Usage: + uv run python tools/measurement/analyze_detection_artifacts.py benchmark/artifacts + uv run python tools/measurement/analyze_detection_artifacts.py benchmark/artifacts --output detection.jsonl + uv run python tools/measurement/analyze_detection_artifacts.py benchmark/artifacts --json +""" + +from __future__ import annotations + +import ast +import hashlib +import json +import logging +import math +import re +import sys +from collections import Counter +from pathlib import Path +from typing import Annotated, Iterable + +import cyclopts +import pandas as pd +from measurement_tools.cli import LogFormat, configure_logging, log_bad_input +from measurement_tools.tables import ExportFormat +from pydantic import BaseModel, Field + +from anonymizer.engine.constants import ( + COL_AUGMENTED_ENTITIES, + COL_DETECTED_ENTITIES, + COL_SEED_ENTITIES_JSON, + COL_SEED_VALIDATION_CANDIDATES, + COL_VALIDATION_CANDIDATES, +) +from anonymizer.engine.schemas import ( + AugmentedEntitiesSchema, + EntitiesSchema, + EntitySchema, + 
ValidationCandidatesSchema, +) + +app = cyclopts.App(help=__doc__) +logger = logging.getLogger("measurement.detection_artifacts") + +API_KEY_PREFIX_RE = re.compile(r"^(sk-|sk_|sk-ant-|sk-proj-|ghp_|pat-|hf_|xox[a-z]-|ya29\.|aiza|akia|bearer\s+)", re.I) + + +class DetectionArtifactRow(BaseModel): + workflow_name: str + batch_file: str + row_index: int + seed_entity_count: int + seed_validation_candidate_count: int + merged_validation_candidate_count: int + augmented_entity_count: int + final_entity_count: int + augmented_duplicate_seed_value_count: int + augmented_new_value_count: int + augmented_new_final_value_count: int + weak_api_key_shape_count: int + final_entity_signature_count: int + final_entity_signature_hashes: list[str] = Field(default_factory=list) + final_entity_signature_labels: dict[str, str] = Field(default_factory=dict) + final_entity_signature_details: dict[str, dict[str, object]] = Field(default_factory=dict) + weak_api_key_shape_label_counts: dict[str, int] = Field(default_factory=dict) + final_label_counts: dict[str, int] = Field(default_factory=dict) + final_source_counts: dict[str, int] = Field(default_factory=dict) + + +class DetectionArtifactAnalysis(BaseModel): + artifact_path: str + rows: list[DetectionArtifactRow] = Field(default_factory=list) + + +def analyze_artifacts( + artifact_path: Path, + *, + parquet_files: Iterable[Path] | None = None, +) -> DetectionArtifactAnalysis: + if not artifact_path.exists() or not artifact_path.is_dir(): + raise ValueError(f"artifact path is not a directory: {artifact_path}") + rows: list[DetectionArtifactRow] = [] + for parquet_file in parquet_files if parquet_files is not None else iter_detection_parquet_files(artifact_path): + rows.extend(_analyze_parquet_file(parquet_file, artifact_root=artifact_path)) + return DetectionArtifactAnalysis(artifact_path=str(artifact_path), rows=rows) + + +def iter_detection_parquet_files(artifact_path: Path) -> list[Path]: + files: list[Path] = [] + for workflow_dir in 
sorted(path for path in artifact_path.iterdir() if path.is_dir()): + if not workflow_dir.name.startswith("entity-detection"): + continue + files.extend(sorted((workflow_dir / "parquet-files").glob("*.parquet"))) + return files + + +def _analyze_parquet_file(parquet_file: Path, *, artifact_root: Path) -> list[DetectionArtifactRow]: + dataframe = pd.read_parquet(parquet_file) + workflow_name = parquet_file.parents[1].name + batch_file = str(parquet_file.relative_to(artifact_root)) + return [ + _analyze_dataframe_row(row, workflow_name=workflow_name, batch_file=batch_file, row_index=row_index) + for row_index, row in dataframe.iterrows() + ] + + +def _analyze_dataframe_row( + row: pd.Series, + *, + workflow_name: str, + batch_file: str, + row_index: int, +) -> DetectionArtifactRow: + seed_entities = _parse_entities(row.get(COL_SEED_ENTITIES_JSON)) + augmented_entities = _parse_augmented_entities(row.get(COL_AUGMENTED_ENTITIES)) + final_entities = _parse_entities(row.get(COL_DETECTED_ENTITIES)) + return build_detection_artifact_row_from_entities( + workflow_name=workflow_name, + batch_file=batch_file, + row_index=row_index, + seed_entities=seed_entities, + seed_validation_candidate_count=_parse_validation_candidate_count(row.get(COL_SEED_VALIDATION_CANDIDATES)), + merged_validation_candidate_count=_parse_validation_candidate_count(row.get(COL_VALIDATION_CANDIDATES)), + augmented_entities=augmented_entities, + final_entities=final_entities, + ) + + +def build_detection_artifact_row_from_entities( + *, + workflow_name: str, + batch_file: str, + row_index: int, + seed_entities: list[EntitySchema], + seed_validation_candidate_count: int, + merged_validation_candidate_count: int, + augmented_entities: list[EntitySchema], + final_entities: list[EntitySchema], +) -> DetectionArtifactRow: + seed_values = {_value_key(entity.value) for entity in seed_entities} + final_values = {_value_key(entity.value) for entity in final_entities} + augmented_new = [entity for entity in 
augmented_entities if _value_key(entity.value) not in seed_values] + weak_counts = _weak_api_key_shape_counts(final_entities) + final_entity_signatures = _entity_signature_hashes(final_entities, row_index=int(row_index)) + final_entity_signature_labels = _entity_signature_labels(final_entities, row_index=int(row_index)) + final_entity_signature_details = _entity_signature_details(final_entities, row_index=int(row_index)) + return DetectionArtifactRow( + workflow_name=workflow_name, + batch_file=batch_file, + row_index=int(row_index), + seed_entity_count=len(seed_entities), + seed_validation_candidate_count=seed_validation_candidate_count, + merged_validation_candidate_count=merged_validation_candidate_count, + augmented_entity_count=len(augmented_entities), + final_entity_count=len(final_entities), + augmented_duplicate_seed_value_count=len(augmented_entities) - len(augmented_new), + augmented_new_value_count=len(augmented_new), + augmented_new_final_value_count=sum(1 for entity in augmented_new if _value_key(entity.value) in final_values), + weak_api_key_shape_count=sum(weak_counts.values()), + final_entity_signature_count=len(final_entity_signatures), + final_entity_signature_hashes=final_entity_signatures, + final_entity_signature_labels=final_entity_signature_labels, + final_entity_signature_details=final_entity_signature_details, + weak_api_key_shape_label_counts=dict(weak_counts), + final_label_counts=_count_by(final_entities, "label"), + final_source_counts=_count_by(final_entities, "source"), + ) + + +def _parse_entities(raw: object) -> list[EntitySchema]: + values = _extract_payload_list(raw, key="entities") + parsed = EntitiesSchema.model_validate({"entities": values}) + return [entity for entity in parsed.entities if entity.value and entity.label] + + +def _parse_augmented_entities(raw: object) -> list[EntitySchema]: + values = _extract_payload_list(raw, key="entities") + parsed = AugmentedEntitiesSchema.model_validate({"entities": values}) + return [ + 
EntitySchema(value=entity.value, label=entity.label, source="augmenter") + for entity in parsed.entities + if entity.value and entity.label + ] + + +def _parse_validation_candidate_count(raw: object) -> int: + values = _extract_payload_list(raw, key="candidates") + parsed = ValidationCandidatesSchema.model_validate({"candidates": values}) + return len(parsed.candidates) + + +def _extract_payload_list(raw: object, *, key: str) -> list[object]: + payload = _coerce_payload(raw) + if isinstance(payload, dict): + return _coerce_list(payload.get(key)) + return _coerce_list(payload) + + +def _coerce_payload(raw: object) -> object: + if _is_missing(raw): + return {} + if hasattr(raw, "tolist"): + raw = raw.tolist() + if not isinstance(raw, str): + return raw + text = raw.strip() + if not text: + return {} + try: + return json.loads(text) + except json.JSONDecodeError: + try: + return ast.literal_eval(text) + except (SyntaxError, ValueError): + return {} + + +def _coerce_list(value: object) -> list[object]: + value = _coerce_payload(value) + if isinstance(value, list): + return value + return [] + + +def _is_missing(value: object) -> bool: + return value is None or (isinstance(value, float) and math.isnan(value)) + + +def _value_key(value: str) -> str: + return " ".join(value.casefold().split()) + + +def _entity_signature_hashes(entities: list[EntitySchema], *, row_index: int) -> list[str]: + signatures = {_entity_signature_hash(entity, row_index=row_index) for entity in entities} + return sorted(signatures) + + +def _entity_signature_labels(entities: list[EntitySchema], *, row_index: int) -> dict[str, str]: + labels = {_entity_signature_hash(entity, row_index=row_index): entity.label for entity in entities} + return dict(sorted(labels.items())) + + +def _entity_signature_details(entities: list[EntitySchema], *, row_index: int) -> dict[str, dict[str, object]]: + details = { + _entity_signature_hash(entity, row_index=row_index): { + "label": entity.label, + "source": 
entity.source, + "row_index": int(row_index), + "start_position": entity.start_position, + "end_position": entity.end_position, + "value_length": len(entity.value), + } + for entity in entities + } + return dict(sorted(details.items())) + + +def _entity_signature_hash(entity: EntitySchema, *, row_index: int) -> str: + payload = json.dumps( + { + "row": row_index, + "label": entity.label, + "start": entity.start_position, + "end": entity.end_position, + }, + ensure_ascii=True, + sort_keys=True, + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16] + + +def _weak_api_key_shape_counts(entities: list[EntitySchema]) -> Counter[str]: + counts: Counter[str] = Counter() + for entity in entities: + if entity.label == "api_key" and not _looks_like_api_key(entity.value): + counts[entity.label] += 1 + return counts + + +def _looks_like_api_key(value: str) -> bool: + stripped = value.strip() + if API_KEY_PREFIX_RE.search(stripped): + return True + compact = re.sub(r"[\s'\";:,/]+", "", stripped) + if len(compact) < 20: + return False + return bool(re.search(r"[A-Za-z]", compact)) and bool(re.search(r"\d", compact)) + + +def _count_by(entities: list[EntitySchema], field: str) -> dict[str, int]: + counts = Counter(str(getattr(entity, field)) for entity in entities if getattr(entity, field)) + return dict(sorted(counts.items())) + + +def write_rows(rows: list[DetectionArtifactRow], output_path: Path, export_format: ExportFormat) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + table = pd.json_normalize([row.model_dump() for row in rows], sep=".") + if export_format == ExportFormat.parquet: + table.to_parquet(output_path, index=False) + elif export_format == ExportFormat.csv: + table.to_csv(output_path, index=False) + else: + table.to_json(output_path, orient="records", lines=True) + + +def render_result(result: DetectionArtifactAnalysis, *, json_output: bool) -> str: + if json_output: + return result.model_dump_json(indent=2) + total_warnings = 
sum(row.weak_api_key_shape_count for row in result.rows) + workflows = Counter(row.workflow_name for row in result.rows) + lines = [f"Analyzed {len(result.rows)} detection artifact row(s) from {result.artifact_path}"] + for workflow_name, count in sorted(workflows.items()): + lines.append(f"- {workflow_name}: {count} row(s)") + lines.append(f"Weak api_key shape warnings: {total_warnings}") + return "\n".join(lines) + + +@app.default +def main( + artifact_path: Path, + *, + output: Annotated[Path | None, cyclopts.Parameter(("--output", "-o"))] = None, + format: Annotated[ExportFormat, cyclopts.Parameter("--format")] = ExportFormat.jsonl, + json_output: Annotated[bool, cyclopts.Parameter("--json")] = False, + log_format: Annotated[LogFormat, cyclopts.Parameter("--log-format")] = LogFormat.plain, +) -> None: + configure_logging(log_format) + try: + result = analyze_artifacts(artifact_path) + if output is not None: + write_rows(result.rows, output, format) + except ValueError as exc: + log_bad_input(logger, str(exc)) + raise SystemExit(125) from exc + sys.stdout.write(render_result(result, json_output=json_output) + "\n") + + +if __name__ == "__main__": + app() diff --git a/tools/measurement/examples/repo-data-smoke-models.yaml b/tools/measurement/examples/repo-data-smoke-models.yaml new file mode 100644 index 00000000..5559587a --- /dev/null +++ b/tools/measurement/examples/repo-data-smoke-models.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +model_configs: + - alias: gliner-pii-detector + model: nvidia/gliner-pii + provider: nvidia + inference_parameters: + max_parallel_requests: 1 + timeout: 120 + + - alias: gpt-oss-120b + model: openai/gpt-oss-120b + provider: nvidia + inference_parameters: + max_parallel_requests: 16 + max_tokens: 16384 + temperature: 0.3 + top_p: 0.95 + timeout: 300 + + - alias: nemotron-30b-thinking + model: nvidia/nemotron-3-nano-30b-a3b + provider: nvidia + inference_parameters: + max_parallel_requests: 16 + max_tokens: 8192 + temperature: 0.4 + top_p: 1.0 + timeout: 300 diff --git a/tools/measurement/examples/repo-data-smoke-providers.yaml b/tools/measurement/examples/repo-data-smoke-providers.yaml new file mode 100644 index 00000000..8799886f --- /dev/null +++ b/tools/measurement/examples/repo-data-smoke-providers.yaml @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +providers: + - name: nvidia + endpoint: https://integrate.api.nvidia.com/v1 + provider_type: openai + api_key: NVIDIA_API_KEY diff --git a/tools/measurement/examples/repo-data-smoke.yaml b/tools/measurement/examples/repo-data-smoke.yaml new file mode 100644 index 00000000..7a381091 --- /dev/null +++ b/tools/measurement/examples/repo-data-smoke.yaml @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +suite_id: repo-data-smoke +model_configs: ./repo-data-smoke-models.yaml +model_providers: ./repo-data-smoke-providers.yaml +workloads: + - id: biographies + source: ../../../docs/data/NVIDIA_synthetic_biographies.csv + text_column: biography + row_limit: 5 + - id: legal + source: ../../../docs/data/TAB_legal_sample25.csv + text_column: text + row_limit: 5 +configs: + - id: biographies-redact-default + replace: redact + - id: legal-hash-agent-labels + detect: + entity_labels: [person, email, api_key, password] + replace: + strategy: hash + digest_length: 12 +matrix: + - workload: biographies + config: biographies-redact-default + - workload: legal + config: legal-hash-agent-labels diff --git a/tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh b/tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh new file mode 100644 index 00000000..5bc5612a --- /dev/null +++ b/tools/measurement/examples/run-repo-data-smoke-with-dd-traces.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +output_dir="${1:-/tmp/anonymizer-repo-data-smoke-dd-traces}" +trace_mode="${DD_TRACE_MODE:-last_message}" +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "${script_dir}/../../.." 
&& pwd)" +suite_file="${BENCHMARK_SUITE:-${script_dir}/repo-data-smoke.yaml}" + +cd "${repo_root}" + +uv run python tools/measurement/run_benchmarks.py \ + "${suite_file}" \ + --output "${output_dir}" \ + --overwrite \ + --dd-trace "${trace_mode}" \ + --trace-dir "${output_dir}/traces" diff --git a/tools/measurement/export_measurements.py b/tools/measurement/export_measurements.py new file mode 100755 index 00000000..6480bbf9 --- /dev/null +++ b/tools/measurement/export_measurements.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Export Anonymizer measurement JSONL into per-record-type tables. + +Usage: + uv run python tools/measurement/export_measurements.py measurements.jsonl --output tables + uv run python tools/measurement/export_measurements.py measurements.jsonl -o tables --format csv + uv run python tools/measurement/export_measurements.py measurements.jsonl -o tables --json +""" + +import json +import logging +import sys +from pathlib import Path +from typing import Annotated + +import cyclopts +import pandas as pd +from measurement_tools.cli import LogFormat, configure_logging, log_bad_input +from measurement_tools.tables import ExportFormat, ensure_can_write, write_table +from pydantic import BaseModel, Field + +app = cyclopts.App(help=__doc__) +logger = logging.getLogger("measurement.export") + +MANIFEST_FILENAME = "manifest.json" + + +class TableSummary(BaseModel): + record_type: str + rows: int + columns: int + path: str + + +class ExportResult(BaseModel): + input_path: str + output_dir: str + format: ExportFormat + total_rows: int + tables: list[TableSummary] = Field(default_factory=list) + manifest_path: str + + +def read_measurements(path: Path) -> pd.DataFrame: + if not path.exists(): + raise ValueError(f"input path does not exist: {path}") + if path.is_dir(): + raise ValueError(f"input path is a directory: 
{path}") + if path.suffix == ".json": + dataframe = pd.read_json(path) + else: + dataframe = pd.read_json(path, lines=True) + if "record_type" not in dataframe.columns: + raise ValueError("measurement input must contain a record_type field") + return dataframe + + +def normalize_table(rows: pd.DataFrame) -> pd.DataFrame: + relevant_rows = rows.dropna(axis="columns", how="all") + normalized = pd.json_normalize(relevant_rows.to_dict("records"), sep=".") + for column in normalized.columns: + if normalized[column].map(_is_nested_value).any(): + normalized[column] = normalized[column].map(_json_cell) + return normalized + + +def _is_nested_value(value: object) -> bool: + return isinstance(value, dict | list) + + +def _json_cell(value: object) -> object: + if not _is_nested_value(value): + return value + return json.dumps(value, ensure_ascii=True, sort_keys=True) + + +def export_tables( + dataframe: pd.DataFrame, + *, + input_path: Path, + output_dir: Path, + export_format: ExportFormat, + overwrite: bool, +) -> ExportResult: + output_dir.mkdir(parents=True, exist_ok=True) + tables = [ + _export_one_table(record_type, rows, output_dir=output_dir, export_format=export_format, overwrite=overwrite) + for record_type, rows in dataframe.groupby("record_type", sort=False) + ] + result = ExportResult( + input_path=str(input_path), + output_dir=str(output_dir), + format=export_format, + total_rows=len(dataframe), + tables=tables, + manifest_path=str(output_dir / MANIFEST_FILENAME), + ) + write_manifest(result, output_dir / MANIFEST_FILENAME, overwrite=overwrite) + return result + + +def _export_one_table( + record_type: str, + rows: pd.DataFrame, + *, + output_dir: Path, + export_format: ExportFormat, + overwrite: bool, +) -> TableSummary: + table = normalize_table(rows) + path = output_dir / f"{record_type}.{export_format.value}" + ensure_can_write(path, overwrite=overwrite) + write_table(table, path, export_format) + return TableSummary(record_type=record_type, 
rows=len(table), columns=len(table.columns), path=str(path)) + + +def write_manifest(result: ExportResult, path: Path, *, overwrite: bool) -> None: + ensure_can_write(path, overwrite=overwrite) + path.write_text(result.model_dump_json(indent=2) + "\n", encoding="utf-8") + + +def render_result(result: ExportResult, *, json_output: bool) -> str: + if json_output: + return result.model_dump_json(indent=2) + lines = [f"Wrote {len(result.tables)} table(s) from {result.total_rows} measurement record(s)"] + lines.append(f"Output: {result.output_dir}") + for table in result.tables: + lines.append(f"- {table.record_type}: {table.rows} rows, {table.columns} columns -> {table.path}") + lines.append(f"Manifest: {result.manifest_path}") + return "\n".join(lines) + + +@app.default +def main( + input_path: Path, + *, + output: Annotated[Path | None, cyclopts.Parameter(("--output", "-o"))] = None, + format: Annotated[ExportFormat, cyclopts.Parameter("--format")] = ExportFormat.parquet, + overwrite: Annotated[bool, cyclopts.Parameter("--overwrite")] = False, + json_output: Annotated[bool, cyclopts.Parameter("--json")] = False, + log_format: Annotated[LogFormat, cyclopts.Parameter("--log-format")] = LogFormat.plain, +) -> None: + configure_logging(log_format) + output_dir = output or input_path.with_suffix("").with_name(f"{input_path.stem}-tables") + try: + dataframe = read_measurements(input_path) + result = export_tables( + dataframe, + input_path=input_path, + output_dir=output_dir, + export_format=format, + overwrite=overwrite, + ) + except ValueError as exc: + log_bad_input(logger, str(exc)) + raise SystemExit(125) from exc + sys.stdout.write(render_result(result, json_output=json_output) + "\n") + + +if __name__ == "__main__": + app() diff --git a/tools/measurement/measurement_tools/__init__.py b/tools/measurement/measurement_tools/__init__.py new file mode 100644 index 00000000..28a69b6a --- /dev/null +++ b/tools/measurement/measurement_tools/__init__.py @@ -0,0 +1,4 @@ 
+#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Shared support for measurement command-line tools.""" diff --git a/tools/measurement/measurement_tools/cli.py b/tools/measurement/measurement_tools/cli.py new file mode 100644 index 00000000..505bbf2f --- /dev/null +++ b/tools/measurement/measurement_tools/cli.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Shared CLI logging helpers for measurement tools.""" + +from __future__ import annotations + +import json +import logging +import sys +from enum import StrEnum + + +class LogFormat(StrEnum): + plain = "plain" + json = "json" + + +_log_format = LogFormat.plain + + +def configure_logging(log_format: LogFormat) -> None: + global _log_format + + _log_format = log_format + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +def log_bad_input(logger: logging.Logger, error: str) -> None: + if _log_format == LogFormat.json: + payload = {"level": "error", "event": "bad_input", "error": error} + sys.stderr.write(json.dumps(payload, ensure_ascii=True, sort_keys=True) + "\n") + return + logger.error("bad_input error=%s", error) diff --git a/tools/measurement/measurement_tools/stats.py b/tools/measurement/measurement_tools/stats.py new file mode 100644 index 00000000..bdc30977 --- /dev/null +++ b/tools/measurement/measurement_tools/stats.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Small aggregation helpers shared by measurement analysis tools.""" + +from __future__ import annotations + +from typing import cast + +import pandas as pd + + +def none_if_nan(value: object) -> str | None: + if pd.isna(value): + return None + return str(value) + + +def median_or_none(dataframe: pd.DataFrame, column: str) -> float | None: + if column not in dataframe.columns: + return None + values = pd.to_numeric(dataframe[column], errors="coerce").dropna() + if values.empty: + return None + return float(values.median()) + + +def sum_int_or_zero(dataframe: pd.DataFrame, column: str) -> int: + return int(sum_or_zero(dataframe, column)) + + +def sum_or_zero(dataframe: pd.DataFrame, column: str) -> float: + value = sum_or_none(dataframe, column) + return 0.0 if value is None else value + + +def sum_or_none(dataframe: pd.DataFrame, column: str) -> float | None: + if column not in dataframe.columns: + return None + values = pd.to_numeric(dataframe[column], errors="coerce").dropna() + if values.empty: + return None + return float(values.sum()) + + +def optional_number(value: object) -> float | None: + if value is None or pd.isna(value): + return None + return float(cast(float, value)) diff --git a/tools/measurement/measurement_tools/tables.py b/tools/measurement/measurement_tools/tables.py new file mode 100644 index 00000000..4e697aa8 --- /dev/null +++ b/tools/measurement/measurement_tools/tables.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Shared table export helpers for measurement analysis tools.""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum +from pathlib import Path +from typing import Sequence + +import pandas as pd +from pydantic import BaseModel, Field + + +class ExportFormat(StrEnum): + parquet = "parquet" + csv = "csv" + jsonl = "jsonl" + + +class TableSummary(BaseModel): + table: str + rows: int + path: str + + +class AnalysisExportResult(BaseModel): + output_dir: str + format: ExportFormat + tables: list[TableSummary] = Field(default_factory=list) + manifest_path: str + + +@dataclass(frozen=True) +class ModelTableSpec: + name: str + rows: Sequence[BaseModel] + row_model: type[BaseModel] | None = None + + +def write_analysis_tables( + output_dir: Path, + export_format: ExportFormat, + specs: Sequence[ModelTableSpec], +) -> AnalysisExportResult: + output_dir.mkdir(parents=True, exist_ok=True) + tables = [ + write_model_rows(spec.rows, output_dir / f"{spec.name}.{export_format.value}", export_format, spec.row_model) + for spec in specs + ] + export_result = AnalysisExportResult( + output_dir=str(output_dir), + format=export_format, + tables=tables, + manifest_path=str(output_dir / "manifest.json"), + ) + Path(export_result.manifest_path).write_text(export_result.model_dump_json(indent=2) + "\n", encoding="utf-8") + return export_result + + +def write_model_rows( + rows: Sequence[BaseModel], + path: Path, + export_format: ExportFormat, + row_model: type[BaseModel] | None = None, +) -> TableSummary: + table = rows_to_table(rows, row_model) + write_table(table, path, export_format) + return TableSummary(table=path.stem, rows=len(table), path=str(path)) + + +def rows_to_table(rows: Sequence[BaseModel], row_model: type[BaseModel] | None = None) -> pd.DataFrame: + if rows: + return pd.json_normalize([row.model_dump() for row in rows], sep=".") + if row_model is None: + return pd.DataFrame() + return 
pd.DataFrame(columns=list(row_model.model_fields)) + + +def write_table(table: pd.DataFrame, path: Path, export_format: ExportFormat) -> None: + if export_format == ExportFormat.parquet: + table.to_parquet(path, index=False) + elif export_format == ExportFormat.csv: + table.to_csv(path, index=False) + else: + table.to_json(path, orient="records", lines=True) + + +def ensure_can_write(path: Path, *, overwrite: bool) -> None: + if path.exists() and not overwrite: + raise ValueError(f"output already exists: {path}; pass --overwrite to replace it") diff --git a/tools/measurement/run_benchmarks.py b/tools/measurement/run_benchmarks.py new file mode 100755 index 00000000..5e67ca6c --- /dev/null +++ b/tools/measurement/run_benchmarks.py @@ -0,0 +1,1252 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Run Anonymizer benchmark suites and export measurement tables. + +Usage: + uv run python tools/measurement/run_benchmarks.py suite.yaml --output benchmark-runs/suite + uv run python tools/measurement/run_benchmarks.py suite.yaml --dry-run --json +""" + +import logging +import shutil +import sys +import time +from dataclasses import dataclass +from enum import StrEnum +from pathlib import Path +from typing import Annotated, Any + +import cyclopts +import pandas as pd +import pyarrow.parquet as pq +import yaml +from analyze_detection_artifacts import ( + analyze_artifacts, + iter_detection_parquet_files, +) +from data_designer.config.models import ModelProvider +from data_designer.config.utils.io_helpers import load_config_file +from export_measurements import export_tables, read_measurements +from measurement_tools.cli import LogFormat, configure_logging, log_bad_input +from measurement_tools.tables import ExportFormat +from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator + +from anonymizer.config.anonymizer_config import ( 
+ AnonymizerConfig, + AnonymizerInput, + Detect, + Rewrite, + infer_input_source_suffix, + is_remote_input_source, +) +from anonymizer.config.replace_strategies import Annotate, Hash, Redact, Substitute +from anonymizer.config.rewrite import DEFAULT_PRESERVE_TEXT, DEFAULT_PROTECT_TEXT, PrivacyGoal, RiskTolerance +from anonymizer.engine.io.constants import SUPPORTED_IO_FORMATS +from anonymizer.engine.ndd.model_loader import parse_model_configs, validate_model_alias_references +from anonymizer.interface.anonymizer import Anonymizer +from anonymizer.measurement import MeasurementConfig, configured_measurement_session, record_evaluation_metrics + +app = cyclopts.App(help=__doc__) +logger = logging.getLogger("measurement.benchmark") + + +class CaseStatus(StrEnum): + planned = "planned" + completed = "completed" + error = "error" + + +class DDTraceMode(StrEnum): + none = "none" + last_message = "last_message" + all_messages = "all_messages" + + +class ReplaceKind(StrEnum): + redact = "redact" + hash = "hash" + annotate = "annotate" + substitute = "substitute" + + +class WorkloadSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + id: str + source: str + text_column: str = "text" + id_column: str | None = None + data_summary: str | None = None + row_limit: int | None = Field(default=None, ge=1) + row_offset: int = Field(default=0, ge=0) + + +class ReplaceSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + strategy: ReplaceKind + format_template: str | None = None + normalize_label: bool | None = None + algorithm: str | None = None + digest_length: int | None = None + instructions: str | None = None + + +class RewriteSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + protect: str | None = None + preserve: str | None = None + instructions: str | None = None + risk_tolerance: RiskTolerance = RiskTolerance.low + max_repair_iterations: int = 3 + strict_entity_protection: bool = False + + +class ConfigSpec(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + id: str + detect: dict[str, Any] = Field(default_factory=dict) + replace: str | ReplaceSpec | None = None + rewrite: RewriteSpec | None = None + evaluate: bool = False + emit_telemetry: bool = False + + @model_validator(mode="after") + def validate_mode(self) -> "ConfigSpec": + if self.replace is None and self.rewrite is None: + raise ValueError("config must define replace or rewrite") + if self.replace is not None and self.rewrite is not None: + raise ValueError("config cannot define both replace and rewrite") + if self.evaluate and self.rewrite is not None: + raise ValueError("evaluate is only supported for replace configs") + return self + + +class MatrixEntry(BaseModel): + model_config = ConfigDict(extra="forbid") + + workload: str + config: str + repetitions: int = Field(default=1, ge=1) + + +RESERVED_RUN_TAG_KEYS = frozenset({"suite_id", "workload_id", "config_id", "repetition", "case_id"}) + + +def _duplicates(values: list[str]) -> list[str]: + seen: set[str] = set() + duplicates: set[str] = set() + for value in values: + if value in seen: + duplicates.add(value) + seen.add(value) + return sorted(duplicates) + + +class BenchmarkSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + suite_id: str + model_configs: str | None = None + model_providers: str | None = None + artifact_path: str | None = None + run_tags: dict[str, Any] = Field(default_factory=dict) + case_retries: int = Field(default=0, ge=0) + case_retry_backoff_sec: float = Field(default=0.0, ge=0.0) + workloads: list[WorkloadSpec] = Field(min_length=1) + configs: list[ConfigSpec] = Field(min_length=1) + matrix: list[MatrixEntry] | None = Field(default=None, min_length=1) + + @model_validator(mode="after") + def validate_ids(self) -> "BenchmarkSpec": + workload_ids = [workload.id for workload in self.workloads] + config_ids = [config.id for config in self.configs] + if duplicate_workloads := _duplicates(workload_ids): + raise ValueError(f"duplicate workload 
class BenchmarkCase(BaseModel):
    """One planned or executed benchmark case: a workload/config pair at one repetition.

    ``build_cases`` creates instances with only the identity fields set, so the
    result fields keep their defaults (``status`` starts as ``CaseStatus.planned``).
    ``_case_with_result`` fills the result fields on a copy of the case after a run.
    """

    # --- Identity fields (set by build_cases) ---
    suite_id: str
    workload_id: str
    config_id: str
    # Zero-based: build_cases iterates ``range(entry.repetitions)``.
    repetition: int
    # Built by build_cases as "<workload>__<config>__r<repetition, 3 digits>".
    case_id: str

    # --- Result fields (set by _case_with_result) ---
    status: CaseStatus = CaseStatus.planned
    # perf_counter delta measured from the start of _run_case, so it spans all attempts.
    elapsed_sec: float | None = None
    # Stringified per-case raw measurement path; set for error cases too.
    measurement_path: str | None = None
    # Stringified path of the per-case detection-artifact export, or None when nothing was exported.
    detection_artifact_path: str | None = None
    # Stringified trace paths; None when the corresponding tracing mode is off.
    trace_path: str | None = None
    task_trace_path: str | None = None
    # ``str(exc)`` of the final failure for error cases; None otherwise.
    error: str | None = None
    # Number of attempts made, and ``str(exc)`` for each failed attempt.
    attempt_count: int = 0
    attempt_errors: list[str] = Field(default_factory=list)
def load_spec(path: Path) -> BenchmarkSpec:
    """Read a benchmark suite YAML file and validate it into a ``BenchmarkSpec``.

    Args:
        path: Location of the suite YAML file.

    Returns:
        The validated benchmark specification.

    Raises:
        ValueError: If ``path`` is missing or a directory, if the file is not
            well-formed YAML, or if its top-level node is not a mapping.
        ValidationError: Propagated from ``BenchmarkSpec.model_validate`` when
            the mapping does not match the schema.
    """
    if not path.exists() or path.is_dir():
        raise ValueError(f"spec path is not a file: {path}")
    try:
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        # yaml.YAMLError is not a ValueError; convert it so callers that treat
        # ValueError as "bad input" report malformed YAML instead of crashing.
        raise ValueError(f"spec is not valid YAML: {path}: {exc}") from exc
    if not isinstance(raw, dict):
        raise ValueError("benchmark spec must be a YAML mapping")
    return BenchmarkSpec.model_validate(raw)
def preflight_suite(spec: BenchmarkSpec, *, spec_path: Path) -> None:
    """Run the cheap suite checks up front, before any case spends model time.

    Problems from the model-config, model-provider, workload and config checks
    are collected and reported together.

    Raises:
        ValueError: If any check reported a problem; the message lists each
            problem as a bullet.
    """
    suite_dir = spec_path.parent
    problems: list[str] = []

    # The model-config and provider checks append to ``problems`` themselves.
    models = _preflight_model_configs(spec, base_dir=suite_dir, errors=problems)
    _preflight_model_providers_with_errors(spec, base_dir=suite_dir, errors=problems)

    # The workload and config checks return their own lists of problems.
    problems += _preflight_workload_errors(spec, base_dir=suite_dir)
    problems += _preflight_config_errors(spec, parsed_models=models)

    if not problems:
        return
    bullet_list = "\n- ".join(problems)
    raise ValueError("Benchmark preflight failed:\n- " + bullet_list)
validate_model_alias_references( + parsed_models.model_configs, + parsed_models.selected_models, + check_substitute=isinstance(anonymizer_config.replace, Substitute) + or anonymizer_config.rewrite is not None, + check_rewrite=anonymizer_config.rewrite is not None, + check_evaluate=config.evaluate, + ) + except ValueError as exc: + errors.append(f"config '{config.id}' model aliases invalid: {exc}") + return errors + + +def _active_config_ids(spec: BenchmarkSpec) -> set[str]: + if spec.matrix is None: + return {config.id for config in spec.configs} + return {entry.config for entry in spec.matrix} + + +def _preflight_model_providers(spec: BenchmarkSpec, *, base_dir: Path) -> None: + raw = _resolve_config_source(spec.model_providers, base_dir) + if raw is None: + return + if "\n" in raw: + config_dict = yaml.safe_load(raw) + else: + candidate = Path(raw.strip()).expanduser() + if candidate.suffix in (".yaml", ".yml"): + if not candidate.is_file(): + raise FileNotFoundError(f"Providers config file not found: {candidate}") + config_dict = load_config_file(candidate) + else: + config_dict = yaml.safe_load(raw) + raw_providers = config_dict.get("providers") if isinstance(config_dict, dict) else None + if not isinstance(raw_providers, list): + raise ValueError("model_providers YAML must contain a top-level 'providers' list.") + for provider in raw_providers: + ModelProvider.model_validate(provider) + + +def _preflight_workload(workload: WorkloadSpec, *, base_dir: Path) -> None: + resolved_source = _resolve_input_source(workload.source, base_dir) + if _workload_has_row_slice(workload) and not _is_local_input_source(str(resolved_source)): + raise ValueError(f"workload '{workload.id}' row slicing requires a local workload source") + input_data = AnonymizerInput( + source=str(resolved_source), + text_column=workload.text_column, + id_column=workload.id_column, + data_summary=workload.data_summary, + ) + columns = _input_columns(input_data.source) + if columns is None: + return 
def _input_columns(source: str) -> set[str] | None:
    """Return the column names of an input source without loading its rows.

    Args:
        source: Path or URL of the workload input.

    Returns:
        The set of column names for a local source, or ``None`` for a remote
        source, whose columns are not inspected here.

    Raises:
        ValueError: If the source suffix is not in ``SUPPORTED_IO_FORMATS``.
    """
    suffix = infer_input_source_suffix(source)
    if suffix not in SUPPORTED_IO_FORMATS:
        supported_formats = " or ".join(SUPPORTED_IO_FORMATS)
        raise ValueError(f"Unsupported input format: {suffix}. Use {supported_formats}.")
    if is_remote_input_source(source):
        return None
    if suffix == ".csv":
        # nrows=0 parses only the header row.
        return set(pd.read_csv(source, nrows=0).columns)
    # read_schema reads only the Parquet footer and releases the file when
    # done, unlike a bare ParquetFile(...) whose handle is never closed.
    return set(pq.read_schema(source).names)
output_dir=output_dir, + measurement_path=measurement_path, + table_dir=table_dir, + artifact_analysis_path=artifact_analysis_path, + cases=cases, + ) + write_summary(result) + return result + + +def _run_cases( + spec: BenchmarkSpec, + *, + contexts: dict[str, Any], + anonymizer: Anonymizer, + fail_fast: bool, + export: bool, +) -> list[BenchmarkCase]: + return [ + _run_case( + case, + spec, + contexts=contexts, + anonymizer=anonymizer, + fail_fast=fail_fast, + export_detection_artifacts=export, + ) + for case in build_cases(spec) + ] + + +def _should_export_measurements(*, export: bool, measurement_path: Path) -> bool: + return export and measurement_path.stat().st_size > 0 + + +def _export_suite_tables(measurement_path: Path, *, output_dir: Path, should_export: bool) -> Path | None: + if not should_export: + return None + return export_measurement_tables(measurement_path, output_dir / "tables") + + +def _combine_suite_detection_artifacts( + cases: list[BenchmarkCase], + *, + output_dir: Path, + should_export: bool, +) -> Path | None: + if not should_export: + return None + return combine_detection_artifact_analysis(cases, output_dir / "detection-artifacts.jsonl") + + +def _benchmark_result( + spec: BenchmarkSpec, + *, + output_dir: Path, + measurement_path: Path, + table_dir: Path | None, + artifact_analysis_path: Path | None, + cases: list[BenchmarkCase], +) -> BenchmarkResult: + return BenchmarkResult( + suite_id=spec.suite_id, + output_dir=str(output_dir), + measurement_path=str(measurement_path), + summary_path=str(output_dir / "summary.json"), + table_dir=str(table_dir) if table_dir is not None else None, + detection_artifact_analysis_path=str(artifact_analysis_path) if artifact_analysis_path is not None else None, + cases=cases, + ) + + +def _build_contexts( + spec: BenchmarkSpec, + *, + spec_path: Path, + output_dir: Path, + dd_trace: DDTraceMode, + trace_dir: Path | None, + dd_task_trace: bool = False, + task_trace_dir: Path | None = None, +) -> 
def _run_case(
    case: BenchmarkCase,
    spec: BenchmarkSpec,
    *,
    contexts: dict[str, Any],
    anonymizer: Anonymizer,
    fail_fast: bool,
    export_detection_artifacts: bool,
) -> BenchmarkCase:
    """Run one benchmark case, retrying on failure, and return the updated case.

    Args:
        case: The planned case to execute.
        spec: Suite specification; supplies ``case_retries`` and the backoff.
        contexts: Shared run context as built by ``_build_contexts``.
        anonymizer: Anonymizer instance shared across cases.
        fail_fast: When true, make a single attempt and re-raise its exception.
        export_detection_artifacts: Whether to snapshot and export detection artifacts.

    Returns:
        A copy of ``case`` marked completed, or marked error once all attempts
        are exhausted.

    Raises:
        Exception: Whatever the attempt raised, when ``fail_fast`` is true.
    """
    # Taken once, outside the loop: the reported elapsed time spans every
    # attempt and any backoff sleeps between them.
    started = time.perf_counter()
    attempt_errors: list[str] = []
    # fail_fast disables retries entirely.
    max_attempts = 1 if fail_fast else spec.case_retries + 1
    for attempt_number in range(1, max_attempts + 1):
        # Paths (and the artifact snapshot, if any) are rebuilt for each attempt.
        paths = _case_run_paths(case, contexts=contexts, export_detection_artifacts=export_detection_artifacts)
        try:
            return _run_case_success(
                case,
                spec,
                contexts=contexts,
                anonymizer=anonymizer,
                paths=paths,
                started=started,
                attempt_count=attempt_number,
                attempt_errors=attempt_errors,
            )
        except Exception as exc:
            if fail_fast:
                raise
            attempt_errors.append(str(exc))
            if attempt_number >= max_attempts:
                # Out of attempts: record the failure instead of raising.
                return _run_case_error(
                    case,
                    contexts=contexts,
                    paths=paths,
                    started=started,
                    error=exc,
                    attempt_count=attempt_number,
                    attempt_errors=attempt_errors,
                )
            _sleep_before_case_retry(spec, case=case, attempt_number=attempt_number, error=exc)

    # Every loop iteration returns or raises, so this is only a safety net.
    raise RuntimeError("unreachable benchmark retry state")
def _case_run_paths(
    case: BenchmarkCase,
    *,
    contexts: dict[str, Any],
    export_detection_artifacts: bool,
) -> _CaseRunPaths:
    """Collect the per-case file locations and the optional artifact snapshot.

    The detection-artifact snapshot is taken only when artifact export is
    requested; otherwise it is ``None``.
    """
    raw_dir = contexts["raw_dir"]
    measurement_file = raw_dir / f"{case.case_id}.jsonl"
    artifact_file = raw_dir / f"{case.case_id}.detection-artifacts.jsonl"
    message_trace = _case_trace_path(case, contexts=contexts)
    task_trace = _case_task_trace_path(case, contexts=contexts)

    snapshot = None
    if export_detection_artifacts:
        snapshot = snapshot_detection_artifacts(contexts["artifact_path"])

    return _CaseRunPaths(
        raw_path=measurement_file,
        artifact_output_path=artifact_file,
        trace_path=message_trace,
        task_trace_path=task_trace,
        artifact_snapshot=snapshot,
        export_detection_artifacts=export_detection_artifacts,
    )
def _case_detection_artifact_path(
    contexts: dict[str, Any],
    paths: _CaseRunPaths,
    *,
    case: BenchmarkCase,
) -> Path | None:
    """Export the detection artifacts for a completed case and return the export path.

    Args:
        contexts: Shared run context; supplies the artifact directory.
        paths: Per-case paths, including the export destination and the
            artifact snapshot taken for this attempt.
        case: The case whose metadata is attached to the exported rows.

    Returns:
        The path written by the export helper, or ``None`` when no snapshot was
        taken or the helper produced nothing.
    """
    # The helper's result is returned as-is. The previous trailing conditional
    # returned this same value on every path, so it was dead logic.
    return _export_case_detection_artifacts_if_requested(
        contexts,
        paths.artifact_output_path,
        case=case,
        artifact_snapshot=paths.artifact_snapshot,
    )
output_path: Path, + *, + case: BenchmarkCase, + artifact_snapshot: dict[str, int] | None, +) -> Path | None: + if artifact_snapshot is None: + return None + return export_case_detection_artifact_analysis( + contexts["artifact_path"], + output_path, + case=case, + artifact_snapshot=artifact_snapshot, + ) + + +def _case_trace_path(case: BenchmarkCase, *, contexts: dict[str, Any]) -> Path | None: + if contexts["dd_trace"] == DDTraceMode.none: + return None + return contexts["trace_dir"] / f"{case.case_id}.jsonl" + + +def _case_task_trace_path(case: BenchmarkCase, *, contexts: dict[str, Any]) -> Path | None: + if not contexts["dd_task_trace"]: + return None + return contexts["task_trace_dir"] / f"{case.case_id}.jsonl" + + +def _execute_case( + anonymizer: Anonymizer, + workload: WorkloadSpec, + config: ConfigSpec, + *, + raw_path: Path, + trace_path: Path | None, + task_trace_path: Path | None, + case: BenchmarkCase, + spec: BenchmarkSpec, + base_dir: Path, + dd_trace: DDTraceMode, +) -> None: + anonymizer_config = build_anonymizer_config(config) + input_data = build_input( + workload, + base_dir, + slice_dir=raw_path.parent / "inputs", + case_id=case.case_id, + ) + measurement = MeasurementConfig( + output_path=raw_path, + run_id=case.case_id, + run_tags=_run_tags(case, spec), + streaming=True, + keep_records=False, + dd_trace=dd_trace.value, + dd_trace_path=trace_path, + dd_task_trace_path=task_trace_path, + fail_on_write_error=True, + ) + with configured_measurement_session(measurement): + result = anonymizer.run( + config=anonymizer_config, + data=input_data, + ) + if config.evaluate: + evaluated = anonymizer.evaluate(result) + record_evaluation_metrics( + evaluated.trace_dataframe, + mode="replace", + strategy=type(anonymizer_config.replace).__name__, + text_column=evaluated.resolved_text_column, + ) + + +def build_input( + workload: WorkloadSpec, + base_dir: Path, + *, + slice_dir: Path | None = None, + case_id: str | None = None, +) -> AnonymizerInput: + 
resolved_source = _resolve_input_source(workload.source, base_dir) + source = ( + _materialize_sliced_source(workload, resolved_source, slice_dir=slice_dir, case_id=case_id) + if _workload_has_row_slice(workload) + else resolved_source + ) + return AnonymizerInput( + source=str(source), + text_column=workload.text_column, + id_column=workload.id_column, + data_summary=workload.data_summary, + ) + + +def _workload_has_row_slice(workload: WorkloadSpec) -> bool: + return workload.row_limit is not None or workload.row_offset > 0 + + +def _is_local_input_source(source: str) -> bool: + return "://" not in source + + +def _materialize_sliced_source( + workload: WorkloadSpec, + source: str | Path, + *, + slice_dir: Path | None, + case_id: str | None, +) -> Path: + if not _is_local_input_source(str(source)): + raise ValueError(f"workload '{workload.id}' row slicing requires a local workload source") + if slice_dir is None or case_id is None: + raise ValueError("row slicing requires slice_dir and case_id") + source_path = Path(source) + suffix = infer_input_source_suffix(str(source_path)) + dataframe = _read_local_input_dataframe(source_path, suffix=suffix) + sliced = dataframe.iloc[_slice_bounds(workload)] + slice_dir.mkdir(parents=True, exist_ok=True) + destination = slice_dir / f"{_safe_case_filename(case_id)}{suffix}" + _write_local_input_dataframe(sliced, destination, suffix=suffix) + return destination + + +def _slice_bounds(workload: WorkloadSpec) -> slice: + start = workload.row_offset + stop = start + workload.row_limit if workload.row_limit is not None else None + return slice(start, stop) + + +def _read_local_input_dataframe(source: Path, *, suffix: str) -> pd.DataFrame: + if suffix == ".csv": + return pd.read_csv(source) + if suffix == ".parquet": + return pd.read_parquet(source) + supported_formats = " or ".join(SUPPORTED_IO_FORMATS) + raise ValueError(f"Unsupported input format: {suffix}. 
Use {supported_formats}.") + + +def _write_local_input_dataframe(dataframe: pd.DataFrame, destination: Path, *, suffix: str) -> None: + if suffix == ".csv": + dataframe.to_csv(destination, index=False) + return + if suffix == ".parquet": + dataframe.to_parquet(destination, index=False) + return + supported_formats = " or ".join(SUPPORTED_IO_FORMATS) + raise ValueError(f"Unsupported input format: {suffix}. Use {supported_formats}.") + + +def _safe_case_filename(case_id: str) -> str: + return "".join(char if char.isalnum() or char in "._-" else "_" for char in case_id) + + +def build_anonymizer_config(config: ConfigSpec) -> AnonymizerConfig: + detect = Detect.model_validate(config.detect) + if config.replace is not None: + return AnonymizerConfig( + detect=detect, replace=build_replace(config.replace), emit_telemetry=config.emit_telemetry + ) + return AnonymizerConfig(detect=detect, rewrite=build_rewrite(config.rewrite), emit_telemetry=config.emit_telemetry) + + +def build_replace(raw: str | ReplaceSpec) -> Redact | Hash | Annotate | Substitute: + spec = ReplaceSpec(strategy=ReplaceKind(raw)) if isinstance(raw, str) else raw + if spec.strategy == ReplaceKind.redact: + return Redact(**_present({"format_template": spec.format_template, "normalize_label": spec.normalize_label})) + if spec.strategy == ReplaceKind.hash: + return Hash( + **_present( + { + "format_template": spec.format_template, + "algorithm": spec.algorithm, + "digest_length": spec.digest_length, + } + ) + ) + if spec.strategy == ReplaceKind.annotate: + return Annotate(**_present({"format_template": spec.format_template})) + return Substitute(**_present({"instructions": spec.instructions})) + + +def build_rewrite(spec: RewriteSpec | None) -> Rewrite: + if spec is None: + raise ValueError("rewrite config is missing") + privacy_goal = _privacy_goal(spec) + return Rewrite( + privacy_goal=privacy_goal, + instructions=spec.instructions, + risk_tolerance=spec.risk_tolerance, + 
def combine_measurements(cases: "list[BenchmarkCase]", destination: Path) -> Path:
    """Concatenate the per-case measurement files into one JSONL file.

    Cases without a ``measurement_path``, or whose file does not exist, are
    skipped. ``destination`` is always created, even if nothing is written.

    Args:
        cases: Executed cases, in the order their records should appear.
        destination: File to write the combined records to (overwritten).

    Returns:
        ``destination``.
    """
    with destination.open("w", encoding="utf-8") as output:
        for case in cases:
            if case.measurement_path is None:
                continue
            source = Path(case.measurement_path)
            if not source.exists():
                continue
            text = source.read_text(encoding="utf-8")
            output.write(text)
            # A per-case file that lacks its final newline would otherwise fuse
            # its last record with the first record of the next case.
            if text and not text.endswith("\n"):
                output.write("\n")
    return destination
return [] + changed: list[Path] = [] + for parquet_file in iter_detection_parquet_files(artifact_path): + key = str(parquet_file.relative_to(artifact_path)) + if snapshot.get(key) != parquet_file.stat().st_mtime_ns: + changed.append(parquet_file) + return changed + + +def export_detection_artifact_analysis( + artifact_path: Path, + output_path: Path, + *, + artifact_snapshot: dict[str, int] | None = None, +) -> Path | None: + if not artifact_path.exists(): + return None + parquet_files = ( + changed_detection_artifact_files(artifact_path, artifact_snapshot) if artifact_snapshot is not None else None + ) + analysis = analyze_artifacts(artifact_path, parquet_files=parquet_files) + if not analysis.rows: + return None + write_detection_artifact_payloads([row.model_dump() for row in analysis.rows], output_path) + return output_path + + +def export_case_detection_artifact_analysis( + artifact_path: Path, + output_path: Path, + *, + case: BenchmarkCase, + artifact_snapshot: dict[str, int], +) -> Path | None: + if not artifact_path.exists(): + return None + parquet_files = changed_detection_artifact_files(artifact_path, artifact_snapshot) + analysis = analyze_artifacts(artifact_path, parquet_files=parquet_files) + if not analysis.rows: + return None + write_detection_artifact_payloads( + [_with_case_metadata(row.model_dump(), case=case) for row in analysis.rows], + output_path, + ) + return output_path + + +def _with_case_metadata(row: dict[str, Any], *, case: BenchmarkCase) -> dict[str, Any]: + return { + "suite_id": case.suite_id, + "workload_id": case.workload_id, + "config_id": case.config_id, + "repetition": case.repetition, + "case_id": case.case_id, + "run_id": case.case_id, + **row, + } + + +def write_detection_artifact_payloads(rows: list[dict[str, Any]], output_path: Path) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + pd.json_normalize(rows, sep=".").to_json(output_path, orient="records", lines=True) + + +def write_summary(result: 
BenchmarkResult) -> None: + Path(result.summary_path).write_text(result.model_dump_json(indent=2) + "\n", encoding="utf-8") + + +def render_result(result: BenchmarkResult, *, json_output: bool) -> str: + if json_output: + return result.model_dump_json(indent=2) + completed = sum(case.status == CaseStatus.completed for case in result.cases) + errored = sum(case.status == CaseStatus.error for case in result.cases) + planned = sum(case.status == CaseStatus.planned for case in result.cases) + if planned and completed == 0 and errored == 0: + return f"Planned {planned} case(s); output={result.output_dir}" + return f"Ran {completed}/{len(result.cases)} case(s); errors={errored}; output={result.output_dir}" + + +def _run_tags(case: BenchmarkCase, spec: BenchmarkSpec) -> dict[str, Any]: + return { + **spec.run_tags, + "suite_id": spec.suite_id, + "workload_id": case.workload_id, + "config_id": case.config_id, + "repetition": case.repetition, + "case_id": case.case_id, + } + + +def _present(values: dict[str, Any]) -> dict[str, Any]: + return {key: value for key, value in values.items() if value is not None} + + +def _get_item(items: dict[str, Any], item_id: str, item_type: str) -> Any: + if item_id not in items: + raise ValueError(f"unknown {item_type}: {item_id}") + return items[item_id] + + +def _resolve_input_source(source: str, base_dir: Path) -> str | Path: + if "://" in source: + return source + return _resolve_path(source, base_dir) + + +def _resolve_optional_path(raw: str | None, base_dir: Path) -> Path | None: + if raw is None: + return None + return _resolve_path(raw, base_dir) + + +def _resolve_config_source(raw: str | None, base_dir: Path) -> str | None: + if raw is None or "\n" in raw: + return raw + candidate = Path(raw).expanduser() + if candidate.suffix in {".yaml", ".yml"}: + return str(_resolve_path(raw, base_dir)) + return raw + + +def _resolve_path(raw: str, base_dir: Path) -> Path: + path = Path(raw).expanduser() + return path if path.is_absolute() else 
def dry_run_result(
    spec: BenchmarkSpec,
    *,
    output_dir: Path,
    export: bool,
    dd_trace: DDTraceMode,
    trace_dir: Path | None,
    dd_task_trace: bool = False,
    task_trace_dir: Path | None = None,
) -> BenchmarkResult:
    """Describe what a real run would produce, without executing any case.

    Each planned case carries the trace paths it would write when the matching
    trace mode is enabled. Table and detection-artifact paths are reported only
    when ``export`` is true.
    """
    # Resolve each trace root once; None means that trace mode is off.
    message_trace_root = None
    if dd_trace != DDTraceMode.none:
        message_trace_root = trace_dir or output_dir / "traces"
    task_trace_root = None
    if dd_task_trace:
        task_trace_root = task_trace_dir or output_dir / "task-traces"

    planned: list[BenchmarkCase] = []
    for case in build_cases(spec):
        planned_paths: dict[str, str] = {}
        if message_trace_root is not None:
            planned_paths["trace_path"] = str(message_trace_root / f"{case.case_id}.jsonl")
        if task_trace_root is not None:
            planned_paths["task_trace_path"] = str(task_trace_root / f"{case.case_id}.jsonl")
        planned.append(case.model_copy(update=planned_paths) if planned_paths else case)

    tables = str(output_dir / "tables") if export else None
    artifact_analysis = str(output_dir / "detection-artifacts.jsonl") if export else None
    return BenchmarkResult(
        suite_id=spec.suite_id,
        output_dir=str(output_dir),
        measurement_path=str(output_dir / "measurements.jsonl"),
        summary_path=str(output_dir / "summary.json"),
        table_dir=tables,
        detection_artifact_analysis_path=artifact_analysis,
        cases=planned,
    )
def run_or_plan(
    spec_path: Path,
    *,
    output: Path | None,
    overwrite: bool,
    dry_run: bool,
    export: bool,
    fail_fast: bool,
    dd_trace: DDTraceMode = DDTraceMode.none,
    trace_dir: Path | None = None,
    dd_task_trace: bool = False,
    task_trace_dir: Path | None = None,
) -> BenchmarkResult:
    """Load and preflight a suite, then either plan it (dry run) or execute it.

    The output directory defaults to ``benchmark-runs/<suite_id>`` when
    ``output`` is not given.

    Raises:
        ValueError: If a trace directory is given without its trace mode, or if
            loading, preflight or output-directory preparation rejects the input.
    """
    suite = load_spec(spec_path)
    destination = output if output else Path("benchmark-runs") / suite.suite_id

    # A trace directory only makes sense when its trace mode is enabled.
    tracing_disabled = dd_trace == DDTraceMode.none
    if tracing_disabled and trace_dir is not None:
        raise ValueError("--trace-dir requires --dd-trace")
    if not dd_task_trace and task_trace_dir is not None:
        raise ValueError("--task-trace-dir requires --dd-task-trace")

    preflight_suite(suite, spec_path=spec_path)

    # Options passed unchanged to both the planner and the runner.
    trace_options = {
        "dd_trace": dd_trace,
        "trace_dir": trace_dir,
        "dd_task_trace": dd_task_trace,
        "task_trace_dir": task_trace_dir,
    }
    if dry_run:
        return dry_run_result(suite, output_dir=destination, export=export, **trace_options)

    prepare_output_dir(destination, overwrite=overwrite, dry_run=dry_run)
    return run_suite(
        suite,
        spec_path=spec_path,
        output_dir=destination,
        export=export,
        fail_fast=fail_fast,
        **trace_options,
    )
"data-designer" }, { name = "httpx" }, { name = "pydantic" }, + { name = "pydantic-settings" }, { name = "pygments" }, { name = "tiktoken" }, ] @@ -2462,6 +2463,7 @@ requires-dist = [ { name = "data-designer", specifier = "==0.6.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "pydantic", specifier = ">=2.9,<3" }, + { name = "pydantic-settings", specifier = ">=2.12,<3" }, { name = "pygments", specifier = ">=2.20.0" }, { name = "tiktoken", specifier = ">=0.9.0" }, ]