NVIDIA-NeMo · binaryaaron · Jun 12, 2026 · Jun 2, 2026 · Jun 3, 2026 · Jun 3, 2026
@@ -0,0 +1,174 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Benchmark CI
+
+# Manual-only workflow: it runs when someone dispatches it and takes all of its
+# parameters from the inputs below.
+on:
+  workflow_dispatch:
+    inputs:
+      # Git ref that the checkout step checks out before benchmarking.
+      ref:
+        description: "Commit SHA, branch, or tag to benchmark"
+        required: true
+        default: "main"
+      # Passed as the positional suite argument to run_benchmarks.py.
+      suite:
+        description: "Benchmark suite YAML path"
+        required: true
+        default: "tools/measurement/examples/repo-data-smoke.yaml"
+      # Used for --output, as the parent of the trace directories, and as the
+      # path uploaded by the artifact step.
+      output_dir:
+        description: "Output directory for benchmark artifacts"
+        required: true
+        default: "benchmark-results"
+      # Any value other than "none" also makes the run step pass
+      # --trace-dir <output_dir>/traces.
+      dd_trace:
+        description: "Capture DataDesigner message traces"
+        required: true
+        type: choice
+        options:
+          - "none"
+          - "last_message"
+          - "all_messages"
+        default: "none"
+      # Choice options are quoted strings; the run step compares against the
+      # literal "true" before adding the task-trace flags.
+      dd_task_trace:
+        description: "Capture sanitized DataDesigner scheduler task traces"
+        required: true
+        type: choice
+        options:
+          - "false"
+          - "true"
+        default: "false"
+      # Same string convention as dd_task_trace: "true" adds --fail-fast.
+      fail_fast:
+        description: "Stop at the first failed benchmark case"
+        required: true
+        type: choice
+        options:
+          - "false"
+          - "true"
+        default: "false"
+
+# Limit the workflow token to read access on repository contents.
+permissions:
+  contents: read
+
+# Dispatch inputs are exposed as environment variables so the shell and Python
+# steps read them from the environment instead of having expressions
+# interpolated into script text.
+env:
+  # NOTE(review): presumably disables NeMo telemetry for the run; confirm
+  # against the consumer of this variable.
+  NEMO_TELEMETRY_ENABLED: "false"
+  BENCHMARK_REF: ${{ inputs.ref }}
+  BENCHMARK_SUITE: ${{ inputs.suite }}
+  BENCHMARK_OUTPUT_DIR: ${{ inputs.output_dir }}
+  BENCHMARK_DD_TRACE: ${{ inputs.dd_trace }}
+  BENCHMARK_DD_TASK_TRACE: ${{ inputs.dd_task_trace }}
+  BENCHMARK_FAIL_FAST: ${{ inputs.fail_fast }}
+
+jobs:
+  # Single job: check out the target ref, run the suite, summarize, upload.
+  benchmark:
+    name: Benchmark
+    # Requires a self-hosted runner that carries the anonymizer-evals label.
+    runs-on: [self-hosted, anonymizer-evals]
+    # Upper bound for the whole job, in minutes.
+    timeout-minutes: 120
+
+    steps:
+      # Check out the ref chosen at dispatch time with full history.
+      # persist-credentials is disabled because later steps execute code from
+      # that ref and none of them needs authenticated git access, so the
+      # workflow token should not remain in the local git config.
+      - name: Checkout benchmark target
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.BENCHMARK_REF }}
+          fetch-depth: "0"
+          persist-credentials: false
+
+      # Publish the checked-out HEAD SHA as the `commit` step output; the
+      # summary and artifact steps read it via steps.target.outputs.commit.
+      - name: Resolve benchmark target commit
+        id: target
+        run: echo "commit=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
+
+      # Install uv with the action's cache option turned on.
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      # The version is quoted so YAML keeps it a string.
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.11"
+
+      # Sync the project environment including the `dev` dependency group.
+      - name: Install dependencies
+        run: uv sync --group dev
+
+      # Fail early with a workflow error annotation when the secret is not
+      # configured, before any benchmark work starts.
+      - name: Check NVIDIA API key
+        env:
+          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
+        run: |
+          # ${VAR:-} treats an unset variable the same as an empty one.
+          if [ -n "${NVIDIA_API_KEY:-}" ]; then
+            exit 0
+          fi
+          echo "::error::NVIDIA_API_KEY secret is required for benchmark CI"
+          exit 1
+
+      # Run the suite, translating the dispatch inputs into runner CLI flags.
+      - name: Run benchmark suite
+        env:
+          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
+        run: |
+          # Optional flags accumulate in one array, in this order:
+          # message trace, task trace, fail-fast.
+          EXTRA_ARGS=(--dd-trace "$BENCHMARK_DD_TRACE")
+
+          # A trace directory is only passed when message tracing is enabled.
+          case "$BENCHMARK_DD_TRACE" in
+            none) ;;
+            *) EXTRA_ARGS+=(--trace-dir "$BENCHMARK_OUTPUT_DIR/traces") ;;
+          esac
+
+          # Choice inputs arrive as strings, so match the literal "true".
+          case "$BENCHMARK_DD_TASK_TRACE" in
+            true) EXTRA_ARGS+=(--dd-task-trace --task-trace-dir "$BENCHMARK_OUTPUT_DIR/task-traces") ;;
+          esac
+
+          case "$BENCHMARK_FAIL_FAST" in
+            true) EXTRA_ARGS+=(--fail-fast) ;;
+          esac
+
+          uv run python tools/measurement/run_benchmarks.py \
+            "$BENCHMARK_SUITE" \
+            --output "$BENCHMARK_OUTPUT_DIR" \
+            --overwrite \
+            "${EXTRA_ARGS[@]}"
+
+      # Append a Markdown report to the job summary. This runs even after a
+      # failed benchmark step, so it must tolerate missing or partially
+      # written results without adding a second failure of its own.
+      - name: Add benchmark summary
+        if: always()
+        env:
+          BENCHMARK_COMMIT: ${{ steps.target.outputs.commit }}
+        run: |
+          python - <<'PY'
+          import json
+          import os
+          from pathlib import Path
+
+          output_dir = Path(os.environ["BENCHMARK_OUTPUT_DIR"])
+          summary_path = output_dir / "summary.json"
+          step_summary = Path(os.environ["GITHUB_STEP_SUMMARY"])
+
+          # The step env always defines BENCHMARK_COMMIT, but it is an empty
+          # string when the resolve step did not run, so fall back on
+          # emptiness rather than on the variable being absent.
+          commit = os.environ.get("BENCHMARK_COMMIT") or "unknown"
+
+          with step_summary.open("a", encoding="utf-8") as handle:
+              handle.write("# Anonymizer Benchmark\n\n")
+              handle.write(f"- Ref: `{os.environ['BENCHMARK_REF']}`\n")
+              handle.write(f"- Commit: `{commit}`\n")
+              handle.write(f"- Suite: `{os.environ['BENCHMARK_SUITE']}`\n")
+              handle.write(f"- Output: `{output_dir}`\n")
+              handle.write(f"- DD traces: `{os.environ['BENCHMARK_DD_TRACE']}`\n")
+              handle.write(f"- DD task traces: `{os.environ['BENCHMARK_DD_TASK_TRACE']}`\n\n")
+
+              if not summary_path.exists():
+                  handle.write("`summary.json` was not produced. Check job logs for setup or preflight failures.\n")
+                  raise SystemExit(0)
+
+              # A cancelled or crashed run can leave a truncated file behind.
+              # Report that instead of failing with a traceback; only the
+              # exception type is written so file contents never reach the
+              # summary. JSON and Unicode decode errors are ValueError subclasses.
+              try:
+                  summary = json.loads(summary_path.read_text(encoding="utf-8"))
+              except (OSError, ValueError) as exc:
+                  handle.write(
+                      f"`summary.json` could not be read ({type(exc).__name__}). "
+                      "Check job logs for benchmark failures.\n"
+                  )
+                  raise SystemExit(0)
+
+              if not isinstance(summary, dict):
+                  handle.write("`summary.json` does not contain a JSON object. Check job logs for benchmark failures.\n")
+                  raise SystemExit(0)
+
+              cases = summary.get("cases", [])
+              completed = sum(1 for case in cases if case.get("status") == "completed")
+              errors = sum(1 for case in cases if case.get("status") == "error")
+              handle.write(f"Ran {completed}/{len(cases)} case(s); errors={errors}.\n\n")
+              handle.write("| Case | Status | Elapsed | Attempts |\n")
+              handle.write("| --- | --- | ---: | ---: |\n")
+              for case in cases:
+                  elapsed = case.get("elapsed_sec")
+                  elapsed_text = "" if elapsed is None else f"{elapsed:.2f}s"
+                  handle.write(
+                      f"| `{case.get('case_id')}` | {case.get('status')} | {elapsed_text} | "
+                      f"{case.get('attempt_count', 0)} |\n"
+                  )
+          PY
+
+      # Upload whatever the run produced, even after a failure.
+      # The artifact name falls back to the run ID when the commit output is
+      # empty (for example when checkout failed), so the name never ends in a
+      # bare trailing dash.
+      # NOTE(review): when message tracing is enabled the run step writes
+      # trace sidecars under this directory, so they are uploaded too; confirm
+      # that artifact access and retention suit data that may hold raw text.
+      - name: Upload benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: anonymizer-benchmark-${{ steps.target.outputs.commit || github.run_id }}
+          path: ${{ env.BENCHMARK_OUTPUT_DIR }}/
+          if-no-files-found: warn
@@ -108,6 +108,7 @@ ai/tmp/
 
 # Anonymizer execution artifacts
 .anonymizer-artifacts/
+benchmark-results/
 docs/notebook_source/data/synth_bios_sample10_anonymized.csv
 
 # TLS certs and keys (if any)

@@ -0,0 +1,183 @@
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
+# Observability
+
+Anonymizer keeps local run measurement in the `anonymizer.measurement` package.
+Measurement hooks record timings, counts, model-call summaries, and safety
+metrics without changing anonymization behavior. Benchmark tools convert those
+records into tables for latency, reliability, model usage, and quality analysis.
+
+Measurement is separate from anonymous NVIDIA telemetry. Telemetry can report
+one product event per run or preview. Users can opt out as described in
+[Telemetry and Privacy](../index.md#telemetry-and-privacy). Measurement records
+are local artifacts. They are written only when developer tooling or caller code
+activates a measurement session.
+
+## Model
+
+Instrumentation is passive unless a `MeasurementCollector` is active in the
+current context:
+
+```python
+from anonymizer.measurement import MeasurementConfig, configured_measurement_session
+
+measurement = MeasurementConfig(output_path="benchmark-runs/case/measurements.jsonl")
+
+with configured_measurement_session(measurement):
+    result = anonymizer.run(config=config, data=data)
+```
+
+Instrumentation uses these entry points:
+
+- `stage_timer(...)` wraps pipeline phases and records elapsed time.
+- `record_run_metadata(...)` records config, input, model, and runtime metadata
+  once per run, without raw source values.
+- `record_record_metrics(...)` records per-row counts and safety metrics from
+  the trace DataFrame.
+- `record_ndd_workflow(...)` records DataDesigner workflow summaries at the
+  `NddAdapter` boundary.
+- `record_model_workflow(...)` records benchmark-only direct model calls that do
+  not use DataDesigner.
+
+The public API and CLI do not read measurement environment variables by default.
+Benchmark and developer tools opt into measurement explicitly.
+
+## Record Types
+
+Measurement output is JSONL by default. Each row has a `record_type` and shared
+run metadata.
+
+| Record type | Meaning |
+| --- | --- |
+| `run` | One anonymization call: mode, strategy, input shape, config metadata, model aliases, runtime metadata. |
+| `stage` | Pipeline phase timing, status, row counts, and row throughput. |
+| `record` | Per-input-row counts, text-length buckets, entity counts, ground-truth comparison metrics when present, replacement coverage, leakage flags, and estimated LLM calls. |
+| `ndd_workflow` | DataDesigner workflow summary: workflow name, model aliases, row counts, failures, elapsed time, usage summary, and throughput. |
+| `model_workflow` | Direct model workflow summary for benchmark-only paths outside DataDesigner. |
+| `dd_trace_coverage` | Trace coverage summary for DataDesigner columns when message tracing is enabled. |
+
+Use `tools/measurement/export_measurements.py` to convert raw measurement JSONL
+into Parquet, CSV, or JSONL tables.
+
+## Output and Sinks
+
+`MeasurementConfig` controls output:
+
+| Field | Purpose |
+| --- | --- |
+| `output_path` | Destination for measurement records. |
+| `output_format` | `jsonl` or `json`; defaults to `jsonl`. |
+| `record_level` | Include per-row `record` entries; defaults to `True`. |
+| `streaming` | Write JSONL records as they are emitted instead of collecting them in memory. |
+| `keep_records` | Keep emitted records in memory for caller access. |
+| `run_id` | Optional stable run ID. |
+| `run_tags` | Caller-supplied tags copied to every record. |
+| `fail_on_write_error` | Raise output write/close failures when the run body succeeded. |
+
+Streaming mode supports JSONL only. Use it for long benchmark suites where
+holding all measurement records in memory is unnecessary.
+
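+`MeasurementConfig.from_env()` can read `ANONYMIZER_MEASUREMENT_*` settings for
+developer tooling. Product entry points do not call it automatically. The table
+below maps each variable to its `MeasurementConfig` field; the trace-related
+fields (`dd_trace`, `dd_trace_path`, and `dd_task_trace_path`) are described in
+the trace sections that follow.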
+
+| Environment variable | Field |
+| --- | --- |
+| `ANONYMIZER_MEASUREMENT_OUTPUT_PATH` | `output_path` |
+| `ANONYMIZER_MEASUREMENT_OUTPUT_FORMAT` | `output_format` |
+| `ANONYMIZER_MEASUREMENT_RECORD_LEVEL` | `record_level` |
+| `ANONYMIZER_MEASUREMENT_STREAMING` | `streaming` |
+| `ANONYMIZER_MEASUREMENT_KEEP_RECORDS` | `keep_records` |
+| `ANONYMIZER_MEASUREMENT_DD_TRACE` | `dd_trace` |
+| `ANONYMIZER_MEASUREMENT_DD_TRACE_PATH` | `dd_trace_path` |
+| `ANONYMIZER_MEASUREMENT_DD_TASK_TRACE_PATH` | `dd_task_trace_path` |
+| `ANONYMIZER_MEASUREMENT_FAIL_ON_WRITE_ERROR` | `fail_on_write_error` |
+| `ANONYMIZER_MEASUREMENT_RUN_ID` | `run_id` |
+| `ANONYMIZER_MEASUREMENT_RUN_TAGS` | `run_tags` |
+
+## DataDesigner Message Traces
+
+DataDesigner message traces are optional sidecar artifacts for model-call
+debugging:
+
+```python
+measurement = MeasurementConfig(
+    output_path="benchmark-runs/case/measurements.jsonl",
+    dd_trace="last_message",
+    dd_trace_path="benchmark-runs/case/traces.jsonl",
+)
+```
+
+`last_message` stores the final prompt message for each traced DataDesigner
+model call. `all_messages` stores the full message list.
+
+Message traces are separate from measurement records. They may contain raw input
+text, prompts, generated output, entity values, replacement values, secrets, and
+PII. Do not share them unless they have been reviewed or redacted.
+
+Anonymizer requests standard LLM-column traces through DataDesigner native LLM
+column trace side effects. That covers `LLMTextColumnConfig` and
+`LLMStructuredColumnConfig`.
+
+Model-backed `CustomColumnConfig` generator functions use a temporary
+Anonymizer shim that instruments the per-run DataDesigner model registry and
+returned model facades. This captures model calls that DataDesigner does not yet
+expose through a public trace sink. Treat this as a brittle bridge over private
+DataDesigner internals, not as a stable integration point.
+
+When tracing is enabled, the measurement stream records a `dd_trace_coverage`
+row with native, private-facade, and unsupported column counts so benchmark
+analysis can see which trace path covered each workflow.
+
+## DataDesigner Task Traces
+
+Scheduler task traces are a separate sidecar:
+
+```python
+measurement = MeasurementConfig(
+    output_path="benchmark-runs/case/measurements.jsonl",
+    dd_task_trace_path="benchmark-runs/case/task-traces.jsonl",
+)
+```
+
+Task traces capture DataDesigner scheduler timing metadata: workflow, column,
+row group, row index, task type, status, relative dispatch/slot-acquired/
+completion offsets, queue wait time, execution time, total time, and whether an
+error was present. They do not store raw DataDesigner error strings because
+those strings can contain prompts, outputs, or source values.
+
+Offsets are relative to the earliest positive `dispatched_at` timestamp in the
+task-trace batch for that workflow. They make task overlap easier to inspect
+without persisting host-specific wall-clock timestamps.
+
+## Safety Rules
+
+Measurement records must not contain raw text, entity values, prompts, generated
+outputs, replacement maps, provider secrets, or API keys.
+
+Use counts, labels, lengths, buckets, model aliases, status flags, elapsed time,
+token counts, request counts, and run-scoped HMACs instead. The collector hashes
+record identity with a per-run key. Record hashes can join artifacts from one
+run, but they are not stable identifiers across unrelated runs unless the caller
+supplies the same hash key deliberately.
+
+When adding instrumentation:
+
+- Put timing around stable phase boundaries, not every helper call.
+- Record metadata at the boundary where the information is known.
+- Keep raw debug payloads in explicit sidecars, never in measurement records.
+- Prefer `run_tags` for external run context such as source refs, CI IDs,
+  topology labels, or experimental strategy. The benchmark runner owns
+  `suite_id`, `case_id`, `workload_id`, `config_id`, and `repetition`.
+- Keep benchmark-only strategy switches in `tools/measurement`, not product
+  defaults.
+
+## Key Files
+
+| File | Purpose |
+| --- | --- |
+| `src/anonymizer/measurement/` | Collector, config, context managers, safe record builders, and trace sidecar hooks. |
+| `src/anonymizer/interface/anonymizer.py` | Run-level and per-record measurement integration. |
+| `src/anonymizer/engine/ndd/adapter.py` | DataDesigner workflow measurement, native message trace capture, and scheduler task trace capture. |
+| `tools/measurement/run_benchmarks.py` | Benchmark suite runner that activates measurement sessions and writes per-case artifacts. |
+| `tools/measurement/README.md` | Detailed benchmark and analysis command reference. |
@@ -167,6 +167,8 @@ nav:
       - Choosing a Replacement Strategy: notebooks/03_choosing_a_replacement_strategy.ipynb
       - Rewriting Biographies: notebooks/04_rewriting_biographies.ipynb
       - Rewriting Legal Documents: notebooks/05_rewriting_legal_documents.ipynb
+  - Development:
+      - Observability: development/observability.md
   - API Reference: reference/
   - Developer Notes:
       - devnotes/index.md

@@ -9,6 +9,7 @@ license = "Apache-2.0"
 dependencies = [
     "data-designer==0.6.0",
     "pydantic>=2.9,<3",
+    "pydantic-settings>=2.12,<3",
     "cyclopts>=3",
     "pygments>=2.20.0",
     "cryptography>=46.0.6",

@@ -45,6 +45,7 @@
 COL_ENTITIES_BY_VALUE = "_entities_by_value"
 COL_REPLACED_TEXT = "__nemo_anonymizer_text_output__"
 COL_REPLACEMENT_MAP = "_replacement_map"
+COL_REPLACEMENT_MAP_SOURCE = "_replacement_map_source"
 
 # LlmReplaceWorkflow internal prompt-construction columns. Created by
 # `LlmReplaceWorkflow.generate_map_only` for the replacement-generator prompt