From fcff318ba131385f30574b835ed6217f16545af3 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Mon, 27 Apr 2026 16:27:38 -0400 Subject: [PATCH 1/8] feat(evaluation): add offline evaluation module with uv run evaluate CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement src/evaluation/ — consumes saved agent trajectories ({run_id}.json under AGENT_TRAJECTORY_DIR) and scenario files, joins them on scenario_id, runs a registered grader per scenario, and emits a JSON report combining grading results with operational metrics (tokens, duration p50/p95, tool calls, optional cost estimate). The shape follows SWE-bench / HELM / τ-bench conventions: agent run → evaluate → report.json, with offline re-grading from saved trajectories as a first-class workflow. Includes: - Pydantic models (Scenario, PersistedTrajectory, GradeResult, OpsMetrics, EvalReport) - Loader for trajectory dirs and JSON/JSONL scenario files - Grader registry with two deterministic graders (exact_string_match, numeric_match) and a pluggable LLM judge bound to LLMBackend (six-criterion rubric) - Per-task ops metric extraction (handles both SDK Trajectory and plan-execute list[StepResult] shapes) plus aggregate rollups - Report writer with terminal summary and JSON output - evaluate script registered in [project.scripts] - 39 unit tests covering models, loader, graders, metrics, report, and end-to-end runner — all passing alongside existing 270 tests Closes #279 Signed-off-by: Shuxin Lin --- pyproject.toml | 3 +- src/evaluation/__init__.py | 34 ++++++ src/evaluation/cli.py | 106 +++++++++++++++++ src/evaluation/graders/__init__.py | 36 ++++++ src/evaluation/graders/deterministic.py | 71 ++++++++++++ src/evaluation/graders/llm_judge.py | 144 ++++++++++++++++++++++++ src/evaluation/loader.py | 93 +++++++++++++++ src/evaluation/metrics.py | 125 ++++++++++++++++++++ src/evaluation/models.py | 110 ++++++++++++++++++ src/evaluation/report.py | 87 ++++++++++++++ src/evaluation/runner.py | 68 +++++++++++ src/evaluation/tests/__init__.py | 0 src/evaluation/tests/conftest.py | 72 ++++++++++++ src/evaluation/tests/test_graders.py | 120 ++++++++++++++++++++ src/evaluation/tests/test_loader.py | 72 ++++++++++++ src/evaluation/tests/test_metrics.py | 101 +++++++++++++++++ src/evaluation/tests/test_models.py | 45 ++++++++ src/evaluation/tests/test_report.py | 74 ++++++++++++ src/evaluation/tests/test_runner.py | 76 +++++++++++++ 19 files changed, 1436 insertions(+), 1 deletion(-) create mode 100644 src/evaluation/cli.py create mode 100644 src/evaluation/graders/__init__.py create mode 100644 src/evaluation/graders/deterministic.py create mode 100644 src/evaluation/graders/llm_judge.py create mode 100644 src/evaluation/loader.py create mode 100644 src/evaluation/metrics.py create mode 100644 src/evaluation/models.py create mode 100644 src/evaluation/report.py create mode 100644 src/evaluation/runner.py create mode 100644 src/evaluation/tests/__init__.py create mode 100644 src/evaluation/tests/conftest.py create mode 100644 src/evaluation/tests/test_graders.py create mode 100644 src/evaluation/tests/test_loader.py create mode 100644 src/evaluation/tests/test_metrics.py create mode 100644 src/evaluation/tests/test_models.py create mode 100644 src/evaluation/tests/test_report.py create mode 100644 src/evaluation/tests/test_runner.py diff --git a/pyproject.toml b/pyproject.toml index 95d794c90..1d928f9e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["hatchling"] 
build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/agent", "src/llm", "src/observability"] +packages = ["src/agent", "src/evaluation", "src/llm", "src/observability"] [project] name = "assetopsbench-mcp" @@ -42,6 +42,7 @@ wo-mcp-server = "servers.wo.main:main" vibration-mcp-server = "servers.vibration.main:main" openai-agent = "agent.openai_agent.cli:main" deep-agent = "agent.deep_agent.cli:main" +evaluate = "evaluation.cli:main" [dependency-groups] diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index e69de29bb..ca632ab1b 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -0,0 +1,34 @@ +"""Offline evaluation harness for AssetOpsBench agent runs. + +Consumes saved trajectory files (written by +:func:`observability.persistence.persist_trajectory`) and scenario files +(under ``src/scenarios/``) and emits a structured JSON report combining +graded outcomes with operational metrics. + +The shape mirrors conventions from SWE-bench, HELM, and τ-bench: +``run`` (executes the agent — already exists) → ``evaluate`` (this +module) → ``report.json``. Re-grading from saved trajectories is +first-class. +""" + +from .models import ( + AggregateOps, + EvalReport, + GradeResult, + OpsMetrics, + PersistedTrajectory, + Scenario, + ScenarioResult, + TypeBreakdown, +) + +__all__ = [ + "AggregateOps", + "EvalReport", + "GradeResult", + "OpsMetrics", + "PersistedTrajectory", + "Scenario", + "ScenarioResult", + "TypeBreakdown", +] diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py new file mode 100644 index 000000000..806452899 --- /dev/null +++ b/src/evaluation/cli.py @@ -0,0 +1,106 @@ +"""``uv run evaluate`` — offline grading + report generation.""" + +from __future__ import annotations + +import argparse +import logging +import sys +from pathlib import Path + +from . import graders as grader_registry +from .report import render_summary, write_report +from .runner import evaluate + + +def _build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="evaluate", + description=( + "Grade saved agent trajectories against scenario files and " + "emit a JSON report." + ), + ) + p.add_argument( + "--trajectories", + type=Path, + required=True, + help="Directory of {run_id}.json trajectory files (or a single file).", + ) + p.add_argument( + "--scenarios", + type=Path, + nargs="+", + required=True, + help="One or more scenario JSON / JSONL files.", + ) + p.add_argument( + "--output", + type=Path, + required=True, + help="Path to write the JSON report.", + ) + p.add_argument( + "--grader-default", + default="llm_judge", + help="Grader name when scenario.grading_method is unset. " + "Default: llm_judge.", + ) + p.add_argument( + "--judge-model", + default=None, + help="Model id for the LLM judge (e.g. " + "litellm_proxy/anthropic/claude-opus-4-5). " + "Required when any scenario routes to llm_judge.", + ) + p.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable INFO-level logging.", + ) + return p + + +def _maybe_install_judge(judge_model: str | None) -> None: + if not judge_model: + return + # Imported lazily so the CLI works for deterministic-only runs even + # if the LiteLLM dep happens to be flaky in the dev environment. 
+ from llm import LiteLLMBackend # type: ignore[import-not-found] + + from .graders.llm_judge import install + + install(LiteLLMBackend(model=judge_model)) + + +def _validate_grader_default(name: str) -> None: + try: + grader_registry.get(name) + except KeyError as exc: + raise SystemExit(str(exc)) + + +def main(argv: list[str] | None = None) -> int: + args = _build_parser().parse_args(argv) + logging.basicConfig( + level=logging.INFO if args.verbose else logging.WARNING, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + + _maybe_install_judge(args.judge_model) + _validate_grader_default(args.grader_default) + + report = evaluate( + trajectories_path=args.trajectories, + scenarios_paths=list(args.scenarios), + default_grading_method=args.grader_default, + ) + + out = write_report(report, args.output) + print(render_summary(report)) + print(f"\nReport written: {out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/evaluation/graders/__init__.py b/src/evaluation/graders/__init__.py new file mode 100644 index 000000000..f58a074e0 --- /dev/null +++ b/src/evaluation/graders/__init__.py @@ -0,0 +1,36 @@ +"""Pluggable grader registry. + +Each grader is a callable taking ``(scenario, answer, trajectory_text)`` +and returning a :class:`~evaluation.models.GradeResult`. Registration +happens via :func:`register`; the CLI looks up graders by name from +``scenario.grading_method`` (falling back to a CLI-supplied default). +""" + +from __future__ import annotations + +from typing import Callable + +from ..models import GradeResult, Scenario + +Grader = Callable[[Scenario, str, str], GradeResult] + +_REGISTRY: dict[str, Grader] = {} + + +def register(name: str, grader: Grader) -> None: + _REGISTRY[name] = grader + + +def get(name: str) -> Grader: + if name not in _REGISTRY: + raise KeyError( + f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}" + ) + return _REGISTRY[name] + + +def names() -> list[str]: + return sorted(_REGISTRY) + + +from . import deterministic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/graders/deterministic.py b/src/evaluation/graders/deterministic.py new file mode 100644 index 000000000..35db1c299 --- /dev/null +++ b/src/evaluation/graders/deterministic.py @@ -0,0 +1,71 @@ +"""Pure deterministic graders — no LLM, no network.""" + +from __future__ import annotations + +import math + +from ..models import GradeResult, Scenario +from . 
import register + + +def exact_string_match( + scenario: Scenario, answer: str, trajectory_text: str +) -> GradeResult: + expected = scenario.expected_answer + if expected is None: + return GradeResult( + grading_method="exact_string_match", + passed=False, + score=0.0, + rationale="scenario has no expected_answer", + ) + + a = str(answer).strip().lower() + e = str(expected).strip().lower() + passed = a == e + return GradeResult( + grading_method="exact_string_match", + passed=passed, + score=1.0 if passed else 0.0, + rationale="" if passed else f"expected {expected!r}, got {answer!r}", + details={"expected": expected, "actual": answer}, + ) + + +def numeric_match( + scenario: Scenario, answer: str, trajectory_text: str +) -> GradeResult: + expected_raw = scenario.expected_answer + extra = scenario.model_extra or {} + tolerance = float(extra.get("tolerance", 1e-6)) + + if expected_raw is None: + return GradeResult( + grading_method="numeric_match", + passed=False, + rationale="scenario has no expected_answer", + ) + + try: + a = float(answer) + e = float(expected_raw) + except (TypeError, ValueError) as err: + return GradeResult( + grading_method="numeric_match", + passed=False, + rationale=f"could not parse numbers: {err}", + details={"expected": expected_raw, "actual": answer}, + ) + + passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) + return GradeResult( + grading_method="numeric_match", + passed=passed, + score=1.0 if passed else 0.0, + rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", + details={"expected": e, "actual": a, "tolerance": tolerance}, + ) + + +register("exact_string_match", exact_string_match) +register("numeric_match", numeric_match) diff --git a/src/evaluation/graders/llm_judge.py b/src/evaluation/graders/llm_judge.py new file mode 100644 index 000000000..fb55bf73f --- /dev/null +++ b/src/evaluation/graders/llm_judge.py @@ -0,0 +1,144 @@ +"""LLM-judge grader. + +Free-form answers are scored against ``scenario.characteristic_form`` +using a six-criterion rubric (task completion, data retrieval accuracy, +result verification, agent sequence, clarity, hallucinations) — the +same shape as ``aobench/scenario-server/grading/graders.evaluation_agent`` +but built directly on :class:`~llm.LLMBackend` so the evaluation module +has no dependency on the scenario-server codebase. +""" + +from __future__ import annotations + +import json +import logging +import re + +from llm import LLMBackend + +from ..models import GradeResult, Scenario +from . import register + +_log = logging.getLogger(__name__) + +_RUBRIC_KEYS = ( + "task_completion", + "data_retrieval_accuracy", + "generalized_result_verification", + "agent_sequence_correct", + "clarity_and_justification", + "hallucinations", +) + +_PROMPT_TEMPLATE = """You are an evaluation judge for an industrial-asset-operations agent. + +Score the agent response against the expected characteristic answer using the six criteria below. Respond ONLY with a JSON object, no prose. 
+
+QUESTION:
+{question}
+
+EXPECTED CHARACTERISTIC:
+{characteristic}
+
+AGENT RESPONSE:
+{answer}
+
+AGENT TRAJECTORY (turns / tool calls / outputs):
+{trajectory}
+
+Return JSON with these boolean fields plus a one-sentence reason:
+
+{{
+  "task_completion": <true|false>,
+  "data_retrieval_accuracy": <true|false>,
+  "generalized_result_verification": <true|false>,
+  "agent_sequence_correct": <true|false>,
+  "clarity_and_justification": <true|false>,
+  "hallucinations": <true|false>,
+  "reason": "<one short sentence>"
+}}
+
+The agent passes overall iff the first five are true AND hallucinations is false."""
+
+
+class LLMJudgeGrader:
+    """Closure-style grader that holds an :class:`LLMBackend`."""
+
+    def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None:
+        self._llm = llm
+        self.name = name
+
+    def __call__(
+        self, scenario: Scenario, answer: str, trajectory_text: str
+    ) -> GradeResult:
+        characteristic = scenario.characteristic_form or scenario.expected_answer or ""
+        if not characteristic:
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale="scenario has neither characteristic_form nor expected_answer",
+            )
+
+        prompt = _PROMPT_TEMPLATE.format(
+            question=scenario.text,
+            characteristic=characteristic,
+            answer=answer,
+            trajectory=trajectory_text[:8000],
+        )
+
+        try:
+            raw = self._llm.generate(prompt)
+        except Exception as exc: # judge call failure is a grading failure, not a crash
+            _log.exception("llm_judge: backend error")
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale=f"judge backend error: {exc}",
+            )
+
+        review = _parse_review(raw)
+        if review is None:
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale="judge returned unparseable JSON",
+                details={"raw": raw[:2000]},
+            )
+
+        passed = (
+            review.get("task_completion") is True
+            and review.get("data_retrieval_accuracy") is True
+            and review.get("generalized_result_verification") is True
+            and review.get("agent_sequence_correct") is True
+            and review.get("clarity_and_justification") is True
+            and review.get("hallucinations") is False
+        )
+        score = sum(1 for k in _RUBRIC_KEYS[:5] if review.get(k) is True) / 5.0
+        if review.get("hallucinations") is True:
+            score = max(0.0, score - 0.2)
+
+        return GradeResult(
+            grading_method=self.name,
+            passed=passed,
+            score=round(score, 3),
+            rationale=str(review.get("reason", ""))[:500],
+            details=review,
+        )
+
+
+def _parse_review(raw: str) -> dict | None:
+    if not raw:
+        return None
+    # Tolerate leading prose / markdown fences by extracting the first {...} block.
+    match = re.search(r"\{.*\}", raw, re.DOTALL)
+    if not match:
+        return None
+    try:
+        return json.loads(match.group(0))
+    except json.JSONDecodeError:
+        return None
+
+
+def install(llm: LLMBackend, name: str = "llm_judge") -> None:
+    """Register an LLM-judge grader bound to ``llm`` under ``name``."""
+    register(name, LLMJudgeGrader(llm, name=name))
diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py
new file mode 100644
index 000000000..31b9c761b
--- /dev/null
+++ b/src/evaluation/loader.py
@@ -0,0 +1,93 @@
+"""Load trajectories and scenarios, then join them by ``scenario_id``."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Iterable, Iterator
+
+from .models import PersistedTrajectory, Scenario
+
+_log = logging.getLogger(__name__)
+
+
+def load_trajectories(path: Path) -> list[PersistedTrajectory]:
+    """Load every ``*.json`` trajectory under ``path``.
+ + ``path`` may be a directory (the ``AGENT_TRAJECTORY_DIR`` layout) or + a single JSON file. Files that fail to parse are logged and + skipped — a partial directory should still yield a usable batch. + """ + p = Path(path) + if p.is_file(): + return [_load_one(p)] if p.suffix == ".json" else [] + + out: list[PersistedTrajectory] = [] + for child in sorted(p.glob("*.json")): + try: + out.append(_load_one(child)) + except Exception: + _log.exception("loader: failed to parse %s", child) + return out + + +def _load_one(path: Path) -> PersistedTrajectory: + raw = json.loads(path.read_text(encoding="utf-8")) + return PersistedTrajectory.from_raw(raw) + + +def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]: + """Load scenarios from one or more files. + + Each file may be a JSON list, a single JSON object, or JSONL. + Scenario IDs are coerced to strings to make the join key uniform + (CouchDB-style trajectories use string IDs; local JSON files use + ints). + """ + if isinstance(paths, (str, Path)): + paths = [Path(paths)] + + out: list[Scenario] = [] + for p in paths: + out.extend(_load_scenario_file(Path(p))) + return out + + +def _load_scenario_file(path: Path) -> list[Scenario]: + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + + if path.suffix == ".jsonl": + return [ + Scenario.from_raw(json.loads(line)) + for line in text.splitlines() + if line.strip() + ] + + raw = json.loads(text) + if isinstance(raw, list): + return [Scenario.from_raw(item) for item in raw] + if isinstance(raw, dict): + return [Scenario.from_raw(raw)] + raise ValueError(f"unexpected scenario JSON shape in {path}: {type(raw).__name__}") + + +def join_records( + scenarios: list[Scenario], + trajectories: list[PersistedTrajectory], +) -> Iterator[tuple[Scenario, PersistedTrajectory]]: + """Yield (scenario, trajectory) pairs joined on ``scenario_id``. + + Scenarios with no matching trajectory and trajectories with no + matching scenario are silently dropped — the caller can compute the + diff from the input lists if reporting is needed. + """ + by_id: dict[str, Scenario] = {s.id: s for s in scenarios} + for traj in trajectories: + if traj.scenario_id is None: + continue + scenario = by_id.get(traj.scenario_id) + if scenario is not None: + yield scenario, traj diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py new file mode 100644 index 000000000..325074a7e --- /dev/null +++ b/src/evaluation/metrics.py @@ -0,0 +1,125 @@ +"""Operational metric extraction and aggregation.""" + +from __future__ import annotations + +import statistics +from typing import Any + +from .models import AggregateOps, OpsMetrics, PersistedTrajectory, ScenarioResult + +# USD per 1M tokens, rough public list-prices. None when unknown. Used +# only for the optional ``est_cost_usd`` rollup; consumers should treat +# it as an estimate, not a billing source of truth. 
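+# A worked example with illustrative token counts: 10,000 input and
+# 2,000 output tokens on claude-sonnet-4-6 at (3.0, 15.0) USD/1M gives
+# (10_000 * 3.0 + 2_000 * 15.0) / 1_000_000 = $0.06.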
+_PRICE_PER_1M: dict[str, tuple[float, float]] = { + "claude-opus-4-5": (15.0, 75.0), + "claude-opus-4-1": (15.0, 75.0), + "claude-sonnet-4-6": (3.0, 15.0), + "claude-haiku-4-5": (1.0, 5.0), + "gpt-5": (10.0, 30.0), + "gpt-4.1": (3.0, 12.0), + "gpt-4o": (2.5, 10.0), + "llama-4-maverick": (0.27, 0.85), +} + + +def metrics_from_trajectory(record: PersistedTrajectory) -> OpsMetrics: + """Extract per-task ops metrics from a persisted trajectory record.""" + traj = record.trajectory + if traj is None: + return OpsMetrics() + + if isinstance(traj, dict) and "turns" in traj: + return _from_sdk_trajectory(traj, record.model) + if isinstance(traj, list): + return _from_plan_execute(traj, record.model) + return OpsMetrics() + + +def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics: + turns = traj.get("turns", []) or [] + tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns) + tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns) + + durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None] + duration_ms = sum(durations_ms) if durations_ms else None + + tool_names: list[str] = [] + for t in turns: + for tc in t.get("tool_calls") or []: + name = tc.get("name") + if name: + tool_names.append(name) + + return OpsMetrics( + turn_count=len(turns), + tool_call_count=len(tool_names), + unique_tools=sorted(set(tool_names)), + tokens_in=tokens_in, + tokens_out=tokens_out, + duration_ms=duration_ms, + est_cost_usd=_estimate_cost(model, tokens_in, tokens_out), + ) + + +def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics: + # plan-execute persists ``list[StepResult]``; the dataclass exposes + # ``server`` / ``tool`` / ``response`` fields but no per-step token + # counts, so we surface what is available and leave the rest at zero. + tool_names = [ + s.get("tool") + for s in steps + if isinstance(s, dict) and s.get("tool") + ] + return OpsMetrics( + turn_count=len(steps), + tool_call_count=len(tool_names), + unique_tools=sorted(set(tool_names)), + est_cost_usd=_estimate_cost(model, 0, 0), + ) + + +def _estimate_cost(model: str, tokens_in: int, tokens_out: int) -> float | None: + if not model or (tokens_in == 0 and tokens_out == 0): + return None + key = _normalize_model(model) + rate = _PRICE_PER_1M.get(key) + if rate is None: + return None + in_rate, out_rate = rate + return round((tokens_in * in_rate + tokens_out * out_rate) / 1_000_000, 6) + + +def _normalize_model(model: str) -> str: + # Strip provider prefixes like ``litellm_proxy/anthropic/`` and + # version suffixes like ``-20250101``. 
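+    # e.g. "litellm_proxy/anthropic/claude-opus-4-5-20250101"
+    #   -> "claude-opus-4-5", matching the _PRICE_PER_1M keys above.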
+ tail = model.rsplit("/", 1)[-1].lower() + parts = tail.split("-") + if parts and parts[-1].isdigit() and len(parts[-1]) >= 6: + parts = parts[:-1] + return "-".join(parts) + + +def aggregate_ops(results: list[ScenarioResult]) -> AggregateOps: + if not results: + return AggregateOps() + + durations = [r.ops.duration_ms for r in results if r.ops.duration_ms is not None] + costs = [r.ops.est_cost_usd for r in results if r.ops.est_cost_usd is not None] + + return AggregateOps( + tokens_in_total=sum(r.ops.tokens_in for r in results), + tokens_out_total=sum(r.ops.tokens_out for r in results), + duration_ms_p50=_percentile(durations, 50), + duration_ms_p95=_percentile(durations, 95), + tool_calls_total=sum(r.ops.tool_call_count for r in results), + est_cost_usd_total=round(sum(costs), 6) if costs else None, + ) + + +def _percentile(values: list[float], pct: float) -> float | None: + if not values: + return None + if len(values) == 1: + return float(values[0]) + quantiles = statistics.quantiles(values, n=100, method="inclusive") + return float(quantiles[int(pct) - 1]) diff --git a/src/evaluation/models.py b/src/evaluation/models.py new file mode 100644 index 000000000..846e98911 --- /dev/null +++ b/src/evaluation/models.py @@ -0,0 +1,110 @@ +"""Pydantic models for the offline evaluation pipeline.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + + +class Scenario(BaseModel): + """One evaluation scenario. + + Mirrors the on-disk shape under ``src/scenarios/`` and is permissive + via ``extra='allow'`` so domain-specific fields (e.g. category, + characteristic_form) survive the round-trip. + """ + + model_config = ConfigDict(extra="allow") + + id: str + text: str + type: str = "" + category: str = "" + characteristic_form: str | None = None + expected_answer: str | None = None + grading_method: str | None = None + + @classmethod + def from_raw(cls, raw: dict) -> "Scenario": + d = dict(raw) + if "id" in d: + d["id"] = str(d["id"]) + return cls.model_validate(d) + + +class PersistedTrajectory(BaseModel): + """Record written by ``observability.persistence.persist_trajectory``.""" + + model_config = ConfigDict(extra="allow") + + run_id: str + scenario_id: str | None = None + runner: str + model: str + question: str + answer: str + trajectory: Any = None + + @classmethod + def from_raw(cls, raw: dict) -> "PersistedTrajectory": + d = dict(raw) + if d.get("scenario_id") is not None: + d["scenario_id"] = str(d["scenario_id"]) + return cls.model_validate(d) + + +class OpsMetrics(BaseModel): + """Per-task operational metrics derived from a trajectory.""" + + turn_count: int = 0 + tool_call_count: int = 0 + unique_tools: list[str] = Field(default_factory=list) + tokens_in: int = 0 + tokens_out: int = 0 + duration_ms: float | None = None + est_cost_usd: float | None = None + + +class GradeResult(BaseModel): + grading_method: str + passed: bool + score: float = 0.0 + rationale: str = "" + details: dict[str, Any] = Field(default_factory=dict) + + +class ScenarioResult(BaseModel): + scenario_id: str + scenario_type: str = "" + runner: str + model: str + question: str + answer: str + grade: GradeResult + ops: OpsMetrics + + +class AggregateOps(BaseModel): + tokens_in_total: int = 0 + tokens_out_total: int = 0 + duration_ms_p50: float | None = None + duration_ms_p95: float | None = None + tool_calls_total: int = 0 + est_cost_usd_total: float | None = None + + +class TypeBreakdown(BaseModel): + total: int = 0 + passed: int = 0 + pass_rate: float 
= 0.0 + + +class EvalReport(BaseModel): + generated_at: str + runners: list[str] = Field(default_factory=list) + models: list[str] = Field(default_factory=list) + totals: dict[str, Any] = Field(default_factory=dict) + by_scenario_type: dict[str, TypeBreakdown] = Field(default_factory=dict) + ops: AggregateOps = Field(default_factory=AggregateOps) + results: list[ScenarioResult] = Field(default_factory=list) diff --git a/src/evaluation/report.py b/src/evaluation/report.py new file mode 100644 index 000000000..72ff9b0e2 --- /dev/null +++ b/src/evaluation/report.py @@ -0,0 +1,87 @@ +"""Build an :class:`EvalReport` from graded scenario results.""" + +from __future__ import annotations + +import datetime as _dt +import json +from collections import defaultdict +from pathlib import Path + +from .metrics import aggregate_ops +from .models import EvalReport, ScenarioResult, TypeBreakdown + + +def build_report(results: list[ScenarioResult]) -> EvalReport: + total = len(results) + passed = sum(1 for r in results if r.grade.passed) + + by_type: dict[str, list[ScenarioResult]] = defaultdict(list) + for r in results: + by_type[r.scenario_type or "unknown"].append(r) + + breakdown: dict[str, TypeBreakdown] = {} + for stype, items in by_type.items(): + n = len(items) + p = sum(1 for r in items if r.grade.passed) + breakdown[stype] = TypeBreakdown( + total=n, + passed=p, + pass_rate=round(p / n, 4) if n else 0.0, + ) + + return EvalReport( + generated_at=_dt.datetime.now(_dt.timezone.utc).isoformat(), + runners=sorted({r.runner for r in results}), + models=sorted({r.model for r in results}), + totals={ + "scenarios": total, + "graded": total, + "passed": passed, + "pass_rate": round(passed / total, 4) if total else 0.0, + }, + by_scenario_type=breakdown, + ops=aggregate_ops(results), + results=results, + ) + + +def write_report(report: EvalReport, output: Path) -> Path: + output = Path(output) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(report.model_dump_json(indent=2), encoding="utf-8") + return output + + +def render_summary(report: EvalReport) -> str: + lines: list[str] = [] + t = report.totals + lines.append( + f"Scenarios: {t.get('scenarios', 0)} " + f"Passed: {t.get('passed', 0)} " + f"Pass rate: {t.get('pass_rate', 0):.1%}" + ) + if report.by_scenario_type: + lines.append("") + lines.append("By scenario type:") + for stype, b in sorted(report.by_scenario_type.items()): + lines.append( + f" {stype:<16} {b.passed:>4}/{b.total:<4} ({b.pass_rate:.1%})" + ) + o = report.ops + lines.append("") + lines.append("Operational metrics:") + lines.append(f" tokens_in_total: {o.tokens_in_total}") + lines.append(f" tokens_out_total: {o.tokens_out_total}") + lines.append(f" tool_calls_total: {o.tool_calls_total}") + if o.duration_ms_p50 is not None: + lines.append(f" duration_ms_p50: {o.duration_ms_p50:.1f}") + if o.duration_ms_p95 is not None: + lines.append(f" duration_ms_p95: {o.duration_ms_p95:.1f}") + if o.est_cost_usd_total is not None: + lines.append(f" est_cost_usd: ${o.est_cost_usd_total:.4f}") + return "\n".join(lines) + + +def report_to_json(report: EvalReport) -> str: + """Convenience JSON dump that round-trips through pydantic.""" + return json.dumps(json.loads(report.model_dump_json()), indent=2) diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py new file mode 100644 index 000000000..f87da2fa6 --- /dev/null +++ b/src/evaluation/runner.py @@ -0,0 +1,68 @@ +"""Glue: load → grade → assemble report.""" + +from __future__ import annotations + +import json +import 
logging +from pathlib import Path + +from . import graders as grader_registry +from .loader import join_records, load_scenarios, load_trajectories +from .metrics import metrics_from_trajectory +from .models import EvalReport, PersistedTrajectory, Scenario, ScenarioResult +from .report import build_report + +_log = logging.getLogger(__name__) + + +def evaluate( + *, + trajectories_path: Path, + scenarios_paths: list[Path], + default_grading_method: str = "llm_judge", +) -> EvalReport: + """Load, grade, and aggregate. + + Per-scenario grader is picked from ``scenario.grading_method`` when + set, falling back to ``default_grading_method``. + """ + scenarios = load_scenarios(scenarios_paths) + trajectories = load_trajectories(trajectories_path) + + results: list[ScenarioResult] = [] + for scenario, traj in join_records(scenarios, trajectories): + results.append(_grade_one(scenario, traj, default_grading_method)) + + return build_report(results) + + +def _grade_one( + scenario: Scenario, + traj: PersistedTrajectory, + default_grading_method: str, +) -> ScenarioResult: + method = scenario.grading_method or default_grading_method + grader = grader_registry.get(method) + trajectory_text = _trajectory_to_text(traj) + grade = grader(scenario, traj.answer, trajectory_text) + + return ScenarioResult( + scenario_id=scenario.id, + scenario_type=scenario.type, + runner=traj.runner, + model=traj.model, + question=traj.question, + answer=traj.answer, + grade=grade, + ops=metrics_from_trajectory(traj), + ) + + +def _trajectory_to_text(traj: PersistedTrajectory) -> str: + """Flatten a trajectory to a text blob for the LLM judge prompt.""" + if traj.trajectory is None: + return "" + try: + return json.dumps(traj.trajectory, indent=2, default=str) + except (TypeError, ValueError): + return str(traj.trajectory) diff --git a/src/evaluation/tests/__init__.py b/src/evaluation/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/evaluation/tests/conftest.py b/src/evaluation/tests/conftest.py new file mode 100644 index 000000000..65eedf7d9 --- /dev/null +++ b/src/evaluation/tests/conftest.py @@ -0,0 +1,72 @@ +"""Shared fixtures for evaluation unit tests.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from evaluation.models import Scenario + + +@pytest.fixture +def make_scenario(): + def _factory(**overrides) -> Scenario: + defaults = { + "id": "1", + "text": "What sensors are on Chiller 6?", + "type": "iot", + "category": "Knowledge Query", + "characteristic_form": "Should list temperature, pressure, vibration sensors.", + } + defaults.update(overrides) + return Scenario.from_raw(defaults) + + return _factory + + +@pytest.fixture +def make_persisted_record(): + def _factory(**overrides) -> dict: + defaults = { + "run_id": "run-1", + "scenario_id": "1", + "runner": "plan-execute", + "model": "watsonx/ibm/granite", + "question": "Q?", + "answer": "A.", + "trajectory": { + "turns": [ + { + "index": 0, + "text": "thinking", + "tool_calls": [{"name": "sites", "input": {}}], + "input_tokens": 10, + "output_tokens": 5, + "duration_ms": 100.0, + }, + { + "index": 1, + "text": "answer", + "tool_calls": [], + "input_tokens": 12, + "output_tokens": 7, + "duration_ms": 200.0, + }, + ], + "started_at": "2026-04-27T00:00:00Z", + }, + } + defaults.update(overrides) + return defaults + + return _factory + + +@pytest.fixture +def trajectory_dir(tmp_path: Path, make_persisted_record): + """A directory pre-populated with one trajectory JSON file.""" + 
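+    # Mirrors the AGENT_TRAJECTORY_DIR layout: one {run_id}.json per run.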
rec = make_persisted_record() + (tmp_path / f"{rec['run_id']}.json").write_text(json.dumps(rec), encoding="utf-8") + return tmp_path diff --git a/src/evaluation/tests/test_graders.py b/src/evaluation/tests/test_graders.py new file mode 100644 index 000000000..9002e5f95 --- /dev/null +++ b/src/evaluation/tests/test_graders.py @@ -0,0 +1,120 @@ +"""Tests for deterministic + LLM-judge graders.""" + +from __future__ import annotations + +from evaluation import graders as registry +from evaluation.graders.deterministic import exact_string_match, numeric_match +from evaluation.graders.llm_judge import LLMJudgeGrader, install +from llm import LLMBackend + + +class _StubLLM(LLMBackend): + def __init__(self, response: str) -> None: + self._response = response + + def generate(self, prompt: str, temperature: float = 0.0) -> str: + return self._response + + +class TestExactStringMatch: + def test_match_case_insensitive(self, make_scenario): + s = make_scenario(expected_answer="Hello World") + r = exact_string_match(s, "hello world", "") + assert r.passed and r.score == 1.0 + + def test_mismatch(self, make_scenario): + s = make_scenario(expected_answer="foo") + r = exact_string_match(s, "bar", "") + assert not r.passed + assert r.details["expected"] == "foo" + + def test_missing_expected(self, make_scenario): + s = make_scenario(expected_answer=None) + r = exact_string_match(s, "anything", "") + assert not r.passed + assert "expected_answer" in r.rationale + + +class TestNumericMatch: + def test_within_tolerance(self, make_scenario): + s = make_scenario(expected_answer="3.14159") + r = numeric_match(s, "3.141591", "") + assert r.passed + + def test_unparseable(self, make_scenario): + s = make_scenario(expected_answer="3.14") + r = numeric_match(s, "not a number", "") + assert not r.passed + assert "could not parse" in r.rationale + + def test_custom_tolerance(self, make_scenario): + s = make_scenario(expected_answer="100", tolerance=0.05) + r = numeric_match(s, "104", "") + assert r.passed + + +class TestRegistry: + def test_deterministic_graders_registered(self): + assert "exact_string_match" in registry.names() + assert "numeric_match" in registry.names() + + def test_get_unknown_raises(self): + try: + registry.get("does_not_exist") + except KeyError as e: + assert "does_not_exist" in str(e) + else: + raise AssertionError("expected KeyError") + + +class TestLLMJudgeGrader: + def _all_pass_response(self) -> str: + return ( + '{"task_completion": true, "data_retrieval_accuracy": true, ' + '"generalized_result_verification": true, "agent_sequence_correct": true, ' + '"clarity_and_justification": true, "hallucinations": false, ' + '"reason": "Looks good."}' + ) + + def test_passes_when_all_criteria_true(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + r = grader(make_scenario(), "answer", "trajectory") + assert r.passed + assert r.score == 1.0 + assert r.rationale == "Looks good." + + def test_fails_on_hallucination(self, make_scenario): + resp = self._all_pass_response().replace( + '"hallucinations": false', '"hallucinations": true' + ) + grader = LLMJudgeGrader(_StubLLM(resp)) + r = grader(make_scenario(), "answer", "trajectory") + assert not r.passed + # Score is penalized but not zeroed when 5/5 criteria pass. 
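+        # Here 5/5 criteria pass (1.0) and the 0.2 hallucination
+        # penalty brings the score down to 0.8.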
+ assert r.score < 1.0 + + def test_handles_unparseable_response(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM("not json at all")) + r = grader(make_scenario(), "a", "t") + assert not r.passed + assert "unparseable" in r.rationale + + def test_handles_markdown_fenced_response(self, make_scenario): + wrapped = "Here you go:\n```json\n" + self._all_pass_response() + "\n```" + grader = LLMJudgeGrader(_StubLLM(wrapped)) + r = grader(make_scenario(), "a", "t") + assert r.passed + + def test_missing_characteristic_short_circuits(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + s = make_scenario(characteristic_form=None, expected_answer=None) + r = grader(s, "a", "t") + assert not r.passed + assert "characteristic_form" in r.rationale + + def test_install_registers_under_default_name(self, make_scenario): + install(_StubLLM(self._all_pass_response())) + assert "llm_judge" in registry.names() + grader = registry.get("llm_judge") + r = grader(make_scenario(), "a", "t") + assert r.passed diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py new file mode 100644 index 000000000..24260b34b --- /dev/null +++ b/src/evaluation/tests/test_loader.py @@ -0,0 +1,72 @@ +"""Tests for the trajectory + scenario loader.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.loader import ( + join_records, + load_scenarios, + load_trajectories, +) +from evaluation.models import Scenario + + +def test_load_trajectories_from_dir(trajectory_dir: Path): + records = load_trajectories(trajectory_dir) + assert len(records) == 1 + assert records[0].run_id == "run-1" + assert records[0].scenario_id == "1" + + +def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_record): + (tmp_path / "good.json").write_text(json.dumps(make_persisted_record()), encoding="utf-8") + (tmp_path / "bad.json").write_text("{not json", encoding="utf-8") + records = load_trajectories(tmp_path) + assert len(records) == 1 + + +def test_load_scenarios_json_list(tmp_path: Path): + p = tmp_path / "s.json" + p.write_text( + json.dumps( + [{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}] + ), + encoding="utf-8", + ) + out = load_scenarios(p) + assert [s.id for s in out] == ["1", "2"] + + +def test_load_scenarios_jsonl(tmp_path: Path): + p = tmp_path / "s.jsonl" + p.write_text( + '{"id": 1, "text": "Q1"}\n{"id": 2, "text": "Q2"}\n', + encoding="utf-8", + ) + out = load_scenarios(p) + assert [s.id for s in out] == ["1", "2"] + + +def test_load_scenarios_single_object(tmp_path: Path): + p = tmp_path / "s.json" + p.write_text(json.dumps({"id": 7, "text": "Q"}), encoding="utf-8") + out = load_scenarios(p) + assert [s.id for s in out] == ["7"] + + +def test_join_drops_orphans(make_persisted_record): + from evaluation.models import PersistedTrajectory + + scenarios = [ + Scenario.from_raw({"id": 1, "text": "Q1"}), + Scenario.from_raw({"id": 2, "text": "Q2"}), + ] + trajs = [ + PersistedTrajectory.from_raw(make_persisted_record(scenario_id=1)), + PersistedTrajectory.from_raw(make_persisted_record(run_id="r2", scenario_id=99)), + ] + pairs = list(join_records(scenarios, trajs)) + assert len(pairs) == 1 + assert pairs[0][0].id == "1" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py new file mode 100644 index 000000000..80cdef621 --- /dev/null +++ b/src/evaluation/tests/test_metrics.py @@ -0,0 +1,101 @@ +"""Tests for ops metrics extraction and aggregation.""" + +from 
__future__ import annotations + +from evaluation.metrics import ( + _normalize_model, + aggregate_ops, + metrics_from_trajectory, +) +from evaluation.models import ( + GradeResult, + OpsMetrics, + PersistedTrajectory, + ScenarioResult, +) + + +def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResult: + return ScenarioResult( + scenario_id="1", + scenario_type="iot", + runner="plan-execute", + model="watsonx/ibm/granite", + question="q", + answer="a", + grade=GradeResult(grading_method="exact_string_match", passed=passed), + ops=ops or OpsMetrics(), + ) + + +class TestMetricsFromTrajectory: + def test_sdk_trajectory_sums_per_turn(self, make_persisted_record): + rec = PersistedTrajectory.from_raw(make_persisted_record()) + m = metrics_from_trajectory(rec) + assert m.turn_count == 2 + assert m.tokens_in == 22 + assert m.tokens_out == 12 + assert m.tool_call_count == 1 + assert m.unique_tools == ["sites"] + assert m.duration_ms == 300.0 + + def test_handles_none_trajectory(self, make_persisted_record): + rec = PersistedTrajectory.from_raw(make_persisted_record(trajectory=None)) + assert metrics_from_trajectory(rec) == OpsMetrics() + + def test_plan_execute_list_trajectory(self, make_persisted_record): + rec = PersistedTrajectory.from_raw( + make_persisted_record( + trajectory=[ + {"step_number": 1, "task": "t", "server": "iot", "tool": "sites", "response": "ok"}, + {"step_number": 2, "task": "t2", "server": "iot", "tool": "assets", "response": "ok"}, + {"step_number": 3, "task": "t3", "server": "iot", "tool": "sites", "response": "ok"}, + ] + ) + ) + m = metrics_from_trajectory(rec) + assert m.turn_count == 3 + assert m.tool_call_count == 3 + assert m.unique_tools == ["assets", "sites"] + + +class TestAggregateOps: + def test_empty(self): + agg = aggregate_ops([]) + assert agg.tokens_in_total == 0 + assert agg.duration_ms_p50 is None + + def test_sums_and_percentiles(self): + results = [ + _result(ops=OpsMetrics(tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1)), + _result(ops=OpsMetrics(tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2)), + _result(ops=OpsMetrics(tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3)), + ] + agg = aggregate_ops(results) + assert agg.tokens_in_total == 60 + assert agg.tokens_out_total == 30 + assert agg.tool_calls_total == 6 + assert agg.duration_ms_p50 is not None + assert agg.duration_ms_p95 is not None + assert agg.duration_ms_p50 <= agg.duration_ms_p95 + + def test_cost_only_when_some_present(self): + results = [ + _result(ops=OpsMetrics(est_cost_usd=0.01)), + _result(ops=OpsMetrics(est_cost_usd=0.02)), + ] + agg = aggregate_ops(results) + assert agg.est_cost_usd_total == 0.03 + + +class TestNormalizeModel: + def test_strips_provider_prefix(self): + assert _normalize_model("litellm_proxy/anthropic/claude-opus-4-5") == "claude-opus-4-5" + assert _normalize_model("watsonx/ibm/granite-13b") == "granite-13b" + + def test_strips_long_numeric_suffix(self): + assert _normalize_model("claude-opus-4-5-20250101") == "claude-opus-4-5" + + def test_keeps_short_numeric_suffix(self): + # "4-5" suffix is the model version, not a date — leave it intact. 
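+        # _normalize_model only strips all-digit tails of length >= 6,
+        # i.e. date stamps, so "5" survives.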
+ assert _normalize_model("claude-opus-4-5") == "claude-opus-4-5" diff --git a/src/evaluation/tests/test_models.py b/src/evaluation/tests/test_models.py new file mode 100644 index 000000000..4aca4d551 --- /dev/null +++ b/src/evaluation/tests/test_models.py @@ -0,0 +1,45 @@ +"""Tests for evaluation Pydantic models.""" + +from evaluation.models import PersistedTrajectory, Scenario + + +def test_scenario_from_raw_coerces_int_id_to_str(): + s = Scenario.from_raw({"id": 301, "text": "Q"}) + assert s.id == "301" + assert isinstance(s.id, str) + + +def test_scenario_preserves_extra_fields(): + s = Scenario.from_raw({"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01}) + extra = s.model_extra or {} + assert extra.get("tolerance") == 0.01 + + +def test_persisted_trajectory_coerces_scenario_id(): + t = PersistedTrajectory.from_raw( + { + "run_id": "r", + "scenario_id": 42, + "runner": "plan-execute", + "model": "m", + "question": "q", + "answer": "a", + "trajectory": None, + } + ) + assert t.scenario_id == "42" + + +def test_persisted_trajectory_allows_none_scenario_id(): + t = PersistedTrajectory.from_raw( + { + "run_id": "r", + "scenario_id": None, + "runner": "plan-execute", + "model": "m", + "question": "q", + "answer": "a", + "trajectory": None, + } + ) + assert t.scenario_id is None diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py new file mode 100644 index 000000000..14816832a --- /dev/null +++ b/src/evaluation/tests/test_report.py @@ -0,0 +1,74 @@ +"""Tests for EvalReport assembly and serialization.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.models import ( + GradeResult, + OpsMetrics, + ScenarioResult, +) +from evaluation.report import build_report, render_summary, write_report + + +def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: + return ScenarioResult( + scenario_id="x", + scenario_type=stype, + runner="plan-execute", + model="watsonx/ibm/granite", + question="q", + answer="a", + grade=GradeResult(grading_method="llm_judge", passed=passed, score=1.0 if passed else 0.0), + ops=OpsMetrics(**ops_kwargs), + ) + + +def test_build_report_totals_and_breakdown(): + results = [ + _result("iot", True, tokens_in=10, tokens_out=5), + _result("iot", False, tokens_in=8, tokens_out=4), + _result("tsfm", True, tokens_in=20, tokens_out=10), + ] + report = build_report(results) + + assert report.totals == { + "scenarios": 3, + "graded": 3, + "passed": 2, + "pass_rate": round(2 / 3, 4), + } + assert report.by_scenario_type["iot"].total == 2 + assert report.by_scenario_type["iot"].passed == 1 + assert report.by_scenario_type["tsfm"].pass_rate == 1.0 + assert report.ops.tokens_in_total == 38 + + +def test_build_report_handles_empty(): + report = build_report([]) + assert report.totals["scenarios"] == 0 + assert report.totals["pass_rate"] == 0.0 + assert report.by_scenario_type == {} + + +def test_write_report_round_trips(tmp_path: Path): + results = [_result("iot", True)] + report = build_report(results) + out = write_report(report, tmp_path / "nested" / "report.json") + assert out.exists() + data = json.loads(out.read_text(encoding="utf-8")) + assert data["totals"]["passed"] == 1 + assert data["by_scenario_type"]["iot"]["pass_rate"] == 1.0 + + +def test_render_summary_includes_headlines(): + results = [ + _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1), + _result("iot", False, tokens_in=8, tokens_out=4, duration_ms=200.0), + ] + text 
= render_summary(build_report(results)) + assert "Pass rate" in text + assert "iot" in text + assert "tokens_in_total" in text diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py new file mode 100644 index 000000000..ffab1688f --- /dev/null +++ b/src/evaluation/tests/test_runner.py @@ -0,0 +1,76 @@ +"""Smoke test for the end-to-end evaluation runner.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.models import GradeResult, Scenario +from evaluation.runner import evaluate +from evaluation import graders as registry + + +def _always_pass_grader(scenario: Scenario, answer: str, trajectory_text: str) -> GradeResult: + return GradeResult(grading_method="stub", passed=True, score=1.0) + + +def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): + # Two trajectories, both joinable to scenarios. + rec_a = make_persisted_record(run_id="run-a", scenario_id=1, answer="A") + rec_b = make_persisted_record(run_id="run-b", scenario_id=2, answer="B") + (tmp_path / "run-a.json").write_text(json.dumps(rec_a), encoding="utf-8") + (tmp_path / "run-b.json").write_text(json.dumps(rec_b), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + {"id": 1, "text": "Q1", "type": "iot"}, + {"id": 2, "text": "Q2", "type": "tsfm"}, + ] + ), + encoding="utf-8", + ) + + registry.register("stub", _always_pass_grader) + + report = evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + default_grading_method="stub", + ) + + assert report.totals["scenarios"] == 2 + assert report.totals["passed"] == 2 + assert set(report.by_scenario_type.keys()) == {"iot", "tsfm"} + assert report.ops.tokens_in_total > 0 + + +def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): + rec = make_persisted_record(run_id="run-x", scenario_id=1) + (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + { + "id": 1, + "text": "Q", + "type": "iot", + "expected_answer": "A.", + "grading_method": "exact_string_match", + } + ] + ), + encoding="utf-8", + ) + + report = evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + default_grading_method="numeric_match", # would fail; per-scenario override wins + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.grading_method == "exact_string_match" From 079bf6ad11fa50f6d60d79b79f4c7d8ddb4818c4 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 11:59:58 -0400 Subject: [PATCH 2/8] refactor(evaluation): adopt MLflow Evaluator/Scorer concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer feedback (PR #280): align with MLflow's evaluator/scorer split. - Rename src/evaluation/graders/ -> src/evaluation/scorers/ and organise by family: code_based (exact/numeric), llm_judge (LLM-As-Judge), semantic (new). - Rename GradeResult -> ScorerResult with field `scorer` (Scenario input field `grading_method` unchanged — input contract preserved). - Add `Evaluator` class as the orchestration entry point; functional `evaluate()` now delegates to it. - Add Semantic-Score scorer using difflib.SequenceMatcher (stdlib only, no extra deps); threshold overridable via scenario.similarity_threshold. - CLI: add --scorer-default (keeps --grader-default as alias). 
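
A minimal sketch of the Semantic-Score idea (the registered name, the
0.8 default threshold, and the exact field handling here are
assumptions; the shipped scorers/semantic.py is authoritative):

    import difflib

    from evaluation.models import Scenario, ScorerResult

    def semantic(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
        extra = scenario.model_extra or {}
        # Per-scenario override; the 0.8 default is assumed for this sketch.
        threshold = float(extra.get("similarity_threshold", 0.8))
        expected = (scenario.expected_answer or "").strip().lower()
        ratio = difflib.SequenceMatcher(None, answer.strip().lower(), expected).ratio()
        return ScorerResult(
            scorer="semantic",
            passed=ratio >= threshold,
            score=round(ratio, 3),
            rationale=f"SequenceMatcher ratio {ratio:.3f} vs threshold {threshold}",
        )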
Tests: 7 new (4 semantic + 2 evaluator + 1 registry); 46 total in src/evaluation/, full suite 316 passed. Signed-off-by: Shuxin Lin --- src/evaluation/__init__.py | 14 ++- src/evaluation/cli.py | 31 +++---- src/evaluation/evaluator.py | 87 +++++++++++++++++++ src/evaluation/graders/__init__.py | 36 -------- src/evaluation/models.py | 13 ++- src/evaluation/runner.py | 59 ++----------- src/evaluation/scorers/__init__.py | 44 ++++++++++ .../code_based.py} | 28 +++--- .../{graders => scorers}/llm_judge.py | 32 +++---- src/evaluation/scorers/semantic.py | 51 +++++++++++ src/evaluation/tests/test_evaluator.py | 67 ++++++++++++++ src/evaluation/tests/test_metrics.py | 4 +- src/evaluation/tests/test_report.py | 4 +- src/evaluation/tests/test_runner.py | 12 +-- .../{test_graders.py => test_scorers.py} | 78 ++++++++++++----- 15 files changed, 393 insertions(+), 167 deletions(-) create mode 100644 src/evaluation/evaluator.py delete mode 100644 src/evaluation/graders/__init__.py create mode 100644 src/evaluation/scorers/__init__.py rename src/evaluation/{graders/deterministic.py => scorers/code_based.py} (76%) rename src/evaluation/{graders => scorers}/llm_judge.py (85%) create mode 100644 src/evaluation/scorers/semantic.py create mode 100644 src/evaluation/tests/test_evaluator.py rename src/evaluation/tests/{test_graders.py => test_scorers.py} (57%) diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index ca632ab1b..280da44d5 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -9,26 +9,36 @@ ``run`` (executes the agent — already exists) → ``evaluate`` (this module) → ``report.json``. Re-grading from saved trajectories is first-class. + +The evaluation concept follows MLflow's vocabulary: an +:class:`Evaluator` orchestrates one or more :data:`Scorer` callables +(:class:`ScorerResult` records the outcome). Scorers fall into three +families — Code-Based, LLM-As-Judge, and Semantic-Score — registered +under :mod:`evaluation.scorers`. """ +from .evaluator import Evaluator from .models import ( AggregateOps, EvalReport, - GradeResult, OpsMetrics, PersistedTrajectory, Scenario, ScenarioResult, + ScorerResult, TypeBreakdown, ) +from .scorers import Scorer __all__ = [ "AggregateOps", "EvalReport", - "GradeResult", + "Evaluator", "OpsMetrics", "PersistedTrajectory", "Scenario", "ScenarioResult", + "Scorer", + "ScorerResult", "TypeBreakdown", ] diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index 806452899..cc4ee87ed 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -1,4 +1,4 @@ -"""``uv run evaluate`` — offline grading + report generation.""" +"""``uv run evaluate`` — offline scoring + report generation.""" from __future__ import annotations @@ -7,16 +7,16 @@ import sys from pathlib import Path -from . import graders as grader_registry +from . import scorers as scorer_registry +from .evaluator import Evaluator from .report import render_summary, write_report -from .runner import evaluate def _build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="evaluate", description=( - "Grade saved agent trajectories against scenario files and " + "Score saved agent trajectories against scenario files and " "emit a JSON report." ), ) @@ -40,15 +40,17 @@ def _build_parser() -> argparse.ArgumentParser: help="Path to write the JSON report.", ) p.add_argument( + "--scorer-default", "--grader-default", + dest="scorer_default", default="llm_judge", - help="Grader name when scenario.grading_method is unset. 
" - "Default: llm_judge.", + help="Scorer name when scenario.grading_method is unset. " + "Default: llm_judge. (--grader-default is a legacy alias.)", ) p.add_argument( "--judge-model", default=None, - help="Model id for the LLM judge (e.g. " + help="Model id for the LLM-As-Judge scorer (e.g. " "litellm_proxy/anthropic/claude-opus-4-5). " "Required when any scenario routes to llm_judge.", ) @@ -64,18 +66,18 @@ def _build_parser() -> argparse.ArgumentParser: def _maybe_install_judge(judge_model: str | None) -> None: if not judge_model: return - # Imported lazily so the CLI works for deterministic-only runs even - # if the LiteLLM dep happens to be flaky in the dev environment. + # Imported lazily so the CLI works for code-based-only runs even if + # the LiteLLM dep happens to be flaky in the dev environment. from llm import LiteLLMBackend # type: ignore[import-not-found] - from .graders.llm_judge import install + from .scorers.llm_judge import install install(LiteLLMBackend(model=judge_model)) -def _validate_grader_default(name: str) -> None: +def _validate_scorer_default(name: str) -> None: try: - grader_registry.get(name) + scorer_registry.get(name) except KeyError as exc: raise SystemExit(str(exc)) @@ -88,12 +90,11 @@ def main(argv: list[str] | None = None) -> int: ) _maybe_install_judge(args.judge_model) - _validate_grader_default(args.grader_default) + _validate_scorer_default(args.scorer_default) - report = evaluate( + report = Evaluator(default_scorer=args.scorer_default).evaluate( trajectories_path=args.trajectories, scenarios_paths=list(args.scenarios), - default_grading_method=args.grader_default, ) out = write_report(report, args.output) diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py new file mode 100644 index 000000000..f7d151040 --- /dev/null +++ b/src/evaluation/evaluator.py @@ -0,0 +1,87 @@ +"""Evaluator — orchestrates a set of scorers over a batch of records. + +Mirrors MLflow's evaluator/scorer split: the :class:`Evaluator` owns +the loading + per-record dispatch, while each :data:`Scorer` is a small +callable that produces a single :class:`ScorerResult`. +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from . import scorers as scorer_registry +from .loader import join_records, load_scenarios, load_trajectories +from .metrics import metrics_from_trajectory +from .models import ( + EvalReport, + PersistedTrajectory, + Scenario, + ScenarioResult, + ScorerResult, +) +from .report import build_report +from .scorers import Scorer + +_log = logging.getLogger(__name__) + + +class Evaluator: + """Run a batch of scenarios against their saved trajectories. + + ``default_scorer`` names the registered scorer to use when a + scenario does not set ``grading_method``. Per-scenario overrides + take precedence. 
+ """ + + def __init__(self, default_scorer: str = "llm_judge") -> None: + self.default_scorer = default_scorer + + def evaluate( + self, + *, + trajectories_path: Path, + scenarios_paths: list[Path], + ) -> EvalReport: + scenarios = load_scenarios(scenarios_paths) + trajectories = load_trajectories(trajectories_path) + + results: list[ScenarioResult] = [] + for scenario, traj in join_records(scenarios, trajectories): + results.append(self._score_one(scenario, traj)) + + return build_report(results) + + def _score_one( + self, scenario: Scenario, traj: PersistedTrajectory + ) -> ScenarioResult: + name = scenario.grading_method or self.default_scorer + scorer = self._resolve(name) + trajectory_text = _trajectory_to_text(traj) + grade = scorer(scenario, traj.answer, trajectory_text) + + return ScenarioResult( + scenario_id=scenario.id, + scenario_type=scenario.type, + runner=traj.runner, + model=traj.model, + question=traj.question, + answer=traj.answer, + grade=grade, + ops=metrics_from_trajectory(traj), + ) + + @staticmethod + def _resolve(name: str) -> Scorer: + return scorer_registry.get(name) + + +def _trajectory_to_text(traj: PersistedTrajectory) -> str: + """Flatten a trajectory to a text blob for the LLM-As-Judge prompt.""" + if traj.trajectory is None: + return "" + try: + return json.dumps(traj.trajectory, indent=2, default=str) + except (TypeError, ValueError): + return str(traj.trajectory) diff --git a/src/evaluation/graders/__init__.py b/src/evaluation/graders/__init__.py deleted file mode 100644 index f58a074e0..000000000 --- a/src/evaluation/graders/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Pluggable grader registry. - -Each grader is a callable taking ``(scenario, answer, trajectory_text)`` -and returning a :class:`~evaluation.models.GradeResult`. Registration -happens via :func:`register`; the CLI looks up graders by name from -``scenario.grading_method`` (falling back to a CLI-supplied default). -""" - -from __future__ import annotations - -from typing import Callable - -from ..models import GradeResult, Scenario - -Grader = Callable[[Scenario, str, str], GradeResult] - -_REGISTRY: dict[str, Grader] = {} - - -def register(name: str, grader: Grader) -> None: - _REGISTRY[name] = grader - - -def get(name: str) -> Grader: - if name not in _REGISTRY: - raise KeyError( - f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}" - ) - return _REGISTRY[name] - - -def names() -> list[str]: - return sorted(_REGISTRY) - - -from . import deterministic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/models.py b/src/evaluation/models.py index 846e98911..25fd0b2aa 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -66,8 +66,15 @@ class OpsMetrics(BaseModel): est_cost_usd: float | None = None -class GradeResult(BaseModel): - grading_method: str +class ScorerResult(BaseModel): + """Output of a single :class:`Scorer` invocation. + + ``scorer`` is the registered name of the scorer that produced this + result — distinct from ``Scenario.grading_method``, which is the + *requested* scorer on the input side. 
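+
+    For example, a scenario with ``grading_method="numeric_match"``
+    yields a result with ``scorer="numeric_match"``.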
+ """ + + scorer: str passed: bool score: float = 0.0 rationale: str = "" @@ -81,7 +88,7 @@ class ScenarioResult(BaseModel): model: str question: str answer: str - grade: GradeResult + grade: ScorerResult ops: OpsMetrics diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py index f87da2fa6..23df16f77 100644 --- a/src/evaluation/runner.py +++ b/src/evaluation/runner.py @@ -1,18 +1,11 @@ -"""Glue: load → grade → assemble report.""" +"""Backwards-friendly functional entry point delegating to :class:`Evaluator`.""" from __future__ import annotations -import json -import logging from pathlib import Path -from . import graders as grader_registry -from .loader import join_records, load_scenarios, load_trajectories -from .metrics import metrics_from_trajectory -from .models import EvalReport, PersistedTrajectory, Scenario, ScenarioResult -from .report import build_report - -_log = logging.getLogger(__name__) +from .evaluator import Evaluator +from .models import EvalReport def evaluate( @@ -21,48 +14,12 @@ def evaluate( scenarios_paths: list[Path], default_grading_method: str = "llm_judge", ) -> EvalReport: - """Load, grade, and aggregate. + """Load, score, and aggregate. - Per-scenario grader is picked from ``scenario.grading_method`` when + Per-scenario scorer is picked from ``scenario.grading_method`` when set, falling back to ``default_grading_method``. """ - scenarios = load_scenarios(scenarios_paths) - trajectories = load_trajectories(trajectories_path) - - results: list[ScenarioResult] = [] - for scenario, traj in join_records(scenarios, trajectories): - results.append(_grade_one(scenario, traj, default_grading_method)) - - return build_report(results) - - -def _grade_one( - scenario: Scenario, - traj: PersistedTrajectory, - default_grading_method: str, -) -> ScenarioResult: - method = scenario.grading_method or default_grading_method - grader = grader_registry.get(method) - trajectory_text = _trajectory_to_text(traj) - grade = grader(scenario, traj.answer, trajectory_text) - - return ScenarioResult( - scenario_id=scenario.id, - scenario_type=scenario.type, - runner=traj.runner, - model=traj.model, - question=traj.question, - answer=traj.answer, - grade=grade, - ops=metrics_from_trajectory(traj), + return Evaluator(default_scorer=default_grading_method).evaluate( + trajectories_path=trajectories_path, + scenarios_paths=scenarios_paths, ) - - -def _trajectory_to_text(traj: PersistedTrajectory) -> str: - """Flatten a trajectory to a text blob for the LLM judge prompt.""" - if traj.trajectory is None: - return "" - try: - return json.dumps(traj.trajectory, indent=2, default=str) - except (TypeError, ValueError): - return str(traj.trajectory) diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py new file mode 100644 index 000000000..00ff0c1e2 --- /dev/null +++ b/src/evaluation/scorers/__init__.py @@ -0,0 +1,44 @@ +"""Pluggable scorer registry. + +Each scorer is a callable taking ``(scenario, answer, trajectory_text)`` +and returning a :class:`~evaluation.models.ScorerResult`. The vocabulary +follows MLflow's evaluation concept: an ``Evaluator`` orchestrates one +or more ``Scorer`` s; scorers fall into three families: + +* **Code-Based** — deterministic, no model required (string/numeric + matchers in :mod:`evaluation.scorers.code_based`). +* **LLM-As-Judge** — model-graded against a rubric + (:mod:`evaluation.scorers.llm_judge`). +* **Semantic-Score** — similarity-based, no model call + (:mod:`evaluation.scorers.semantic`). 
+""" + +from __future__ import annotations + +from typing import Callable + +from ..models import Scenario, ScorerResult + +Scorer = Callable[[Scenario, str, str], ScorerResult] + +_REGISTRY: dict[str, Scorer] = {} + + +def register(name: str, scorer: Scorer) -> None: + _REGISTRY[name] = scorer + + +def get(name: str) -> Scorer: + if name not in _REGISTRY: + raise KeyError( + f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}" + ) + return _REGISTRY[name] + + +def names() -> list[str]: + return sorted(_REGISTRY) + + +from . import code_based # noqa: E402,F401 — register-on-import +from . import semantic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/graders/deterministic.py b/src/evaluation/scorers/code_based.py similarity index 76% rename from src/evaluation/graders/deterministic.py rename to src/evaluation/scorers/code_based.py index 35db1c299..929ef8bdf 100644 --- a/src/evaluation/graders/deterministic.py +++ b/src/evaluation/scorers/code_based.py @@ -1,20 +1,20 @@ -"""Pure deterministic graders — no LLM, no network.""" +"""Code-Based scorers — deterministic, no LLM, no network.""" from __future__ import annotations import math -from ..models import GradeResult, Scenario +from ..models import Scenario, ScorerResult from . import register def exact_string_match( scenario: Scenario, answer: str, trajectory_text: str -) -> GradeResult: +) -> ScorerResult: expected = scenario.expected_answer if expected is None: - return GradeResult( - grading_method="exact_string_match", + return ScorerResult( + scorer="exact_string_match", passed=False, score=0.0, rationale="scenario has no expected_answer", @@ -23,8 +23,8 @@ def exact_string_match( a = str(answer).strip().lower() e = str(expected).strip().lower() passed = a == e - return GradeResult( - grading_method="exact_string_match", + return ScorerResult( + scorer="exact_string_match", passed=passed, score=1.0 if passed else 0.0, rationale="" if passed else f"expected {expected!r}, got {answer!r}", @@ -34,14 +34,14 @@ def exact_string_match( def numeric_match( scenario: Scenario, answer: str, trajectory_text: str -) -> GradeResult: +) -> ScorerResult: expected_raw = scenario.expected_answer extra = scenario.model_extra or {} tolerance = float(extra.get("tolerance", 1e-6)) if expected_raw is None: - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=False, rationale="scenario has no expected_answer", ) @@ -50,16 +50,16 @@ def numeric_match( a = float(answer) e = float(expected_raw) except (TypeError, ValueError) as err: - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=False, rationale=f"could not parse numbers: {err}", details={"expected": expected_raw, "actual": answer}, ) passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=passed, score=1.0 if passed else 0.0, rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", diff --git a/src/evaluation/graders/llm_judge.py b/src/evaluation/scorers/llm_judge.py similarity index 85% rename from src/evaluation/graders/llm_judge.py rename to src/evaluation/scorers/llm_judge.py index fb55bf73f..00518061a 100644 --- a/src/evaluation/graders/llm_judge.py +++ b/src/evaluation/scorers/llm_judge.py @@ -1,4 +1,4 @@ -"""LLM-judge grader. +"""LLM-As-Judge scorer. 
Free-form answers are scored against ``scenario.characteristic_form`` using a six-criterion rubric (task completion, data retrieval accuracy, @@ -16,7 +16,7 @@ from llm import LLMBackend -from ..models import GradeResult, Scenario +from ..models import Scenario, ScorerResult from . import register _log = logging.getLogger(__name__) @@ -61,8 +61,8 @@ The agent passes overall iff the first five are true AND hallucinations is false.""" -class LLMJudgeGrader: - """Closure-style grader that holds an :class:`LLMBackend`.""" +class LLMJudgeScorer: + """Closure-style scorer that holds an :class:`LLMBackend`.""" def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None: self._llm = llm @@ -70,11 +70,11 @@ def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None: def __call__( self, scenario: Scenario, answer: str, trajectory_text: str - ) -> GradeResult: + ) -> ScorerResult: characteristic = scenario.characteristic_form or scenario.expected_answer or "" if not characteristic: - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale="scenario has neither characteristic_form nor expected_answer", ) @@ -88,18 +88,18 @@ def __call__( try: raw = self._llm.generate(prompt) - except Exception as exc: # judge call failure is a grading failure, not a crash + except Exception as exc: # judge call failure is a scoring failure, not a crash _log.exception("llm_judge: backend error") - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale=f"judge backend error: {exc}", ) review = _parse_review(raw) if review is None: - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale="judge returned unparseable JSON", details={"raw": raw[:2000]}, @@ -117,8 +117,8 @@ def __call__( if review.get("hallucinations") is True: score = max(0.0, score - 0.2) - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=passed, score=round(score, 3), rationale=str(review.get("reason", ""))[:500], @@ -140,5 +140,5 @@ def _parse_review(raw: str) -> dict | None: def install(llm: LLMBackend, name: str = "llm_judge") -> None: - """Register an LLM-judge grader bound to ``llm`` under ``name``.""" - register(name, LLMJudgeGrader(llm, name=name)) + """Register an LLM-As-Judge scorer bound to ``llm`` under ``name``.""" + register(name, LLMJudgeScorer(llm, name=name)) diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py new file mode 100644 index 000000000..56506eb10 --- /dev/null +++ b/src/evaluation/scorers/semantic.py @@ -0,0 +1,51 @@ +"""Semantic-Score scorer — similarity without an LLM call. + +Uses ``difflib.SequenceMatcher`` over normalised text so the scorer has +no external dependencies and is stable in CI. A scenario can override +the pass threshold via ``scenario.similarity_threshold`` (default 0.6). +""" + +from __future__ import annotations + +import re +from difflib import SequenceMatcher + +from ..models import Scenario, ScorerResult +from . 
import register + +_DEFAULT_THRESHOLD = 0.6 +_WS_RE = re.compile(r"\s+") + + +def _normalize(text: str) -> str: + return _WS_RE.sub(" ", str(text).strip().lower()) + + +def semantic_similarity( + scenario: Scenario, answer: str, trajectory_text: str +) -> ScorerResult: + reference = scenario.characteristic_form or scenario.expected_answer + if not reference: + return ScorerResult( + scorer="semantic_similarity", + passed=False, + rationale="scenario has neither characteristic_form nor expected_answer", + ) + + extra = scenario.model_extra or {} + threshold = float(extra.get("similarity_threshold", _DEFAULT_THRESHOLD)) + + score = SequenceMatcher(None, _normalize(reference), _normalize(answer)).ratio() + passed = score >= threshold + return ScorerResult( + scorer="semantic_similarity", + passed=passed, + score=round(score, 4), + rationale=( + "" if passed else f"similarity {score:.3f} below threshold {threshold}" + ), + details={"threshold": threshold, "reference": reference}, + ) + + +register("semantic_similarity", semantic_similarity) diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py new file mode 100644 index 000000000..efbb424b0 --- /dev/null +++ b/src/evaluation/tests/test_evaluator.py @@ -0,0 +1,67 @@ +"""Tests for the Evaluator class — the orchestration layer.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation import scorers as registry +from evaluation.evaluator import Evaluator +from evaluation.models import Scenario, ScorerResult + + +def _stub_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub-evaluator", passed=True, score=1.0) + + +def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_record): + rec = make_persisted_record(run_id="run-1", scenario_id=1) + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("stub-evaluator", _stub_scorer) + + report = Evaluator(default_scorer="stub-evaluator").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.scorer == "stub-evaluator" + + +def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): + # Default scorer would crash on a missing registration; the + # scenario-level grading_method must win and route to a code-based + # scorer instead. 
+ rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="3.14") + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + { + "id": 1, + "text": "Q", + "type": "tsfm", + "expected_answer": "3.14", + "grading_method": "numeric_match", + } + ] + ), + encoding="utf-8", + ) + + report = Evaluator(default_scorer="llm_judge").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.scorer == "numeric_match" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index 80cdef621..804624ffb 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -8,10 +8,10 @@ metrics_from_trajectory, ) from evaluation.models import ( - GradeResult, OpsMetrics, PersistedTrajectory, ScenarioResult, + ScorerResult, ) @@ -23,7 +23,7 @@ def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResul model="watsonx/ibm/granite", question="q", answer="a", - grade=GradeResult(grading_method="exact_string_match", passed=passed), + grade=ScorerResult(scorer="exact_string_match", passed=passed), ops=ops or OpsMetrics(), ) diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index 14816832a..7e2db642f 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -6,9 +6,9 @@ from pathlib import Path from evaluation.models import ( - GradeResult, OpsMetrics, ScenarioResult, + ScorerResult, ) from evaluation.report import build_report, render_summary, write_report @@ -21,7 +21,7 @@ def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: model="watsonx/ibm/granite", question="q", answer="a", - grade=GradeResult(grading_method="llm_judge", passed=passed, score=1.0 if passed else 0.0), + grade=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), ops=OpsMetrics(**ops_kwargs), ) diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index ffab1688f..1c22cc618 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -5,13 +5,13 @@ import json from pathlib import Path -from evaluation.models import GradeResult, Scenario +from evaluation.models import Scenario, ScorerResult from evaluation.runner import evaluate -from evaluation import graders as registry +from evaluation import scorers as registry -def _always_pass_grader(scenario: Scenario, answer: str, trajectory_text: str) -> GradeResult: - return GradeResult(grading_method="stub", passed=True, score=1.0) +def _always_pass_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub", passed=True, score=1.0) def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): @@ -32,7 +32,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): encoding="utf-8", ) - registry.register("stub", _always_pass_grader) + registry.register("stub", _always_pass_scorer) report = evaluate( trajectories_path=tmp_path, @@ -73,4 +73,4 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste ) assert report.totals["passed"] == 1 - assert report.results[0].grade.grading_method == "exact_string_match" + assert report.results[0].grade.scorer == "exact_string_match" diff --git a/src/evaluation/tests/test_graders.py 
b/src/evaluation/tests/test_scorers.py similarity index 57% rename from src/evaluation/tests/test_graders.py rename to src/evaluation/tests/test_scorers.py index 9002e5f95..2e72e61c4 100644 --- a/src/evaluation/tests/test_graders.py +++ b/src/evaluation/tests/test_scorers.py @@ -1,10 +1,11 @@ -"""Tests for deterministic + LLM-judge graders.""" +"""Tests for the three scorer families: code-based, LLM-as-judge, semantic.""" from __future__ import annotations -from evaluation import graders as registry -from evaluation.graders.deterministic import exact_string_match, numeric_match -from evaluation.graders.llm_judge import LLMJudgeGrader, install +from evaluation import scorers as registry +from evaluation.scorers.code_based import exact_string_match, numeric_match +from evaluation.scorers.llm_judge import LLMJudgeScorer, install +from evaluation.scorers.semantic import semantic_similarity from llm import LLMBackend @@ -53,10 +54,47 @@ def test_custom_tolerance(self, make_scenario): assert r.passed +class TestSemanticSimilarity: + def test_close_text_passes_default_threshold(self, make_scenario): + s = make_scenario( + characteristic_form="Lists temperature, pressure, and vibration sensors." + ) + r = semantic_similarity( + s, "lists temperature pressure and vibration sensors", "" + ) + assert r.passed + assert r.score >= 0.6 + + def test_unrelated_text_fails(self, make_scenario): + s = make_scenario(characteristic_form="lists three iot sensors") + r = semantic_similarity(s, "the chiller is operating normally", "") + assert not r.passed + assert "below threshold" in r.rationale + + def test_custom_threshold_override(self, make_scenario): + s = make_scenario( + characteristic_form="lists three iot sensors", + similarity_threshold=0.05, + ) + r = semantic_similarity(s, "completely different answer text", "") + # Threshold lowered enough that even weak overlap passes. + assert r.passed + + def test_missing_reference_short_circuits(self, make_scenario): + s = make_scenario(characteristic_form=None, expected_answer=None) + r = semantic_similarity(s, "anything", "") + assert not r.passed + assert "characteristic_form" in r.rationale + + class TestRegistry: - def test_deterministic_graders_registered(self): - assert "exact_string_match" in registry.names() - assert "numeric_match" in registry.names() + def test_code_based_scorers_registered(self): + names = registry.names() + assert "exact_string_match" in names + assert "numeric_match" in names + + def test_semantic_scorer_registered(self): + assert "semantic_similarity" in registry.names() def test_get_unknown_raises(self): try: @@ -67,7 +105,7 @@ def test_get_unknown_raises(self): raise AssertionError("expected KeyError") -class TestLLMJudgeGrader: +class TestLLMJudgeScorer: def _all_pass_response(self) -> str: return ( '{"task_completion": true, "data_retrieval_accuracy": true, ' @@ -77,8 +115,8 @@ def _all_pass_response(self) -> str: ) def test_passes_when_all_criteria_true(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) - r = grader(make_scenario(), "answer", "trajectory") + scorer = LLMJudgeScorer(_StubLLM(self._all_pass_response())) + r = scorer(make_scenario(), "answer", "trajectory") assert r.passed assert r.score == 1.0 assert r.rationale == "Looks good." 
@@ -87,34 +125,34 @@ def test_fails_on_hallucination(self, make_scenario): resp = self._all_pass_response().replace( '"hallucinations": false', '"hallucinations": true' ) - grader = LLMJudgeGrader(_StubLLM(resp)) - r = grader(make_scenario(), "answer", "trajectory") + scorer = LLMJudgeScorer(_StubLLM(resp)) + r = scorer(make_scenario(), "answer", "trajectory") assert not r.passed # Score is penalized but not zeroed when 5/5 criteria pass. assert r.score < 1.0 def test_handles_unparseable_response(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM("not json at all")) - r = grader(make_scenario(), "a", "t") + scorer = LLMJudgeScorer(_StubLLM("not json at all")) + r = scorer(make_scenario(), "a", "t") assert not r.passed assert "unparseable" in r.rationale def test_handles_markdown_fenced_response(self, make_scenario): wrapped = "Here you go:\n```json\n" + self._all_pass_response() + "\n```" - grader = LLMJudgeGrader(_StubLLM(wrapped)) - r = grader(make_scenario(), "a", "t") + scorer = LLMJudgeScorer(_StubLLM(wrapped)) + r = scorer(make_scenario(), "a", "t") assert r.passed def test_missing_characteristic_short_circuits(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + scorer = LLMJudgeScorer(_StubLLM(self._all_pass_response())) s = make_scenario(characteristic_form=None, expected_answer=None) - r = grader(s, "a", "t") + r = scorer(s, "a", "t") assert not r.passed assert "characteristic_form" in r.rationale def test_install_registers_under_default_name(self, make_scenario): install(_StubLLM(self._all_pass_response())) assert "llm_judge" in registry.names() - grader = registry.get("llm_judge") - r = grader(make_scenario(), "a", "t") + scorer = registry.get("llm_judge") + r = scorer(make_scenario(), "a", "t") assert r.passed From 5272a4fd0d59b5cebc2355c4aa25ef4634f7aa70 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:02:37 -0400 Subject: [PATCH 3/8] refactor(evaluation): drop code-based scorer bodies, keep skeleton Code-Based scorers (exact_string_match, numeric_match) are now skeleton stubs raising NotImplementedError; the family slot in the Evaluator/Scorer taxonomy is preserved but implementations are deferred. Registry no longer auto-registers them on import. Tests for behavior removed; new TestCodeBasedSkeletons asserts NotImplementedError. test_runner and test_evaluator override tests re-pointed at semantic_similarity. 41 evaluation tests pass. Signed-off-by: Shuxin Lin --- src/evaluation/scorers/code_based.py | 64 +++----------------------- src/evaluation/tests/test_evaluator.py | 13 +++--- src/evaluation/tests/test_runner.py | 13 ++++-- src/evaluation/tests/test_scorers.py | 57 +++++++---------------- 4 files changed, 38 insertions(+), 109 deletions(-) diff --git a/src/evaluation/scorers/code_based.py b/src/evaluation/scorers/code_based.py index 929ef8bdf..c0d167116 100644 --- a/src/evaluation/scorers/code_based.py +++ b/src/evaluation/scorers/code_based.py @@ -1,71 +1,21 @@ -"""Code-Based scorers — deterministic, no LLM, no network.""" +"""Code-Based scorers — deterministic, no LLM, no network. -from __future__ import annotations +Skeleton only — fill in the implementations and re-register with the +scorer registry before use. +""" -import math +from __future__ import annotations from ..models import Scenario, ScorerResult -from . 
import register def exact_string_match( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - expected = scenario.expected_answer - if expected is None: - return ScorerResult( - scorer="exact_string_match", - passed=False, - score=0.0, - rationale="scenario has no expected_answer", - ) - - a = str(answer).strip().lower() - e = str(expected).strip().lower() - passed = a == e - return ScorerResult( - scorer="exact_string_match", - passed=passed, - score=1.0 if passed else 0.0, - rationale="" if passed else f"expected {expected!r}, got {answer!r}", - details={"expected": expected, "actual": answer}, - ) + raise NotImplementedError def numeric_match( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - expected_raw = scenario.expected_answer - extra = scenario.model_extra or {} - tolerance = float(extra.get("tolerance", 1e-6)) - - if expected_raw is None: - return ScorerResult( - scorer="numeric_match", - passed=False, - rationale="scenario has no expected_answer", - ) - - try: - a = float(answer) - e = float(expected_raw) - except (TypeError, ValueError) as err: - return ScorerResult( - scorer="numeric_match", - passed=False, - rationale=f"could not parse numbers: {err}", - details={"expected": expected_raw, "actual": answer}, - ) - - passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) - return ScorerResult( - scorer="numeric_match", - passed=passed, - score=1.0 if passed else 0.0, - rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", - details={"expected": e, "actual": a, "tolerance": tolerance}, - ) - - -register("exact_string_match", exact_string_match) -register("numeric_match", numeric_match) + raise NotImplementedError diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index efbb424b0..ed0058e66 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -36,10 +36,9 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # Default scorer would crash on a missing registration; the - # scenario-level grading_method must win and route to a code-based - # scorer instead. - rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="3.14") + # The scenario-level grading_method must route around the default + # scorer, even when the default is a placeholder that would fail. 
+ rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" @@ -50,8 +49,8 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "expected_answer": "3.14", - "grading_method": "numeric_match", + "characteristic_form": "answer text", + "grading_method": "semantic_similarity", } ] ), @@ -64,4 +63,4 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "numeric_match" + assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index 1c22cc618..421f642e4 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -47,7 +47,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): - rec = make_persisted_record(run_id="run-x", scenario_id=1) + rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" @@ -58,19 +58,22 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "expected_answer": "A.", - "grading_method": "exact_string_match", + "characteristic_form": "A.", + "similarity_threshold": 0.5, + "grading_method": "semantic_similarity", } ] ), encoding="utf-8", ) + registry.register("stub", _always_pass_scorer) + report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="numeric_match", # would fail; per-scenario override wins + default_grading_method="stub", # per-scenario override wins ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "exact_string_match" + assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_scorers.py b/src/evaluation/tests/test_scorers.py index 2e72e61c4..b76a924bc 100644 --- a/src/evaluation/tests/test_scorers.py +++ b/src/evaluation/tests/test_scorers.py @@ -1,4 +1,7 @@ -"""Tests for the three scorer families: code-based, LLM-as-judge, semantic.""" +"""Tests for the three scorer families: code-based, LLM-as-judge, semantic. + +Code-Based scorers are skeletons only and have no behaviour tests yet. 
+""" from __future__ import annotations @@ -17,41 +20,20 @@ def generate(self, prompt: str, temperature: float = 0.0) -> str: return self._response -class TestExactStringMatch: - def test_match_case_insensitive(self, make_scenario): - s = make_scenario(expected_answer="Hello World") - r = exact_string_match(s, "hello world", "") - assert r.passed and r.score == 1.0 - - def test_mismatch(self, make_scenario): - s = make_scenario(expected_answer="foo") - r = exact_string_match(s, "bar", "") - assert not r.passed - assert r.details["expected"] == "foo" - - def test_missing_expected(self, make_scenario): - s = make_scenario(expected_answer=None) - r = exact_string_match(s, "anything", "") - assert not r.passed - assert "expected_answer" in r.rationale - - -class TestNumericMatch: - def test_within_tolerance(self, make_scenario): - s = make_scenario(expected_answer="3.14159") - r = numeric_match(s, "3.141591", "") - assert r.passed - - def test_unparseable(self, make_scenario): - s = make_scenario(expected_answer="3.14") - r = numeric_match(s, "not a number", "") - assert not r.passed - assert "could not parse" in r.rationale +class TestCodeBasedSkeletons: + def test_exact_string_match_not_implemented(self, make_scenario): + try: + exact_string_match(make_scenario(expected_answer="x"), "x", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") - def test_custom_tolerance(self, make_scenario): - s = make_scenario(expected_answer="100", tolerance=0.05) - r = numeric_match(s, "104", "") - assert r.passed + def test_numeric_match_not_implemented(self, make_scenario): + try: + numeric_match(make_scenario(expected_answer="1.0"), "1.0", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") class TestSemanticSimilarity: @@ -88,11 +70,6 @@ def test_missing_reference_short_circuits(self, make_scenario): class TestRegistry: - def test_code_based_scorers_registered(self): - names = registry.names() - assert "exact_string_match" in names - assert "numeric_match" in names - def test_semantic_scorer_registered(self): assert "semantic_similarity" in registry.names() From 1d59c4b69ca8322bafec67a0e5f916c19a395315 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:21:16 -0400 Subject: [PATCH 4/8] fix(evaluation): align llm_judge prompt with reference; fix LiteLLM kwarg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - LLM-As-Judge prompt now mirrors src/tmp/evaluation_agent/result_evaluation_prompt.py: full 6-criterion rubric text, split Agent's Thinking vs Final Response, output schema uses `suggestions` (back-compat: `reason` still accepted), parser strips "(END OF RESPONSE)" sentinel. - CLI: LiteLLMBackend takes `model_id=`, not `model=`. Fixes: TypeError: LiteLLMBackend.__init__() got an unexpected keyword argument 'model' Verified end-to-end: claude-agent on groundtruth/101 ("List all failure modes of asset Chiller.") → uv run evaluate with --scorer-default llm_judge → 6/6 criteria pass. 
Signed-off-by: Shuxin Lin --- src/evaluation/cli.py | 2 +- src/evaluation/scorers/llm_judge.py | 86 +++++++++++++++++++---------- 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index cc4ee87ed..c9eb8793b 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -72,7 +72,7 @@ def _maybe_install_judge(judge_model: str | None) -> None: from .scorers.llm_judge import install - install(LiteLLMBackend(model=judge_model)) + install(LiteLLMBackend(model_id=judge_model)) def _validate_scorer_default(name: str) -> None: diff --git a/src/evaluation/scorers/llm_judge.py b/src/evaluation/scorers/llm_judge.py index 00518061a..e37ecc219 100644 --- a/src/evaluation/scorers/llm_judge.py +++ b/src/evaluation/scorers/llm_judge.py @@ -30,35 +30,58 @@ "hallucinations", ) -_PROMPT_TEMPLATE = """You are an evaluation judge for an industrial-asset-operations agent. - -Score the agent response against the expected characteristic answer using the six criteria below. Respond ONLY with a JSON object, no prose. - -QUESTION: -{question} - -EXPECTED CHARACTERISTIC: -{characteristic} - -AGENT RESPONSE: -{answer} - -AGENT TRAJECTORY (turns / tool calls / outputs): -{trajectory} - -Return JSON with these boolean fields plus a one-sentence reason: - +_PROMPT_TEMPLATE = """You are a critical reviewer tasked with evaluating the effectiveness and accuracy of an AI agent's response to a given task. Your goal is to determine whether the agent has successfully accomplished the task correctly based on the expected or characteristic behavior. + +Evaluation Criteria: +1. **Task Completion:** + - Verify if the agent executed all necessary actions (e.g., using the correct tools, retrieving data, performing the required analysis). + - The agent's response should align with the predefined expected behavior for task completion. + +2. **Data Retrieval & Accuracy:** + - Ensure that the correct asset, location, time period, and sensor (if applicable) were used. + - Verify if the task performed was related to the correct asset and sensor, and ensure the result corresponds to the correct time period. + - Check if the agent retrieved the required data and if the forecasting, anomaly detection, or other results are correct. + +3. **Generalized Result Verification:** + - **Task Type Verification:** Based on the task type (forecasting, anomaly detection, classification, etc.), verify if the agent has returned the expected results. + - For **forecasting** tasks: Ensure that the agent generated a forecast for the specified future period. + - For **anomaly detection** tasks: Verify that anomalies are detected as expected (if anomalies were anticipated). + - For other tasks (e.g., classification), ensure the task result matches the expected format and value. + - **Comparison with Expected Output:** Check if the result matches the expected format, values, or outcomes as outlined in the characteristic answer. + - **Data Integrity:** Ensure that the correct data (e.g., sensor, time period) was used in the task, and that it is consistent with the expected format and structure. + +4. **Agent Sequence & Order:** + - Ensure the agents (or tools) were called in the correct order and that all actions align with the expected behavior for agent interactions. + - If the characteristic answer specifies certain agents (e.g., IoTAgent, TSFMAgent, FMSRAgent), verify that these were used and in the correct sequence. + +5. 
**Clarity and Justification:**
+   - Ensure the agent's response is clear and justified with adequate explanations or evidence to support the claims made.
+   - There should be no contradictions between the agent's reasoning and the expected behavior outlined in the characteristic answer.
+
+6. **Hallucination Check:**
+   - Identify if the agent claims success without performing the necessary actions or without generating meaningful results.
+   - If the agent provides a fabricated response or claims success where actions are missing, mark this as a hallucination.
+
+Question: {question}
+Characteristic Answer (Expected Behavior): {characteristic}
+Agent's Thinking (turns / tool calls / outputs): {trajectory}
+Agent's Final Response: {answer}
+
+Output Format:
+Your review must always be in JSON format. Do not include any additional formatting or Markdown in your response.
 {{
-  "task_completion": <true|false>,
-  "data_retrieval_accuracy": <true|false>,
-  "generalized_result_verification": <true|false>,
-  "agent_sequence_correct": <true|false>,
-  "clarity_and_justification": <true|false>,
-  "hallucinations": <true|false>,
-  "reason": "<one-sentence reason>"
+  "task_completion": true/false,
+  "data_retrieval_accuracy": true/false,
+  "generalized_result_verification": true/false,
+  "agent_sequence_correct": true/false,
+  "clarity_and_justification": true/false,
+  "hallucinations": true/false,
+  "suggestions": "Optional. Actions or improvements for rectifying the response if applicable."
 }}
+(END OF RESPONSE)

-The agent passes overall iff the first five are true AND hallucinations is false."""
+Please provide your review based on the given criteria.
+"""


 class LLMJudgeScorer:
@@ -117,11 +140,14 @@ def __call__(
         if review.get("hallucinations") is True:
             score = max(0.0, score - 0.2)

+        rationale = str(
+            review.get("suggestions") or review.get("reason") or ""
+        )[:500]
         return ScorerResult(
             scorer=self.name,
             passed=passed,
             score=round(score, 3),
-            rationale=str(review.get("reason", ""))[:500],
+            rationale=rationale,
             details=review,
         )

@@ -129,8 +155,10 @@ def _parse_review(raw: str) -> dict | None:
     if not raw:
         return None
-    # Tolerate leading prose / markdown fences by extracting the first {...} block.
-    match = re.search(r"\{.*\}", raw, re.DOTALL)
+    # Strip the reference prompt's "(END OF RESPONSE)" sentinel + any
+    # leading prose / markdown fence before extracting the first {...}.
+    text = raw.split("(END OF RESPONSE)")[0]
+    match = re.search(r"\{.*\}", text, re.DOTALL)
     if not match:
         return None
     try:

From 5d0622f9c97bb962b9f4615d59a753d232b02f63 Mon Sep 17 00:00:00 2001
From: Shuxin Lin
Date: Wed, 13 May 2026 12:33:21 -0400
Subject: [PATCH 5/8] feat(evaluation): write per-run reports to
 reports/<run_id>.json

- CLI: --output FILE replaced with --reports-dir DIR (default reports/).
  Writes one JSON per result (named by trajectory run_id, which is a UUID)
  plus _aggregate.json for the rollup.
- ScenarioResult now carries run_id (propagated from PersistedTrajectory).
- New report.write_reports_dir(); falls back to scenario-<scenario_id>.json
  for legacy trajectories with no run_id.
- 2 new tests; 43 evaluation tests pass.

Verified: uv run evaluate against groundtruth/101.json wrote
reports/112c1b56-...json + reports/_aggregate.json.
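The file-naming rule, sketched standalone (report_name is a hypothetical
helper; the real logic lives inline in report.write_reports_dir below):

```python
def report_name(run_id: str, scenario_id: str, used: dict[str, int]) -> str:
    # run_id stem, scenario-<id> fallback, deterministic -1/-2/... suffixes
    # on collisions — mirrors the loop in write_reports_dir.
    stem = run_id or f"scenario-{scenario_id}"
    seen = used.get(stem, 0)
    used[stem] = seen + 1
    return f"{stem}.json" if seen == 0 else f"{stem}-{seen}.json"


used: dict[str, int] = {}
assert report_name("run-a", "1", used) == "run-a.json"
assert report_name("run-a", "1", used) == "run-a-1.json"  # duplicate run_id
assert report_name("", "7", used) == "scenario-7.json"    # legacy, no run_id
```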
Signed-off-by: Shuxin Lin
---
 src/evaluation/cli.py               | 17 ++++++++-----
 src/evaluation/evaluator.py         |  1 +
 src/evaluation/models.py            |  1 +
 src/evaluation/report.py            | 28 ++++++++++++++++++++++
 src/evaluation/tests/test_report.py | 37 +++++++++++++++++++++++++++--
 5 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py
index c9eb8793b..4ba5535a6 100644
--- a/src/evaluation/cli.py
+++ b/src/evaluation/cli.py
@@ -9,7 +9,7 @@
 from . import scorers as scorer_registry
 from .evaluator import Evaluator
-from .report import render_summary, write_report
+from .report import render_summary, write_reports_dir


 def _build_parser() -> argparse.ArgumentParser:
@@ -34,10 +34,14 @@ def _build_parser() -> argparse.ArgumentParser:
         help="One or more scenario JSON / JSONL files.",
     )
     p.add_argument(
-        "--output",
+        "--reports-dir",
         type=Path,
-        required=True,
-        help="Path to write the JSON report.",
+        default=Path("reports"),
+        help=(
+            "Directory to write per-run JSON reports (one file per run, "
+            "named '<run_id>.json'), plus '_aggregate.json' for the rollup. "
+            "Default: reports/."
+        ),
     )
     p.add_argument(
         "--scorer-default",
@@ -97,9 +101,10 @@ def main(argv: list[str] | None = None) -> int:
         scenarios_paths=list(args.scenarios),
     )

-    out = write_report(report, args.output)
+    out_dir = write_reports_dir(report, args.reports_dir)
     print(render_summary(report))
-    print(f"\nReport written: {out}")
+    print(f"\nReports written: {out_dir}/<run_id>.json ({len(report.results)} files)")
+    print(f"Aggregate: {out_dir}/_aggregate.json")
     return 0

diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index f7d151040..b61673a68 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -64,6 +64,7 @@ def _score_one(
         return ScenarioResult(
             scenario_id=scenario.id,
             scenario_type=scenario.type,
+            run_id=traj.run_id,
             runner=traj.runner,
             model=traj.model,
             question=traj.question,
diff --git a/src/evaluation/models.py b/src/evaluation/models.py
index 25fd0b2aa..cdf1d202b 100644
--- a/src/evaluation/models.py
+++ b/src/evaluation/models.py
@@ -84,6 +84,7 @@ class ScorerResult(BaseModel):
 class ScenarioResult(BaseModel):
     scenario_id: str
     scenario_type: str = ""
+    run_id: str = ""
     runner: str
     model: str
     question: str
diff --git a/src/evaluation/report.py b/src/evaluation/report.py
index 72ff9b0e2..9f9ddc4a0 100644
--- a/src/evaluation/report.py
+++ b/src/evaluation/report.py
@@ -10,6 +10,8 @@
 from .metrics import aggregate_ops
 from .models import EvalReport, ScenarioResult, TypeBreakdown

+_AGGREGATE_FILENAME = "_aggregate.json"
+

 def build_report(results: list[ScenarioResult]) -> EvalReport:
     total = len(results)
@@ -52,6 +54,32 @@ def write_report(report: EvalReport, output: Path) -> Path:
     return output


+def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path:
+    """Write one JSON file per result (``<run_id>.json``) plus an aggregate.
+
+    Results without a ``run_id`` fall back to ``scenario-<scenario_id>.json``
+    so nothing is dropped. Returns the directory path.
+    """
+    reports_dir = Path(reports_dir)
+    reports_dir.mkdir(parents=True, exist_ok=True)
+
+    used: dict[str, int] = {}
+    for r in report.results:
+        stem = r.run_id or f"scenario-{r.scenario_id}"
+        # Disambiguate any collisions deterministically.
+ suffix = used.get(stem, 0) + used[stem] = suffix + 1 + name = stem if suffix == 0 else f"{stem}-{suffix}" + (reports_dir / f"{name}.json").write_text( + r.model_dump_json(indent=2), encoding="utf-8" + ) + + (reports_dir / _AGGREGATE_FILENAME).write_text( + report.model_dump_json(indent=2), encoding="utf-8" + ) + return reports_dir + + def render_summary(report: EvalReport) -> str: lines: list[str] = [] t = report.totals diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index 7e2db642f..d342bdd2d 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -10,13 +10,19 @@ ScenarioResult, ScorerResult, ) -from evaluation.report import build_report, render_summary, write_report +from evaluation.report import ( + build_report, + render_summary, + write_report, + write_reports_dir, +) -def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: +def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> ScenarioResult: return ScenarioResult( scenario_id="x", scenario_type=stype, + run_id=run_id, runner="plan-execute", model="watsonx/ibm/granite", question="q", @@ -63,6 +69,33 @@ def test_write_report_round_trips(tmp_path: Path): assert data["by_scenario_type"]["iot"]["pass_rate"] == 1.0 +def test_write_reports_dir_per_run_files(tmp_path: Path): + results = [ + _result("iot", True, run_id="run-a"), + _result("tsfm", False, run_id="run-b"), + ] + out_dir = write_reports_dir(build_report(results), tmp_path / "reports") + + assert (out_dir / "run-a.json").exists() + assert (out_dir / "run-b.json").exists() + assert (out_dir / "_aggregate.json").exists() + + per_run = json.loads((out_dir / "run-a.json").read_text()) + assert per_run["run_id"] == "run-a" + assert per_run["grade"]["passed"] is True + + agg = json.loads((out_dir / "_aggregate.json").read_text()) + assert agg["totals"]["scenarios"] == 2 + + +def test_write_reports_dir_falls_back_to_scenario_id(tmp_path: Path): + # ScenarioResult.run_id is empty when the trajectory pre-dates the + # run_id field; the writer must still produce a file. + results = [_result("iot", True)] + out_dir = write_reports_dir(build_report(results), tmp_path / "reports") + assert (out_dir / "scenario-x.json").exists() + + def test_render_summary_includes_headlines(): results = [ _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1), From cf941e0d819f0cbd25f3cc0b0621aab41e12a635 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:38:33 -0400 Subject: [PATCH 6/8] docs(evaluation): add docs/evaluation.md; semantic scorer is skeleton - Strip src/evaluation/scorers/semantic.py to a NotImplementedError skeleton; no longer auto-registered. Code-Based + Semantic-Score families now both ship as slot-only placeholders; LLM-As-Judge is the only working scorer in this branch. - Tests: TestSemanticSimilarity collapsed to a NotImplementedError assertion; runner/evaluator override tests pivot to local stub scorers (no skeleton dependency). - INSTRUCTIONS.md: new Evaluation section linking to the full doc. - docs/evaluation.md: scenario/trajectory schema, CLI reference, report layout, scorer families table, custom-scorer plug-in pattern, loop over groundtruth/*.json. 40 evaluation tests pass; full suite 310 passed. 
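For anyone re-enabling the Semantic-Score slot, the shortest fill-in is
essentially the difflib implementation this commit strips out — a sketch,
with a fixed 0.6 threshold instead of the old per-scenario override:

```python
from difflib import SequenceMatcher

from evaluation import scorers
from evaluation.models import Scenario, ScorerResult


def semantic_similarity(
    scenario: Scenario, answer: str, trajectory_text: str
) -> ScorerResult:
    # Compare normalised answer text against the scenario's reference.
    reference = scenario.characteristic_form or scenario.expected_answer or ""
    score = SequenceMatcher(
        None, str(reference).strip().lower(), str(answer).strip().lower()
    ).ratio()
    return ScorerResult(
        scorer="semantic_similarity", passed=score >= 0.6, score=round(score, 4)
    )


scorers.register("semantic_similarity", semantic_similarity)
```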
Signed-off-by: Shuxin Lin
---
 INSTRUCTIONS.md                        |  32 +++
 docs/evaluation.md                     | 264 +++++++++++++++++++++++++
 src/evaluation/scorers/__init__.py     |   8 +-
 src/evaluation/scorers/semantic.py     |  45 +----
 src/evaluation/tests/test_evaluator.py |  16 +-
 src/evaluation/tests/test_runner.py    |  16 +-
 src/evaluation/tests/test_scorers.py   |  46 ++---
 7 files changed, 341 insertions(+), 86 deletions(-)
 create mode 100644 docs/evaluation.md

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 8eede3877..36d8fb020 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -11,6 +11,7 @@ This directory contains the MCP servers and infrastructure for the AssetOpsBench
 - [Example queries](#example-queries)
 - [Agents](#agents)
 - [Observability](#observability)
+- [Evaluation](#evaluation)
 - [Running Tests](#running-tests)
 - [Architecture](#architecture)
@@ -220,6 +221,37 @@ See [docs/observability.md](docs/observability.md) for span attribute reference,

 ---

+## Evaluation
+
+Offline grading of saved trajectories against ground-truth scenarios. Three-stage flow:
+
+```
+agent run → trajectory (run_id) → uv run evaluate → reports/<run_id>.json
+```
+
+End-to-end against a ground-truth file:
+
+```bash
+# 1. Persist trajectories
+export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories
+uv run claude-agent "List all failure modes of asset Chiller." --scenario-id 101
+
+# 2. Score with LLM-As-Judge
+uv run evaluate \
+  --trajectories traces/trajectories \
+  --scenarios groundtruth/101.json \
+  --scorer-default llm_judge \
+  --judge-model litellm_proxy/aws/claude-opus-4-6
+```
+
+Output lands under `reports/` — one `<run_id>.json` per trajectory plus `_aggregate.json` for the rollup.
+
+Scorer families follow MLflow's evaluator/scorer split: `llm_judge` is wired up; `exact_string_match`, `numeric_match`, and `semantic_similarity` ship as skeletons (raise `NotImplementedError`).
+
+Full reference — scenario schema, report layout, custom scorers, looping over ground-truth: **[docs/evaluation.md](docs/evaluation.md)**.
+
+---
+
 ## Running Tests

 ```bash
diff --git a/docs/evaluation.md b/docs/evaluation.md
new file mode 100644
index 000000000..0d6f9a366
--- /dev/null
+++ b/docs/evaluation.md
@@ -0,0 +1,264 @@
+# Evaluation
+
+Offline grading of saved agent trajectories against ground-truth scenarios.
+
+The evaluation module follows the three-stage pattern used by SWE-bench,
+HELM, and τ-bench:
+
+```
+agent run → trajectory (run_id) → evaluate → reports/<run_id>.json
+```
+
+Re-grading from saved trajectories is first-class: re-run scoring with
+a different scorer or judge model without re-invoking the agent.
+
+## Concepts
+
+The vocabulary follows MLflow's evaluation split:
+
+- **Scenario** — a ground-truth record on disk. Carries `id`, `text`
+  (the utterance), `type`, `characteristic_form` (expected behaviour),
+  and optional `grading_method`.
+- **Trajectory** — a per-run JSON file persisted by the agent runners
+  when `AGENT_TRAJECTORY_DIR` is set. Carries `run_id`, `scenario_id`,
+  `question`, `answer`, and per-turn detail.
+- **Scorer** — a callable that takes
+  `(scenario, answer, trajectory_text)` and returns a `ScorerResult`.
+  Scorers fall into three families:
+  - **Code-Based** — deterministic, no LLM (e.g. `exact_string_match`,
+    `numeric_match`). *Skeleton only* in this branch.
+  - **LLM-As-Judge** — `llm_judge`. Six-criterion rubric, requires a
+    LiteLLM-routable model passed via `--judge-model`.
+  - **Semantic-Score** — similarity-based, no LLM call. *Skeleton only*
+    in this branch.
+- **Evaluator** — orchestrates a batch: loads scenarios + trajectories,
+  joins on `scenario_id`, dispatches to scorers, aggregates results.
+
+## Inputs
+
+### Scenario file
+
+JSON list, JSON object, or JSONL. Fields the scorer cares about:
+
+| Field                  | Used by              | Notes                                          |
+| ---------------------- | -------------------- | ---------------------------------------------- |
+| `id`                   | join                 | Coerced to string at load time                 |
+| `text`                 | all                  | The utterance the agent answered               |
+| `type`                 | reporting            | Scenario family (`iot`, `tsfm`, `FMSR`, …)     |
+| `characteristic_form`  | llm_judge, semantic  | Expected behaviour, free-form                  |
+| `expected_answer`      | code_based, semantic | Exact target string / number                   |
+| `grading_method`       | dispatch             | Registered scorer name; overrides CLI default  |
+| `tolerance`            | numeric_match        | Optional relative + absolute tolerance         |
+
+Ground-truth files under `groundtruth/` already match this schema —
+they're a drop-in scenarios input.
+
+### Trajectory file
+
+Written by the observability layer to `AGENT_TRAJECTORY_DIR` as one
+JSON per run. Fields the evaluator reads:
+
+```
+{
+  "run_id": "<uuid>",
+  "scenario_id": "<scenario id>",
+  "runner": "claude-agent" | "plan-execute" | …,
+  "model": "<model id>",
+  "question": "<utterance>",
+  "answer": "<final answer>",
+  "trajectory": {…}  // SDK Trajectory dict, or list[StepResult] for plan-execute
+}
+```
+
+`scenario_id` is critical — trajectories with `null` scenario_id are
+dropped at the join step. Pass `--scenario-id` to the agent CLI to set it.
+
+## End-to-end workflow
+
+```bash
+# 1. Persist trajectories under AGENT_TRAJECTORY_DIR
+export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories
+uv run claude-agent "List all failure modes of asset Chiller." --scenario-id 101
+
+# 2. Score with LLM-As-Judge against the ground-truth file
+uv run evaluate \
+  --trajectories traces/trajectories \
+  --scenarios groundtruth/101.json \
+  --scorer-default llm_judge \
+  --judge-model litellm_proxy/aws/claude-opus-4-6
+```
+
+Output:
+
+```
+Scenarios: 1  Passed: 1  Pass rate: 100.0%
+
+By scenario type:
+  FMSR  1/1  (100.0%)
+
+Operational metrics:
+  tokens_in_total:   7
+  tokens_out_total:  25
+  tool_calls_total:  1
+  duration_ms_p50:   14690.6
+
+Reports written: reports/<run_id>.json (1 files)
+Aggregate: reports/_aggregate.json
+```
+
+## Output layout
+
+```
+reports/
+├── <run_id>.json     # one ScenarioResult per trajectory
+├── <run_id>.json
+└── _aggregate.json   # EvalReport: totals, by_scenario_type, ops rollup
+```
+
+Per-run file (`reports/<run_id>.json`):
+
+```json
+{
+  "scenario_id": "101",
+  "scenario_type": "FMSR",
+  "run_id": "112c1b56-…",
+  "runner": "claude-agent",
+  "model": "litellm_proxy/aws/claude-opus-4-6",
+  "question": "List all failure modes of asset Chiller.",
+  "answer": "Here are the 7 failure modes for the Chiller asset: …",
+  "grade": {
+    "scorer": "llm_judge",
+    "passed": true,
+    "score": 1.0,
+    "rationale": "",
+    "details": {
+      "task_completion": true,
+      "data_retrieval_accuracy": true,
+      "generalized_result_verification": true,
+      "agent_sequence_correct": true,
+      "clarity_and_justification": true,
+      "hallucinations": false,
+      "suggestions": ""
+    }
+  },
+  "ops": {
+    "turn_count": 2,
+    "tool_call_count": 1,
+    "unique_tools": ["get_failure_modes"],
+    "tokens_in": 7,
+    "tokens_out": 25,
+    "duration_ms": 14690.6,
+    "est_cost_usd": 0.001959
+  }
+}
+```
+
+Aggregate (`reports/_aggregate.json`) is the full `EvalReport` (totals,
+runners, models, by-scenario-type breakdown, ops rollup, and the list
+of per-scenario results).
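+
+Every file is plain JSON, so post-processing needs no helper library. A
+minimal sketch (it assumes the aggregate serializes its per-scenario list
+under `results`, consistent with the examples above):
+
+```python
+import json
+from pathlib import Path
+
+agg = json.loads(Path("reports/_aggregate.json").read_text(encoding="utf-8"))
+print(agg["totals"])  # e.g. {"scenarios": 1, "passed": 1, ...}
+for r in agg.get("results", []):
+    # Per-scenario verdict plus one operational metric.
+    print(r["run_id"], r["grade"]["passed"], r["ops"]["duration_ms"])
+```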
+
+## CLI reference
+
+```
+uv run evaluate \
+  --trajectories DIR_OR_FILE    # required
+  --scenarios FILE [FILE ...]   # required, one or more
+  [--reports-dir DIR]           # default: reports/
+  [--scorer-default NAME]       # default: llm_judge
+  [--judge-model MODEL_ID]      # required when llm_judge runs
+  [-v]
+```
+
+`--grader-default` is accepted as a legacy alias for `--scorer-default`.
+
+## Available scorers in this branch
+
+| Family         | Registered name        | Status                                      |
+| -------------- | ---------------------- | ------------------------------------------- |
+| LLM-As-Judge   | `llm_judge`            | Works. Installed by passing `--judge-model` |
+| Code-Based     | `exact_string_match`   | **Skeleton — `NotImplementedError`**        |
+| Code-Based     | `numeric_match`        | **Skeleton — `NotImplementedError`**        |
+| Semantic-Score | `semantic_similarity`  | **Skeleton — `NotImplementedError`**        |
+
+Skeleton scorers don't auto-register; calling them raises
+`NotImplementedError`. Fill in the body and call
+`evaluation.scorers.register("<name>", <scorer>)` to enable.
+
+## LLM-As-Judge
+
+Six-criterion rubric, prompt mirrored from
+`src/tmp/evaluation_agent/result_evaluation_prompt.py`:
+
+- `task_completion`
+- `data_retrieval_accuracy`
+- `generalized_result_verification`
+- `agent_sequence_correct`
+- `clarity_and_justification`
+- `hallucinations`
+
+A run passes overall iff the first five are `true` **and**
+`hallucinations` is `false`. The score is the fraction of the first
+five satisfied, minus 0.2 if `hallucinations` is `true`. The judge's
+free-form `suggestions` (or legacy `reason`) lands in
+`grade.rationale`; the full review dict lands in `grade.details`.
+
+To customise: edit `_PROMPT_TEMPLATE` in
+`src/evaluation/scorers/llm_judge.py`.
+
+## Programmatic use
+
+```python
+from pathlib import Path
+from evaluation import Evaluator
+from evaluation.scorers.llm_judge import install
+from llm import LiteLLMBackend
+
+install(LiteLLMBackend(model_id="litellm_proxy/aws/claude-opus-4-6"))
+
+report = Evaluator(default_scorer="llm_judge").evaluate(
+    trajectories_path=Path("traces/trajectories"),
+    scenarios_paths=[Path("groundtruth/101.json")],
+)
+
+for r in report.results:
+    print(r.run_id, r.grade.passed, r.grade.score)
+```
+
+## Plug in a custom scorer
+
+```python
+from evaluation import scorers
+from evaluation.models import ScorerResult
+
+def keyword_hit(scenario, answer, trajectory_text) -> ScorerResult:
+    required = (scenario.model_extra or {}).get("required_keywords", [])
+    hits = [k for k in required if k.lower() in answer.lower()]
+    passed = len(hits) == len(required)
+    return ScorerResult(
+        scorer="keyword_hit",
+        passed=passed,
+        score=len(hits) / max(1, len(required)),
+        rationale="" if passed else f"missing: {set(required) - set(hits)}",
+    )
+
+scorers.register("keyword_hit", keyword_hit)
+# Any scenario with "grading_method": "keyword_hit" now routes here.
+``` + +## Loop over all ground-truth files + +```bash +export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories + +for f in groundtruth/*.json; do + utt=$(python3 -c "import json,sys;d=json.load(open(sys.argv[1]));print(d['text'])" "$f") + sid=$(python3 -c "import json,sys;d=json.load(open(sys.argv[1]));print(d['id'])" "$f") + uv run claude-agent "$utt" --scenario-id "$sid" +done + +uv run evaluate \ + --trajectories traces/trajectories \ + --scenarios groundtruth/*.json \ + --scorer-default llm_judge \ + --judge-model litellm_proxy/aws/claude-opus-4-6 +``` diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py index 00ff0c1e2..3bad68a8c 100644 --- a/src/evaluation/scorers/__init__.py +++ b/src/evaluation/scorers/__init__.py @@ -40,5 +40,9 @@ def names() -> list[str]: return sorted(_REGISTRY) -from . import code_based # noqa: E402,F401 — register-on-import -from . import semantic # noqa: E402,F401 — register-on-import +# Code-Based and Semantic-Score families ship as skeletons — their +# modules are importable but register no scorers until an +# implementation is filled in. LLM-As-Judge is registered explicitly +# via :func:`evaluation.scorers.llm_judge.install`. +from . import code_based # noqa: E402,F401 +from . import semantic # noqa: E402,F401 diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py index 56506eb10..c2a9bd61a 100644 --- a/src/evaluation/scorers/semantic.py +++ b/src/evaluation/scorers/semantic.py @@ -1,51 +1,16 @@ -"""Semantic-Score scorer — similarity without an LLM call. +"""Semantic-Score scorer — similarity-based grading without an LLM call. -Uses ``difflib.SequenceMatcher`` over normalised text so the scorer has -no external dependencies and is stable in CI. A scenario can override -the pass threshold via ``scenario.similarity_threshold`` (default 0.6). +Skeleton only — fill in the implementation (e.g. embedding cosine, BLEU, +sentence-transformers, or difflib ratio) and re-register with the +scorer registry before use. """ from __future__ import annotations -import re -from difflib import SequenceMatcher - from ..models import Scenario, ScorerResult -from . 
import register - -_DEFAULT_THRESHOLD = 0.6 -_WS_RE = re.compile(r"\s+") - - -def _normalize(text: str) -> str: - return _WS_RE.sub(" ", str(text).strip().lower()) def semantic_similarity( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - reference = scenario.characteristic_form or scenario.expected_answer - if not reference: - return ScorerResult( - scorer="semantic_similarity", - passed=False, - rationale="scenario has neither characteristic_form nor expected_answer", - ) - - extra = scenario.model_extra or {} - threshold = float(extra.get("similarity_threshold", _DEFAULT_THRESHOLD)) - - score = SequenceMatcher(None, _normalize(reference), _normalize(answer)).ratio() - passed = score >= threshold - return ScorerResult( - scorer="semantic_similarity", - passed=passed, - score=round(score, 4), - rationale=( - "" if passed else f"similarity {score:.3f} below threshold {threshold}" - ), - details={"threshold": threshold, "reference": reference}, - ) - - -register("semantic_similarity", semantic_similarity) + raise NotImplementedError diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index ed0058e66..6cc761e46 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -35,9 +35,13 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor assert report.results[0].grade.scorer == "stub-evaluator" +def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="fail-default", passed=False, score=0.0) + + def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): # The scenario-level grading_method must route around the default - # scorer, even when the default is a placeholder that would fail. + # scorer, even when the default scorer would reject the answer. 
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -49,18 +53,20 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "characteristic_form": "answer text", - "grading_method": "semantic_similarity", + "grading_method": "stub-evaluator", } ] ), encoding="utf-8", ) - report = Evaluator(default_scorer="llm_judge").evaluate( + registry.register("stub-evaluator", _stub_scorer) + registry.register("fail-default", _fail_scorer) + + report = Evaluator(default_scorer="fail-default").evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "semantic_similarity" + assert report.results[0].grade.scorer == "stub-evaluator" diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index 421f642e4..a9d558878 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -46,6 +46,10 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): assert report.ops.tokens_in_total > 0 +def _always_fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub-fail", passed=False, score=0.0) + + def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") @@ -58,22 +62,22 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "characteristic_form": "A.", - "similarity_threshold": 0.5, - "grading_method": "semantic_similarity", + "grading_method": "stub-pass", } ] ), encoding="utf-8", ) - registry.register("stub", _always_pass_scorer) + registry.register("stub-pass", _always_pass_scorer) + registry.register("stub-fail", _always_fail_scorer) report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="stub", # per-scenario override wins + default_grading_method="stub-fail", # per-scenario override wins ) + # Override wins: scenario routed to the always-pass stub even though + # the default scorer would have failed it. assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_scorers.py b/src/evaluation/tests/test_scorers.py index b76a924bc..8f2ac6b69 100644 --- a/src/evaluation/tests/test_scorers.py +++ b/src/evaluation/tests/test_scorers.py @@ -36,42 +36,22 @@ def test_numeric_match_not_implemented(self, make_scenario): raise AssertionError("expected NotImplementedError") -class TestSemanticSimilarity: - def test_close_text_passes_default_threshold(self, make_scenario): - s = make_scenario( - characteristic_form="Lists temperature, pressure, and vibration sensors." 
- ) - r = semantic_similarity( - s, "lists temperature pressure and vibration sensors", "" - ) - assert r.passed - assert r.score >= 0.6 - - def test_unrelated_text_fails(self, make_scenario): - s = make_scenario(characteristic_form="lists three iot sensors") - r = semantic_similarity(s, "the chiller is operating normally", "") - assert not r.passed - assert "below threshold" in r.rationale - - def test_custom_threshold_override(self, make_scenario): - s = make_scenario( - characteristic_form="lists three iot sensors", - similarity_threshold=0.05, - ) - r = semantic_similarity(s, "completely different answer text", "") - # Threshold lowered enough that even weak overlap passes. - assert r.passed - - def test_missing_reference_short_circuits(self, make_scenario): - s = make_scenario(characteristic_form=None, expected_answer=None) - r = semantic_similarity(s, "anything", "") - assert not r.passed - assert "characteristic_form" in r.rationale +class TestSemanticSkeleton: + def test_semantic_similarity_not_implemented(self, make_scenario): + try: + semantic_similarity(make_scenario(), "a", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") class TestRegistry: - def test_semantic_scorer_registered(self): - assert "semantic_similarity" in registry.names() + def test_skeleton_scorers_not_auto_registered(self): + # code_based and semantic ship as skeletons; only llm_judge is + # registered (lazily, via install()). + assert "exact_string_match" not in registry.names() + assert "numeric_match" not in registry.names() + assert "semantic_similarity" not in registry.names() def test_get_unknown_raises(self): try: From 0d523453e783a8fe6888c6b88055fe6372386b2c Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 15:54:23 -0400 Subject: [PATCH 7/8] refactor(evaluation): standardize grade/grader -> score/scorer Field and identifier renames so the module speaks a single vocabulary: - Scenario.grading_method -> Scenario.scoring_method - ScenarioResult.grade -> ScenarioResult.score (typed ScorerResult) - runner.default_grading_method -> runner.default_scoring_method - CLI: --grader-default legacy alias removed (only --scorer-default) - report.totals["graded"] -> report.totals["scored"] - Docstrings/comments/docs: "grading"/"graded"/"grader" -> "scoring"/"scored"/"scorer" Tests, INSTRUCTIONS.md, docs/evaluation.md updated. Note: the inner numeric ScorerResult.score is unchanged; access is result.score.score for the numeric, result.score.passed for the bool. 40 evaluation tests pass; full suite 310 passed; end-to-end against groundtruth/101.json still emits 6/6 rubric pass. 
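The renamed access path, as a minimal sketch (assumes only this branch's
`evaluation.models`; the constructor arguments mirror the ones the tests
already use):

```python
from evaluation.models import ScorerResult

# After the rename, ScenarioResult.score holds a ScorerResult, so the
# boolean verdict and the inner numeric score sit one level down.
s = ScorerResult(scorer="llm_judge", passed=True, score=1.0)

assert s.passed is True  # result.score.passed (the pass/fail verdict)
assert s.score == 1.0    # result.score.score (the inner numeric score)
```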
Signed-off-by: Shuxin Lin
---
 INSTRUCTIONS.md                        |  2 +-
 docs/evaluation.md                     | 19 +++++++++----------
 src/evaluation/__init__.py             |  4 ++--
 src/evaluation/cli.py                  |  5 ++---
 src/evaluation/evaluator.py            |  8 ++++----
 src/evaluation/models.py               |  6 +++---
 src/evaluation/report.py               |  8 ++++----
 src/evaluation/runner.py               |  8 ++++----
 src/evaluation/scorers/__init__.py     |  2 +-
 src/evaluation/scorers/semantic.py     |  2 +-
 src/evaluation/tests/test_evaluator.py |  8 ++++----
 src/evaluation/tests/test_metrics.py   |  2 +-
 src/evaluation/tests/test_report.py    |  6 +++---
 src/evaluation/tests/test_runner.py    |  8 ++++----
 14 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 36d8fb020..f4922c2ef 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -223,7 +223,7 @@ See [docs/observability.md](docs/observability.md) for span attribute reference,

## Evaluation

-Offline grading of saved trajectories against ground-truth scenarios. Three-stage flow:
+Offline scoring of saved trajectories against ground-truth scenarios. Three-stage flow:

```
agent run → trajectory (run_id) → uv run evaluate → reports/<run_id>.json

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 0d6f9a366..3b705b8a0 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -1,6 +1,6 @@
# Evaluation

-Offline grading of saved agent trajectories against ground-truth scenarios.
+Offline scoring of saved agent trajectories against ground-truth scenarios.

The evaluation module follows the three-stage pattern used by SWE-bench,
HELM, and τ-bench:

@@ -9,8 +9,8 @@ HELM, and τ-bench:

agent run → trajectory (run_id) → evaluate → reports/<run_id>.json
```

-Re-grading from saved trajectories is first-class: re-run scoring with
-a different scorer or judge model without re-invoking the agent.
+Re-scoring from saved trajectories is first-class: re-run with a
+different scorer or judge model without re-invoking the agent.

## Concepts

@@ -18,7 +18,7 @@ The vocabulary follows MLflow's evaluation split:

- **Scenario** — a ground-truth record on disk. Carries `id`, `text`
  (the utterance), `type`, `characteristic_form` (expected behaviour),
-  and optional `grading_method`.
+  and optional `scoring_method`.
- **Trajectory** — a per-run JSON file persisted by the agent runners
  when `AGENT_TRAJECTORY_DIR` is set. Carries `run_id`, `scenario_id`,
  `question`, `answer`, and per-turn detail.

@@ -47,7 +47,7 @@ JSON list, JSON object, or JSONL. Fields the scorer cares about:

| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
| `characteristic_form` | llm_judge, semantic | Expected behaviour, free-form |
| `expected_answer` | code_based, semantic | Exact target string / number |
-| `grading_method` | dispatch | Registered scorer name; overrides CLI default |
+| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
| `tolerance` | numeric_match | Optional relative + absolute tolerance |

Ground-truth files under `groundtruth/` already match this schema —
they're a drop-in scenarios input.

@@ -126,7 +126,7 @@ Per-run file (`reports/<run_id>.json`):

"model": "litellm_proxy/aws/claude-opus-4-6",
"question": "List all failure modes of asset Chiller.",
"answer": "Here are the 7 failure modes for the Chiller asset: …",
-  "grade": {
+  "score": {
"scorer": "llm_judge",
"passed": true,
"score": 1.0,

@@ -169,7 +169,6 @@ uv run evaluate \
[-v]
```

-`--grader-default` is accepted as a legacy alias for `--scorer-default`.

## Available scorers in this branch

@@ -200,7 +199,7 @@
A run passes overall iff the first five are `true` **and**
`hallucinations` is `false`.
The score is the fraction of the first five satisfied, minus 0.2 if `hallucinations` is `true`. The judge's free-form `suggestions` (or legacy `reason`) lands in -`grade.rationale`; the full review dict lands in `grade.details`. +`score.rationale`; the full review dict lands in `score.details`. To customise: edit `_PROMPT_TEMPLATE` in `src/evaluation/scorers/llm_judge.py`. @@ -221,7 +220,7 @@ report = Evaluator(default_scorer="llm_judge").evaluate( ) for r in report.results: - print(r.run_id, r.grade.passed, r.grade.score) + print(r.run_id, r.score.passed, r.score.score) ``` ## Plug in a custom scorer @@ -242,7 +241,7 @@ def keyword_hit(scenario, answer, trajectory_text) -> ScorerResult: ) scorers.register("keyword_hit", keyword_hit) -# Any scenario with "grading_method": "keyword_hit" now routes here. +# Any scenario with "scoring_method": "keyword_hit" now routes here. ``` ## Loop over all ground-truth files diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index 280da44d5..39cdf0df4 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -3,11 +3,11 @@ Consumes saved trajectory files (written by :func:`observability.persistence.persist_trajectory`) and scenario files (under ``src/scenarios/``) and emits a structured JSON report combining -graded outcomes with operational metrics. +scored outcomes with operational metrics. The shape mirrors conventions from SWE-bench, HELM, and τ-bench: ``run`` (executes the agent — already exists) → ``evaluate`` (this -module) → ``report.json``. Re-grading from saved trajectories is +module) → ``report.json``. Re-scoring from saved trajectories is first-class. The evaluation concept follows MLflow's vocabulary: an diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index 4ba5535a6..faf369652 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -45,11 +45,10 @@ def _build_parser() -> argparse.ArgumentParser: ) p.add_argument( "--scorer-default", - "--grader-default", dest="scorer_default", default="llm_judge", - help="Scorer name when scenario.grading_method is unset. " - "Default: llm_judge. (--grader-default is a legacy alias.)", + help="Scorer name when scenario.scoring_method is unset. " + "Default: llm_judge.", ) p.add_argument( "--judge-model", diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index b61673a68..dedc82885 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -31,7 +31,7 @@ class Evaluator: """Run a batch of scenarios against their saved trajectories. ``default_scorer`` names the registered scorer to use when a - scenario does not set ``grading_method``. Per-scenario overrides + scenario does not set ``scoring_method``. Per-scenario overrides take precedence. 
""" @@ -56,10 +56,10 @@ def evaluate( def _score_one( self, scenario: Scenario, traj: PersistedTrajectory ) -> ScenarioResult: - name = scenario.grading_method or self.default_scorer + name = scenario.scoring_method or self.default_scorer scorer = self._resolve(name) trajectory_text = _trajectory_to_text(traj) - grade = scorer(scenario, traj.answer, trajectory_text) + score = scorer(scenario, traj.answer, trajectory_text) return ScenarioResult( scenario_id=scenario.id, @@ -69,7 +69,7 @@ def _score_one( model=traj.model, question=traj.question, answer=traj.answer, - grade=grade, + score=score, ops=metrics_from_trajectory(traj), ) diff --git a/src/evaluation/models.py b/src/evaluation/models.py index cdf1d202b..2f57e0b38 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -23,7 +23,7 @@ class Scenario(BaseModel): category: str = "" characteristic_form: str | None = None expected_answer: str | None = None - grading_method: str | None = None + scoring_method: str | None = None @classmethod def from_raw(cls, raw: dict) -> "Scenario": @@ -70,7 +70,7 @@ class ScorerResult(BaseModel): """Output of a single :class:`Scorer` invocation. ``scorer`` is the registered name of the scorer that produced this - result — distinct from ``Scenario.grading_method``, which is the + result — distinct from ``Scenario.scoring_method``, which is the *requested* scorer on the input side. """ @@ -89,7 +89,7 @@ class ScenarioResult(BaseModel): model: str question: str answer: str - grade: ScorerResult + score: ScorerResult ops: OpsMetrics diff --git a/src/evaluation/report.py b/src/evaluation/report.py index 9f9ddc4a0..6ff5a9d28 100644 --- a/src/evaluation/report.py +++ b/src/evaluation/report.py @@ -1,4 +1,4 @@ -"""Build an :class:`EvalReport` from graded scenario results.""" +"""Build an :class:`EvalReport` from scored scenario results.""" from __future__ import annotations @@ -15,7 +15,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: total = len(results) - passed = sum(1 for r in results if r.grade.passed) + passed = sum(1 for r in results if r.score.passed) by_type: dict[str, list[ScenarioResult]] = defaultdict(list) for r in results: @@ -24,7 +24,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: breakdown: dict[str, TypeBreakdown] = {} for stype, items in by_type.items(): n = len(items) - p = sum(1 for r in items if r.grade.passed) + p = sum(1 for r in items if r.score.passed) breakdown[stype] = TypeBreakdown( total=n, passed=p, @@ -37,7 +37,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: models=sorted({r.model for r in results}), totals={ "scenarios": total, - "graded": total, + "scored": total, "passed": passed, "pass_rate": round(passed / total, 4) if total else 0.0, }, diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py index 23df16f77..507cdaa25 100644 --- a/src/evaluation/runner.py +++ b/src/evaluation/runner.py @@ -12,14 +12,14 @@ def evaluate( *, trajectories_path: Path, scenarios_paths: list[Path], - default_grading_method: str = "llm_judge", + default_scoring_method: str = "llm_judge", ) -> EvalReport: """Load, score, and aggregate. - Per-scenario scorer is picked from ``scenario.grading_method`` when - set, falling back to ``default_grading_method``. + Per-scenario scorer is picked from ``scenario.scoring_method`` when + set, falling back to ``default_scoring_method``. 
""" - return Evaluator(default_scorer=default_grading_method).evaluate( + return Evaluator(default_scorer=default_scoring_method).evaluate( trajectories_path=trajectories_path, scenarios_paths=scenarios_paths, ) diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py index 3bad68a8c..a2fa994e6 100644 --- a/src/evaluation/scorers/__init__.py +++ b/src/evaluation/scorers/__init__.py @@ -7,7 +7,7 @@ * **Code-Based** — deterministic, no model required (string/numeric matchers in :mod:`evaluation.scorers.code_based`). -* **LLM-As-Judge** — model-graded against a rubric +* **LLM-As-Judge** — model-scored against a rubric (:mod:`evaluation.scorers.llm_judge`). * **Semantic-Score** — similarity-based, no model call (:mod:`evaluation.scorers.semantic`). diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py index c2a9bd61a..d56b401cb 100644 --- a/src/evaluation/scorers/semantic.py +++ b/src/evaluation/scorers/semantic.py @@ -1,4 +1,4 @@ -"""Semantic-Score scorer — similarity-based grading without an LLM call. +"""Semantic-Score scorer — similarity-based scoring without an LLM call. Skeleton only — fill in the implementation (e.g. embedding cosine, BLEU, sentence-transformers, or difflib ratio) and re-register with the diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index 6cc761e46..6d91e633b 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -32,7 +32,7 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "stub-evaluator" + assert report.results[0].score.scorer == "stub-evaluator" def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: @@ -40,7 +40,7 @@ def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> Score def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # The scenario-level grading_method must route around the default + # The scenario-level scoring_method must route around the default # scorer, even when the default scorer would reject the answer. 
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -53,7 +53,7 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "grading_method": "stub-evaluator", + "scoring_method": "stub-evaluator", } ] ), @@ -69,4 +69,4 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "stub-evaluator" + assert report.results[0].score.scorer == "stub-evaluator" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index 804624ffb..21f097b1c 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -23,7 +23,7 @@ def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResul model="watsonx/ibm/granite", question="q", answer="a", - grade=ScorerResult(scorer="exact_string_match", passed=passed), + score=ScorerResult(scorer="exact_string_match", passed=passed), ops=ops or OpsMetrics(), ) diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index d342bdd2d..7c71788dc 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -27,7 +27,7 @@ def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> Scenari model="watsonx/ibm/granite", question="q", answer="a", - grade=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), + score=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), ops=OpsMetrics(**ops_kwargs), ) @@ -42,7 +42,7 @@ def test_build_report_totals_and_breakdown(): assert report.totals == { "scenarios": 3, - "graded": 3, + "scored": 3, "passed": 2, "pass_rate": round(2 / 3, 4), } @@ -82,7 +82,7 @@ def test_write_reports_dir_per_run_files(tmp_path: Path): per_run = json.loads((out_dir / "run-a.json").read_text()) assert per_run["run_id"] == "run-a" - assert per_run["grade"]["passed"] is True + assert per_run["score"]["passed"] is True agg = json.loads((out_dir / "_aggregate.json").read_text()) assert agg["totals"]["scenarios"] == 2 diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index a9d558878..f8a936db0 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -37,7 +37,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="stub", + default_scoring_method="stub", ) assert report.totals["scenarios"] == 2 @@ -50,7 +50,7 @@ def _always_fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) - return ScorerResult(scorer="stub-fail", passed=False, score=0.0) -def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): +def test_evaluate_uses_per_scenario_scoring_method(tmp_path: Path, make_persisted_record): rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") @@ -62,7 +62,7 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "grading_method": "stub-pass", + "scoring_method": "stub-pass", } ] ), @@ -75,7 +75,7 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste report = evaluate( 
trajectories_path=tmp_path,
scenarios_paths=[scenarios_path],
-        default_grading_method="stub-fail",  # per-scenario override wins
+        default_scoring_method="stub-fail",  # per-scenario override wins
)

# Override wins: scenario routed to the always-pass stub even though

From 3af08e6fb3c041375cf2d4eea5cbd783b097f304 Mon Sep 17 00:00:00 2001
From: Shuxin Lin
Date: Wed, 13 May 2026 15:59:33 -0400
Subject: [PATCH 8/8] docs(evaluation): clarify skeleton scorers in
 scenario-field table; add aggregate JSON shape

Signed-off-by: Shuxin Lin
---
 docs/evaluation.md | 50 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 3b705b8a0..12ce4e1e2 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -40,15 +40,17 @@ The vocabulary follows MLflow's evaluation split:

JSON list, JSON object, or JSONL. Fields the scorer cares about:

-| Field | Used by | Notes |
-| --------------------- | -------------------- | ------------------------------------------------ |
-| `id` | join | Coerced to string at load time |
-| `text` | all | The utterance the agent answered |
-| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
-| `characteristic_form` | llm_judge, semantic | Expected behaviour, free-form |
-| `expected_answer` | code_based, semantic | Exact target string / number |
-| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
-| `tolerance` | numeric_match | Optional relative + absolute tolerance |
+| Field | Used by | Notes |
+| --------------------- | ------------------------------------------------ | ---------------------------------------------- |
+| `id` | join | Coerced to string at load time |
+| `text` | all | The utterance the agent answered |
+| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
+| `characteristic_form` | `llm_judge`, `semantic_similarity`* | Expected behaviour, free-form |
+| `expected_answer` | `exact_string_match`*, `numeric_match`* | Exact target string / number |
+| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
+| `tolerance` | `numeric_match`* | Optional relative + absolute tolerance |
+
+\* Skeleton in this branch — see [Available scorers](#available-scorers-in-this-branch).

Ground-truth files under `groundtruth/` already match this schema —
they're a drop-in scenarios input.

@@ -153,9 +155,33 @@ Per-run file (`reports/<run_id>.json`):
}
```

-Aggregate (`reports/_aggregate.json`) is the full `EvalReport` (totals,
-runners, models, by-scenario-type breakdown, ops rollup, and the list
-of per-scenario results).
+Aggregate (`reports/_aggregate.json`) is the full `EvalReport`:
+
+```json
+{
+  "generated_at": "<timestamp>",
+  "runners": ["claude-agent"],
+  "models": ["litellm_proxy/aws/claude-opus-4-6"],
+  "totals": {
+    "scenarios": 1,
+    "scored": 1,
+    "passed": 1,
+    "pass_rate": 1.0
+  },
+  "by_scenario_type": {
+    "FMSR": {"total": 1, "passed": 1, "pass_rate": 1.0}
+  },
+  "ops": {
+    "tokens_in_total": 7,
+    "tokens_out_total": 25,
+    "tool_calls_total": 1,
+    "duration_ms_p50": 14690.6,
+    "duration_ms_p95": 14690.6,
+    "est_cost_usd_total": 0.001959
+  },
+  "results": [ /* one ScenarioResult per run, same shape as the per-run files */ ]
+}
+```

## CLI reference