From fcff318ba131385f30574b835ed6217f16545af3 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Mon, 27 Apr 2026 16:27:38 -0400 Subject: [PATCH 1/8] feat(evaluation): add offline evaluation module with uv run evaluate CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement src/evaluation/ — consumes saved agent trajectories ({run_id}.json under AGENT_TRAJECTORY_DIR) and scenario files, joins them on scenario_id, runs a registered grader per scenario, and emits a JSON report combining grading results with operational metrics (tokens, duration p50/p95, tool calls, optional cost estimate). The shape follows SWE-bench / HELM / τ-bench conventions: agent run → evaluate → report.json, with offline re-grading from saved trajectories as a first-class workflow. Includes: - Pydantic models (Scenario, PersistedTrajectory, GradeResult, OpsMetrics, EvalReport) - Loader for trajectory dirs and JSON/JSONL scenario files - Grader registry with two deterministic graders (exact_string_match, numeric_match) and a pluggable LLM judge bound to LLMBackend (six-criterion rubric) - Per-task ops metric extraction (handles both SDK Trajectory and plan-execute list[StepResult] shapes) plus aggregate rollups - Report writer with terminal summary and JSON output - evaluate script registered in [project.scripts] - 39 unit tests covering models, loader, graders, metrics, report, and end-to-end runner — all passing alongside existing 270 tests Closes #279 Signed-off-by: Shuxin Lin --- pyproject.toml | 3 +- src/evaluation/__init__.py | 34 ++++++ src/evaluation/cli.py | 106 +++++++++++++++++ src/evaluation/graders/__init__.py | 36 ++++++ src/evaluation/graders/deterministic.py | 71 ++++++++++++ src/evaluation/graders/llm_judge.py | 144 ++++++++++++++++++++++++ src/evaluation/loader.py | 93 +++++++++++++++ src/evaluation/metrics.py | 125 ++++++++++++++++++++ src/evaluation/models.py | 110 ++++++++++++++++++ src/evaluation/report.py | 87 ++++++++++++++ src/evaluation/runner.py | 68 +++++++++++ src/evaluation/tests/__init__.py | 0 src/evaluation/tests/conftest.py | 72 ++++++++++++ src/evaluation/tests/test_graders.py | 120 ++++++++++++++++++++ src/evaluation/tests/test_loader.py | 72 ++++++++++++ src/evaluation/tests/test_metrics.py | 101 +++++++++++++++++ src/evaluation/tests/test_models.py | 45 ++++++++ src/evaluation/tests/test_report.py | 74 ++++++++++++ src/evaluation/tests/test_runner.py | 76 +++++++++++++ 19 files changed, 1436 insertions(+), 1 deletion(-) create mode 100644 src/evaluation/cli.py create mode 100644 src/evaluation/graders/__init__.py create mode 100644 src/evaluation/graders/deterministic.py create mode 100644 src/evaluation/graders/llm_judge.py create mode 100644 src/evaluation/loader.py create mode 100644 src/evaluation/metrics.py create mode 100644 src/evaluation/models.py create mode 100644 src/evaluation/report.py create mode 100644 src/evaluation/runner.py create mode 100644 src/evaluation/tests/__init__.py create mode 100644 src/evaluation/tests/conftest.py create mode 100644 src/evaluation/tests/test_graders.py create mode 100644 src/evaluation/tests/test_loader.py create mode 100644 src/evaluation/tests/test_metrics.py create mode 100644 src/evaluation/tests/test_models.py create mode 100644 src/evaluation/tests/test_report.py create mode 100644 src/evaluation/tests/test_runner.py diff --git a/pyproject.toml b/pyproject.toml index 95d794c90..1d928f9e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["hatchling"] 
build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/agent", "src/llm", "src/observability"] +packages = ["src/agent", "src/evaluation", "src/llm", "src/observability"] [project] name = "assetopsbench-mcp" @@ -42,6 +42,7 @@ wo-mcp-server = "servers.wo.main:main" vibration-mcp-server = "servers.vibration.main:main" openai-agent = "agent.openai_agent.cli:main" deep-agent = "agent.deep_agent.cli:main" +evaluate = "evaluation.cli:main" [dependency-groups] diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index e69de29bb..ca632ab1b 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -0,0 +1,34 @@ +"""Offline evaluation harness for AssetOpsBench agent runs. + +Consumes saved trajectory files (written by +:func:`observability.persistence.persist_trajectory`) and scenario files +(under ``src/scenarios/``) and emits a structured JSON report combining +graded outcomes with operational metrics. + +The shape mirrors conventions from SWE-bench, HELM, and τ-bench: +``run`` (executes the agent — already exists) → ``evaluate`` (this +module) → ``report.json``. Re-grading from saved trajectories is +first-class. +""" + +from .models import ( + AggregateOps, + EvalReport, + GradeResult, + OpsMetrics, + PersistedTrajectory, + Scenario, + ScenarioResult, + TypeBreakdown, +) + +__all__ = [ + "AggregateOps", + "EvalReport", + "GradeResult", + "OpsMetrics", + "PersistedTrajectory", + "Scenario", + "ScenarioResult", + "TypeBreakdown", +] diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py new file mode 100644 index 000000000..806452899 --- /dev/null +++ b/src/evaluation/cli.py @@ -0,0 +1,106 @@ +"""``uv run evaluate`` — offline grading + report generation.""" + +from __future__ import annotations + +import argparse +import logging +import sys +from pathlib import Path + +from . import graders as grader_registry +from .report import render_summary, write_report +from .runner import evaluate + + +def _build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="evaluate", + description=( + "Grade saved agent trajectories against scenario files and " + "emit a JSON report." + ), + ) + p.add_argument( + "--trajectories", + type=Path, + required=True, + help="Directory of {run_id}.json trajectory files (or a single file).", + ) + p.add_argument( + "--scenarios", + type=Path, + nargs="+", + required=True, + help="One or more scenario JSON / JSONL files.", + ) + p.add_argument( + "--output", + type=Path, + required=True, + help="Path to write the JSON report.", + ) + p.add_argument( + "--grader-default", + default="llm_judge", + help="Grader name when scenario.grading_method is unset. " + "Default: llm_judge.", + ) + p.add_argument( + "--judge-model", + default=None, + help="Model id for the LLM judge (e.g. " + "litellm_proxy/anthropic/claude-opus-4-5). " + "Required when any scenario routes to llm_judge.", + ) + p.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable INFO-level logging.", + ) + return p + + +def _maybe_install_judge(judge_model: str | None) -> None: + if not judge_model: + return + # Imported lazily so the CLI works for deterministic-only runs even + # if the LiteLLM dep happens to be flaky in the dev environment. 
+ from llm import LiteLLMBackend # type: ignore[import-not-found] + + from .graders.llm_judge import install + + install(LiteLLMBackend(model=judge_model)) + + +def _validate_grader_default(name: str) -> None: + try: + grader_registry.get(name) + except KeyError as exc: + raise SystemExit(str(exc)) + + +def main(argv: list[str] | None = None) -> int: + args = _build_parser().parse_args(argv) + logging.basicConfig( + level=logging.INFO if args.verbose else logging.WARNING, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + + _maybe_install_judge(args.judge_model) + _validate_grader_default(args.grader_default) + + report = evaluate( + trajectories_path=args.trajectories, + scenarios_paths=list(args.scenarios), + default_grading_method=args.grader_default, + ) + + out = write_report(report, args.output) + print(render_summary(report)) + print(f"\nReport written: {out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/evaluation/graders/__init__.py b/src/evaluation/graders/__init__.py new file mode 100644 index 000000000..f58a074e0 --- /dev/null +++ b/src/evaluation/graders/__init__.py @@ -0,0 +1,36 @@ +"""Pluggable grader registry. + +Each grader is a callable taking ``(scenario, answer, trajectory_text)`` +and returning a :class:`~evaluation.models.GradeResult`. Registration +happens via :func:`register`; the CLI looks up graders by name from +``scenario.grading_method`` (falling back to a CLI-supplied default). +""" + +from __future__ import annotations + +from typing import Callable + +from ..models import GradeResult, Scenario + +Grader = Callable[[Scenario, str, str], GradeResult] + +_REGISTRY: dict[str, Grader] = {} + + +def register(name: str, grader: Grader) -> None: + _REGISTRY[name] = grader + + +def get(name: str) -> Grader: + if name not in _REGISTRY: + raise KeyError( + f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}" + ) + return _REGISTRY[name] + + +def names() -> list[str]: + return sorted(_REGISTRY) + + +from . import deterministic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/graders/deterministic.py b/src/evaluation/graders/deterministic.py new file mode 100644 index 000000000..35db1c299 --- /dev/null +++ b/src/evaluation/graders/deterministic.py @@ -0,0 +1,71 @@ +"""Pure deterministic graders — no LLM, no network.""" + +from __future__ import annotations + +import math + +from ..models import GradeResult, Scenario +from . 
import register + + +def exact_string_match( + scenario: Scenario, answer: str, trajectory_text: str +) -> GradeResult: + expected = scenario.expected_answer + if expected is None: + return GradeResult( + grading_method="exact_string_match", + passed=False, + score=0.0, + rationale="scenario has no expected_answer", + ) + + a = str(answer).strip().lower() + e = str(expected).strip().lower() + passed = a == e + return GradeResult( + grading_method="exact_string_match", + passed=passed, + score=1.0 if passed else 0.0, + rationale="" if passed else f"expected {expected!r}, got {answer!r}", + details={"expected": expected, "actual": answer}, + ) + + +def numeric_match( + scenario: Scenario, answer: str, trajectory_text: str +) -> GradeResult: + expected_raw = scenario.expected_answer + extra = scenario.model_extra or {} + tolerance = float(extra.get("tolerance", 1e-6)) + + if expected_raw is None: + return GradeResult( + grading_method="numeric_match", + passed=False, + rationale="scenario has no expected_answer", + ) + + try: + a = float(answer) + e = float(expected_raw) + except (TypeError, ValueError) as err: + return GradeResult( + grading_method="numeric_match", + passed=False, + rationale=f"could not parse numbers: {err}", + details={"expected": expected_raw, "actual": answer}, + ) + + passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) + return GradeResult( + grading_method="numeric_match", + passed=passed, + score=1.0 if passed else 0.0, + rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", + details={"expected": e, "actual": a, "tolerance": tolerance}, + ) + + +register("exact_string_match", exact_string_match) +register("numeric_match", numeric_match) diff --git a/src/evaluation/graders/llm_judge.py b/src/evaluation/graders/llm_judge.py new file mode 100644 index 000000000..fb55bf73f --- /dev/null +++ b/src/evaluation/graders/llm_judge.py @@ -0,0 +1,144 @@ +"""LLM-judge grader. + +Free-form answers are scored against ``scenario.characteristic_form`` +using a six-criterion rubric (task completion, data retrieval accuracy, +result verification, agent sequence, clarity, hallucinations) — the +same shape as ``aobench/scenario-server/grading/graders.evaluation_agent`` +but built directly on :class:`~llm.LLMBackend` so the evaluation module +has no dependency on the scenario-server codebase. +""" + +from __future__ import annotations + +import json +import logging +import re + +from llm import LLMBackend + +from ..models import GradeResult, Scenario +from . import register + +_log = logging.getLogger(__name__) + +_RUBRIC_KEYS = ( + "task_completion", + "data_retrieval_accuracy", + "generalized_result_verification", + "agent_sequence_correct", + "clarity_and_justification", + "hallucinations", +) + +_PROMPT_TEMPLATE = """You are an evaluation judge for an industrial-asset-operations agent. + +Score the agent response against the expected characteristic answer using the six criteria below. Respond ONLY with a JSON object, no prose. 
+
+QUESTION:
+{question}
+
+EXPECTED CHARACTERISTIC:
+{characteristic}
+
+AGENT RESPONSE:
+{answer}
+
+AGENT TRAJECTORY (turns / tool calls / outputs):
+{trajectory}
+
+Return JSON with these boolean fields plus a one-sentence reason:
+
+{{
+  "task_completion": <true|false>,
+  "data_retrieval_accuracy": <true|false>,
+  "generalized_result_verification": <true|false>,
+  "agent_sequence_correct": <true|false>,
+  "clarity_and_justification": <true|false>,
+  "hallucinations": <true|false>,
+  "reason": "<one short sentence>"
+}}
+
+The agent passes overall iff the first five are true AND hallucinations is false."""
+
+
+class LLMJudgeGrader:
+    """Closure-style grader that holds an :class:`LLMBackend`."""
+
+    def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None:
+        self._llm = llm
+        self.name = name
+
+    def __call__(
+        self, scenario: Scenario, answer: str, trajectory_text: str
+    ) -> GradeResult:
+        characteristic = scenario.characteristic_form or scenario.expected_answer or ""
+        if not characteristic:
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale="scenario has neither characteristic_form nor expected_answer",
+            )
+
+        prompt = _PROMPT_TEMPLATE.format(
+            question=scenario.text,
+            characteristic=characteristic,
+            answer=answer,
+            trajectory=trajectory_text[:8000],
+        )
+
+        try:
+            raw = self._llm.generate(prompt)
+        except Exception as exc: # judge call failure is a grading failure, not a crash
+            _log.exception("llm_judge: backend error")
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale=f"judge backend error: {exc}",
+            )
+
+        review = _parse_review(raw)
+        if review is None:
+            return GradeResult(
+                grading_method=self.name,
+                passed=False,
+                rationale="judge returned unparseable JSON",
+                details={"raw": raw[:2000]},
+            )
+
+        passed = (
+            review.get("task_completion") is True
+            and review.get("data_retrieval_accuracy") is True
+            and review.get("generalized_result_verification") is True
+            and review.get("agent_sequence_correct") is True
+            and review.get("clarity_and_justification") is True
+            and review.get("hallucinations") is False
+        )
+        score = sum(1 for k in _RUBRIC_KEYS[:5] if review.get(k) is True) / 5.0
+        if review.get("hallucinations") is True:
+            score = max(0.0, score - 0.2)
+
+        return GradeResult(
+            grading_method=self.name,
+            passed=passed,
+            score=round(score, 3),
+            rationale=str(review.get("reason", ""))[:500],
+            details=review,
+        )
+
+
+def _parse_review(raw: str) -> dict | None:
+    if not raw:
+        return None
+    # Tolerate leading prose / markdown fences by extracting the first {...} block.
+    match = re.search(r"\{.*\}", raw, re.DOTALL)
+    if not match:
+        return None
+    try:
+        return json.loads(match.group(0))
+    except json.JSONDecodeError:
+        return None
+
+
+def install(llm: LLMBackend, name: str = "llm_judge") -> None:
+    """Register an LLM-judge grader bound to ``llm`` under ``name``."""
+    register(name, LLMJudgeGrader(llm, name=name))
diff --git a/src/evaluation/loader.py b/src/evaluation/loader.py
new file mode 100644
index 000000000..31b9c761b
--- /dev/null
+++ b/src/evaluation/loader.py
@@ -0,0 +1,93 @@
+"""Load trajectories and scenarios, then join them by ``scenario_id``."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Iterable, Iterator
+
+from .models import PersistedTrajectory, Scenario
+
+_log = logging.getLogger(__name__)
+
+
+def load_trajectories(path: Path) -> list[PersistedTrajectory]:
+    """Load every ``*.json`` trajectory under ``path``.
+ + ``path`` may be a directory (the ``AGENT_TRAJECTORY_DIR`` layout) or + a single JSON file. Files that fail to parse are logged and + skipped — a partial directory should still yield a usable batch. + """ + p = Path(path) + if p.is_file(): + return [_load_one(p)] if p.suffix == ".json" else [] + + out: list[PersistedTrajectory] = [] + for child in sorted(p.glob("*.json")): + try: + out.append(_load_one(child)) + except Exception: + _log.exception("loader: failed to parse %s", child) + return out + + +def _load_one(path: Path) -> PersistedTrajectory: + raw = json.loads(path.read_text(encoding="utf-8")) + return PersistedTrajectory.from_raw(raw) + + +def load_scenarios(paths: Iterable[Path] | Path) -> list[Scenario]: + """Load scenarios from one or more files. + + Each file may be a JSON list, a single JSON object, or JSONL. + Scenario IDs are coerced to strings to make the join key uniform + (CouchDB-style trajectories use string IDs; local JSON files use + ints). + """ + if isinstance(paths, (str, Path)): + paths = [Path(paths)] + + out: list[Scenario] = [] + for p in paths: + out.extend(_load_scenario_file(Path(p))) + return out + + +def _load_scenario_file(path: Path) -> list[Scenario]: + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + + if path.suffix == ".jsonl": + return [ + Scenario.from_raw(json.loads(line)) + for line in text.splitlines() + if line.strip() + ] + + raw = json.loads(text) + if isinstance(raw, list): + return [Scenario.from_raw(item) for item in raw] + if isinstance(raw, dict): + return [Scenario.from_raw(raw)] + raise ValueError(f"unexpected scenario JSON shape in {path}: {type(raw).__name__}") + + +def join_records( + scenarios: list[Scenario], + trajectories: list[PersistedTrajectory], +) -> Iterator[tuple[Scenario, PersistedTrajectory]]: + """Yield (scenario, trajectory) pairs joined on ``scenario_id``. + + Scenarios with no matching trajectory and trajectories with no + matching scenario are silently dropped — the caller can compute the + diff from the input lists if reporting is needed. + """ + by_id: dict[str, Scenario] = {s.id: s for s in scenarios} + for traj in trajectories: + if traj.scenario_id is None: + continue + scenario = by_id.get(traj.scenario_id) + if scenario is not None: + yield scenario, traj diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py new file mode 100644 index 000000000..325074a7e --- /dev/null +++ b/src/evaluation/metrics.py @@ -0,0 +1,125 @@ +"""Operational metric extraction and aggregation.""" + +from __future__ import annotations + +import statistics +from typing import Any + +from .models import AggregateOps, OpsMetrics, PersistedTrajectory, ScenarioResult + +# USD per 1M tokens, rough public list-prices. None when unknown. Used +# only for the optional ``est_cost_usd`` rollup; consumers should treat +# it as an estimate, not a billing source of truth. 
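+# A worked example with illustrative token counts: 10,000 input and
+# 2,000 output tokens on claude-sonnet-4-6 at (3.0, 15.0) USD/1M gives
+# (10_000 * 3.0 + 2_000 * 15.0) / 1_000_000 = $0.06.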
+_PRICE_PER_1M: dict[str, tuple[float, float]] = { + "claude-opus-4-5": (15.0, 75.0), + "claude-opus-4-1": (15.0, 75.0), + "claude-sonnet-4-6": (3.0, 15.0), + "claude-haiku-4-5": (1.0, 5.0), + "gpt-5": (10.0, 30.0), + "gpt-4.1": (3.0, 12.0), + "gpt-4o": (2.5, 10.0), + "llama-4-maverick": (0.27, 0.85), +} + + +def metrics_from_trajectory(record: PersistedTrajectory) -> OpsMetrics: + """Extract per-task ops metrics from a persisted trajectory record.""" + traj = record.trajectory + if traj is None: + return OpsMetrics() + + if isinstance(traj, dict) and "turns" in traj: + return _from_sdk_trajectory(traj, record.model) + if isinstance(traj, list): + return _from_plan_execute(traj, record.model) + return OpsMetrics() + + +def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics: + turns = traj.get("turns", []) or [] + tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns) + tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns) + + durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None] + duration_ms = sum(durations_ms) if durations_ms else None + + tool_names: list[str] = [] + for t in turns: + for tc in t.get("tool_calls") or []: + name = tc.get("name") + if name: + tool_names.append(name) + + return OpsMetrics( + turn_count=len(turns), + tool_call_count=len(tool_names), + unique_tools=sorted(set(tool_names)), + tokens_in=tokens_in, + tokens_out=tokens_out, + duration_ms=duration_ms, + est_cost_usd=_estimate_cost(model, tokens_in, tokens_out), + ) + + +def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics: + # plan-execute persists ``list[StepResult]``; the dataclass exposes + # ``server`` / ``tool`` / ``response`` fields but no per-step token + # counts, so we surface what is available and leave the rest at zero. + tool_names = [ + s.get("tool") + for s in steps + if isinstance(s, dict) and s.get("tool") + ] + return OpsMetrics( + turn_count=len(steps), + tool_call_count=len(tool_names), + unique_tools=sorted(set(tool_names)), + est_cost_usd=_estimate_cost(model, 0, 0), + ) + + +def _estimate_cost(model: str, tokens_in: int, tokens_out: int) -> float | None: + if not model or (tokens_in == 0 and tokens_out == 0): + return None + key = _normalize_model(model) + rate = _PRICE_PER_1M.get(key) + if rate is None: + return None + in_rate, out_rate = rate + return round((tokens_in * in_rate + tokens_out * out_rate) / 1_000_000, 6) + + +def _normalize_model(model: str) -> str: + # Strip provider prefixes like ``litellm_proxy/anthropic/`` and + # version suffixes like ``-20250101``. 
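+    # e.g. "litellm_proxy/anthropic/claude-opus-4-5-20250101"
+    #   -> "claude-opus-4-5", matching the _PRICE_PER_1M keys above.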
+ tail = model.rsplit("/", 1)[-1].lower() + parts = tail.split("-") + if parts and parts[-1].isdigit() and len(parts[-1]) >= 6: + parts = parts[:-1] + return "-".join(parts) + + +def aggregate_ops(results: list[ScenarioResult]) -> AggregateOps: + if not results: + return AggregateOps() + + durations = [r.ops.duration_ms for r in results if r.ops.duration_ms is not None] + costs = [r.ops.est_cost_usd for r in results if r.ops.est_cost_usd is not None] + + return AggregateOps( + tokens_in_total=sum(r.ops.tokens_in for r in results), + tokens_out_total=sum(r.ops.tokens_out for r in results), + duration_ms_p50=_percentile(durations, 50), + duration_ms_p95=_percentile(durations, 95), + tool_calls_total=sum(r.ops.tool_call_count for r in results), + est_cost_usd_total=round(sum(costs), 6) if costs else None, + ) + + +def _percentile(values: list[float], pct: float) -> float | None: + if not values: + return None + if len(values) == 1: + return float(values[0]) + quantiles = statistics.quantiles(values, n=100, method="inclusive") + return float(quantiles[int(pct) - 1]) diff --git a/src/evaluation/models.py b/src/evaluation/models.py new file mode 100644 index 000000000..846e98911 --- /dev/null +++ b/src/evaluation/models.py @@ -0,0 +1,110 @@ +"""Pydantic models for the offline evaluation pipeline.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + + +class Scenario(BaseModel): + """One evaluation scenario. + + Mirrors the on-disk shape under ``src/scenarios/`` and is permissive + via ``extra='allow'`` so domain-specific fields (e.g. category, + characteristic_form) survive the round-trip. + """ + + model_config = ConfigDict(extra="allow") + + id: str + text: str + type: str = "" + category: str = "" + characteristic_form: str | None = None + expected_answer: str | None = None + grading_method: str | None = None + + @classmethod + def from_raw(cls, raw: dict) -> "Scenario": + d = dict(raw) + if "id" in d: + d["id"] = str(d["id"]) + return cls.model_validate(d) + + +class PersistedTrajectory(BaseModel): + """Record written by ``observability.persistence.persist_trajectory``.""" + + model_config = ConfigDict(extra="allow") + + run_id: str + scenario_id: str | None = None + runner: str + model: str + question: str + answer: str + trajectory: Any = None + + @classmethod + def from_raw(cls, raw: dict) -> "PersistedTrajectory": + d = dict(raw) + if d.get("scenario_id") is not None: + d["scenario_id"] = str(d["scenario_id"]) + return cls.model_validate(d) + + +class OpsMetrics(BaseModel): + """Per-task operational metrics derived from a trajectory.""" + + turn_count: int = 0 + tool_call_count: int = 0 + unique_tools: list[str] = Field(default_factory=list) + tokens_in: int = 0 + tokens_out: int = 0 + duration_ms: float | None = None + est_cost_usd: float | None = None + + +class GradeResult(BaseModel): + grading_method: str + passed: bool + score: float = 0.0 + rationale: str = "" + details: dict[str, Any] = Field(default_factory=dict) + + +class ScenarioResult(BaseModel): + scenario_id: str + scenario_type: str = "" + runner: str + model: str + question: str + answer: str + grade: GradeResult + ops: OpsMetrics + + +class AggregateOps(BaseModel): + tokens_in_total: int = 0 + tokens_out_total: int = 0 + duration_ms_p50: float | None = None + duration_ms_p95: float | None = None + tool_calls_total: int = 0 + est_cost_usd_total: float | None = None + + +class TypeBreakdown(BaseModel): + total: int = 0 + passed: int = 0 + pass_rate: float 
= 0.0 + + +class EvalReport(BaseModel): + generated_at: str + runners: list[str] = Field(default_factory=list) + models: list[str] = Field(default_factory=list) + totals: dict[str, Any] = Field(default_factory=dict) + by_scenario_type: dict[str, TypeBreakdown] = Field(default_factory=dict) + ops: AggregateOps = Field(default_factory=AggregateOps) + results: list[ScenarioResult] = Field(default_factory=list) diff --git a/src/evaluation/report.py b/src/evaluation/report.py new file mode 100644 index 000000000..72ff9b0e2 --- /dev/null +++ b/src/evaluation/report.py @@ -0,0 +1,87 @@ +"""Build an :class:`EvalReport` from graded scenario results.""" + +from __future__ import annotations + +import datetime as _dt +import json +from collections import defaultdict +from pathlib import Path + +from .metrics import aggregate_ops +from .models import EvalReport, ScenarioResult, TypeBreakdown + + +def build_report(results: list[ScenarioResult]) -> EvalReport: + total = len(results) + passed = sum(1 for r in results if r.grade.passed) + + by_type: dict[str, list[ScenarioResult]] = defaultdict(list) + for r in results: + by_type[r.scenario_type or "unknown"].append(r) + + breakdown: dict[str, TypeBreakdown] = {} + for stype, items in by_type.items(): + n = len(items) + p = sum(1 for r in items if r.grade.passed) + breakdown[stype] = TypeBreakdown( + total=n, + passed=p, + pass_rate=round(p / n, 4) if n else 0.0, + ) + + return EvalReport( + generated_at=_dt.datetime.now(_dt.timezone.utc).isoformat(), + runners=sorted({r.runner for r in results}), + models=sorted({r.model for r in results}), + totals={ + "scenarios": total, + "graded": total, + "passed": passed, + "pass_rate": round(passed / total, 4) if total else 0.0, + }, + by_scenario_type=breakdown, + ops=aggregate_ops(results), + results=results, + ) + + +def write_report(report: EvalReport, output: Path) -> Path: + output = Path(output) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(report.model_dump_json(indent=2), encoding="utf-8") + return output + + +def render_summary(report: EvalReport) -> str: + lines: list[str] = [] + t = report.totals + lines.append( + f"Scenarios: {t.get('scenarios', 0)} " + f"Passed: {t.get('passed', 0)} " + f"Pass rate: {t.get('pass_rate', 0):.1%}" + ) + if report.by_scenario_type: + lines.append("") + lines.append("By scenario type:") + for stype, b in sorted(report.by_scenario_type.items()): + lines.append( + f" {stype:<16} {b.passed:>4}/{b.total:<4} ({b.pass_rate:.1%})" + ) + o = report.ops + lines.append("") + lines.append("Operational metrics:") + lines.append(f" tokens_in_total: {o.tokens_in_total}") + lines.append(f" tokens_out_total: {o.tokens_out_total}") + lines.append(f" tool_calls_total: {o.tool_calls_total}") + if o.duration_ms_p50 is not None: + lines.append(f" duration_ms_p50: {o.duration_ms_p50:.1f}") + if o.duration_ms_p95 is not None: + lines.append(f" duration_ms_p95: {o.duration_ms_p95:.1f}") + if o.est_cost_usd_total is not None: + lines.append(f" est_cost_usd: ${o.est_cost_usd_total:.4f}") + return "\n".join(lines) + + +def report_to_json(report: EvalReport) -> str: + """Convenience JSON dump that round-trips through pydantic.""" + return json.dumps(json.loads(report.model_dump_json()), indent=2) diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py new file mode 100644 index 000000000..f87da2fa6 --- /dev/null +++ b/src/evaluation/runner.py @@ -0,0 +1,68 @@ +"""Glue: load → grade → assemble report.""" + +from __future__ import annotations + +import json +import 
logging +from pathlib import Path + +from . import graders as grader_registry +from .loader import join_records, load_scenarios, load_trajectories +from .metrics import metrics_from_trajectory +from .models import EvalReport, PersistedTrajectory, Scenario, ScenarioResult +from .report import build_report + +_log = logging.getLogger(__name__) + + +def evaluate( + *, + trajectories_path: Path, + scenarios_paths: list[Path], + default_grading_method: str = "llm_judge", +) -> EvalReport: + """Load, grade, and aggregate. + + Per-scenario grader is picked from ``scenario.grading_method`` when + set, falling back to ``default_grading_method``. + """ + scenarios = load_scenarios(scenarios_paths) + trajectories = load_trajectories(trajectories_path) + + results: list[ScenarioResult] = [] + for scenario, traj in join_records(scenarios, trajectories): + results.append(_grade_one(scenario, traj, default_grading_method)) + + return build_report(results) + + +def _grade_one( + scenario: Scenario, + traj: PersistedTrajectory, + default_grading_method: str, +) -> ScenarioResult: + method = scenario.grading_method or default_grading_method + grader = grader_registry.get(method) + trajectory_text = _trajectory_to_text(traj) + grade = grader(scenario, traj.answer, trajectory_text) + + return ScenarioResult( + scenario_id=scenario.id, + scenario_type=scenario.type, + runner=traj.runner, + model=traj.model, + question=traj.question, + answer=traj.answer, + grade=grade, + ops=metrics_from_trajectory(traj), + ) + + +def _trajectory_to_text(traj: PersistedTrajectory) -> str: + """Flatten a trajectory to a text blob for the LLM judge prompt.""" + if traj.trajectory is None: + return "" + try: + return json.dumps(traj.trajectory, indent=2, default=str) + except (TypeError, ValueError): + return str(traj.trajectory) diff --git a/src/evaluation/tests/__init__.py b/src/evaluation/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/evaluation/tests/conftest.py b/src/evaluation/tests/conftest.py new file mode 100644 index 000000000..65eedf7d9 --- /dev/null +++ b/src/evaluation/tests/conftest.py @@ -0,0 +1,72 @@ +"""Shared fixtures for evaluation unit tests.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from evaluation.models import Scenario + + +@pytest.fixture +def make_scenario(): + def _factory(**overrides) -> Scenario: + defaults = { + "id": "1", + "text": "What sensors are on Chiller 6?", + "type": "iot", + "category": "Knowledge Query", + "characteristic_form": "Should list temperature, pressure, vibration sensors.", + } + defaults.update(overrides) + return Scenario.from_raw(defaults) + + return _factory + + +@pytest.fixture +def make_persisted_record(): + def _factory(**overrides) -> dict: + defaults = { + "run_id": "run-1", + "scenario_id": "1", + "runner": "plan-execute", + "model": "watsonx/ibm/granite", + "question": "Q?", + "answer": "A.", + "trajectory": { + "turns": [ + { + "index": 0, + "text": "thinking", + "tool_calls": [{"name": "sites", "input": {}}], + "input_tokens": 10, + "output_tokens": 5, + "duration_ms": 100.0, + }, + { + "index": 1, + "text": "answer", + "tool_calls": [], + "input_tokens": 12, + "output_tokens": 7, + "duration_ms": 200.0, + }, + ], + "started_at": "2026-04-27T00:00:00Z", + }, + } + defaults.update(overrides) + return defaults + + return _factory + + +@pytest.fixture +def trajectory_dir(tmp_path: Path, make_persisted_record): + """A directory pre-populated with one trajectory JSON file.""" + 
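+    # Mirrors the AGENT_TRAJECTORY_DIR layout: one {run_id}.json per run.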
rec = make_persisted_record() + (tmp_path / f"{rec['run_id']}.json").write_text(json.dumps(rec), encoding="utf-8") + return tmp_path diff --git a/src/evaluation/tests/test_graders.py b/src/evaluation/tests/test_graders.py new file mode 100644 index 000000000..9002e5f95 --- /dev/null +++ b/src/evaluation/tests/test_graders.py @@ -0,0 +1,120 @@ +"""Tests for deterministic + LLM-judge graders.""" + +from __future__ import annotations + +from evaluation import graders as registry +from evaluation.graders.deterministic import exact_string_match, numeric_match +from evaluation.graders.llm_judge import LLMJudgeGrader, install +from llm import LLMBackend + + +class _StubLLM(LLMBackend): + def __init__(self, response: str) -> None: + self._response = response + + def generate(self, prompt: str, temperature: float = 0.0) -> str: + return self._response + + +class TestExactStringMatch: + def test_match_case_insensitive(self, make_scenario): + s = make_scenario(expected_answer="Hello World") + r = exact_string_match(s, "hello world", "") + assert r.passed and r.score == 1.0 + + def test_mismatch(self, make_scenario): + s = make_scenario(expected_answer="foo") + r = exact_string_match(s, "bar", "") + assert not r.passed + assert r.details["expected"] == "foo" + + def test_missing_expected(self, make_scenario): + s = make_scenario(expected_answer=None) + r = exact_string_match(s, "anything", "") + assert not r.passed + assert "expected_answer" in r.rationale + + +class TestNumericMatch: + def test_within_tolerance(self, make_scenario): + s = make_scenario(expected_answer="3.14159") + r = numeric_match(s, "3.141591", "") + assert r.passed + + def test_unparseable(self, make_scenario): + s = make_scenario(expected_answer="3.14") + r = numeric_match(s, "not a number", "") + assert not r.passed + assert "could not parse" in r.rationale + + def test_custom_tolerance(self, make_scenario): + s = make_scenario(expected_answer="100", tolerance=0.05) + r = numeric_match(s, "104", "") + assert r.passed + + +class TestRegistry: + def test_deterministic_graders_registered(self): + assert "exact_string_match" in registry.names() + assert "numeric_match" in registry.names() + + def test_get_unknown_raises(self): + try: + registry.get("does_not_exist") + except KeyError as e: + assert "does_not_exist" in str(e) + else: + raise AssertionError("expected KeyError") + + +class TestLLMJudgeGrader: + def _all_pass_response(self) -> str: + return ( + '{"task_completion": true, "data_retrieval_accuracy": true, ' + '"generalized_result_verification": true, "agent_sequence_correct": true, ' + '"clarity_and_justification": true, "hallucinations": false, ' + '"reason": "Looks good."}' + ) + + def test_passes_when_all_criteria_true(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + r = grader(make_scenario(), "answer", "trajectory") + assert r.passed + assert r.score == 1.0 + assert r.rationale == "Looks good." + + def test_fails_on_hallucination(self, make_scenario): + resp = self._all_pass_response().replace( + '"hallucinations": false', '"hallucinations": true' + ) + grader = LLMJudgeGrader(_StubLLM(resp)) + r = grader(make_scenario(), "answer", "trajectory") + assert not r.passed + # Score is penalized but not zeroed when 5/5 criteria pass. 
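+        # Here 5/5 criteria pass (1.0) and the 0.2 hallucination
+        # penalty brings the score down to 0.8.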
+ assert r.score < 1.0 + + def test_handles_unparseable_response(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM("not json at all")) + r = grader(make_scenario(), "a", "t") + assert not r.passed + assert "unparseable" in r.rationale + + def test_handles_markdown_fenced_response(self, make_scenario): + wrapped = "Here you go:\n```json\n" + self._all_pass_response() + "\n```" + grader = LLMJudgeGrader(_StubLLM(wrapped)) + r = grader(make_scenario(), "a", "t") + assert r.passed + + def test_missing_characteristic_short_circuits(self, make_scenario): + grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + s = make_scenario(characteristic_form=None, expected_answer=None) + r = grader(s, "a", "t") + assert not r.passed + assert "characteristic_form" in r.rationale + + def test_install_registers_under_default_name(self, make_scenario): + install(_StubLLM(self._all_pass_response())) + assert "llm_judge" in registry.names() + grader = registry.get("llm_judge") + r = grader(make_scenario(), "a", "t") + assert r.passed diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py new file mode 100644 index 000000000..24260b34b --- /dev/null +++ b/src/evaluation/tests/test_loader.py @@ -0,0 +1,72 @@ +"""Tests for the trajectory + scenario loader.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.loader import ( + join_records, + load_scenarios, + load_trajectories, +) +from evaluation.models import Scenario + + +def test_load_trajectories_from_dir(trajectory_dir: Path): + records = load_trajectories(trajectory_dir) + assert len(records) == 1 + assert records[0].run_id == "run-1" + assert records[0].scenario_id == "1" + + +def test_load_trajectories_skips_unparseable(tmp_path: Path, make_persisted_record): + (tmp_path / "good.json").write_text(json.dumps(make_persisted_record()), encoding="utf-8") + (tmp_path / "bad.json").write_text("{not json", encoding="utf-8") + records = load_trajectories(tmp_path) + assert len(records) == 1 + + +def test_load_scenarios_json_list(tmp_path: Path): + p = tmp_path / "s.json" + p.write_text( + json.dumps( + [{"id": 1, "text": "Q1"}, {"id": "2", "text": "Q2"}] + ), + encoding="utf-8", + ) + out = load_scenarios(p) + assert [s.id for s in out] == ["1", "2"] + + +def test_load_scenarios_jsonl(tmp_path: Path): + p = tmp_path / "s.jsonl" + p.write_text( + '{"id": 1, "text": "Q1"}\n{"id": 2, "text": "Q2"}\n', + encoding="utf-8", + ) + out = load_scenarios(p) + assert [s.id for s in out] == ["1", "2"] + + +def test_load_scenarios_single_object(tmp_path: Path): + p = tmp_path / "s.json" + p.write_text(json.dumps({"id": 7, "text": "Q"}), encoding="utf-8") + out = load_scenarios(p) + assert [s.id for s in out] == ["7"] + + +def test_join_drops_orphans(make_persisted_record): + from evaluation.models import PersistedTrajectory + + scenarios = [ + Scenario.from_raw({"id": 1, "text": "Q1"}), + Scenario.from_raw({"id": 2, "text": "Q2"}), + ] + trajs = [ + PersistedTrajectory.from_raw(make_persisted_record(scenario_id=1)), + PersistedTrajectory.from_raw(make_persisted_record(run_id="r2", scenario_id=99)), + ] + pairs = list(join_records(scenarios, trajs)) + assert len(pairs) == 1 + assert pairs[0][0].id == "1" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py new file mode 100644 index 000000000..80cdef621 --- /dev/null +++ b/src/evaluation/tests/test_metrics.py @@ -0,0 +1,101 @@ +"""Tests for ops metrics extraction and aggregation.""" + +from 
__future__ import annotations + +from evaluation.metrics import ( + _normalize_model, + aggregate_ops, + metrics_from_trajectory, +) +from evaluation.models import ( + GradeResult, + OpsMetrics, + PersistedTrajectory, + ScenarioResult, +) + + +def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResult: + return ScenarioResult( + scenario_id="1", + scenario_type="iot", + runner="plan-execute", + model="watsonx/ibm/granite", + question="q", + answer="a", + grade=GradeResult(grading_method="exact_string_match", passed=passed), + ops=ops or OpsMetrics(), + ) + + +class TestMetricsFromTrajectory: + def test_sdk_trajectory_sums_per_turn(self, make_persisted_record): + rec = PersistedTrajectory.from_raw(make_persisted_record()) + m = metrics_from_trajectory(rec) + assert m.turn_count == 2 + assert m.tokens_in == 22 + assert m.tokens_out == 12 + assert m.tool_call_count == 1 + assert m.unique_tools == ["sites"] + assert m.duration_ms == 300.0 + + def test_handles_none_trajectory(self, make_persisted_record): + rec = PersistedTrajectory.from_raw(make_persisted_record(trajectory=None)) + assert metrics_from_trajectory(rec) == OpsMetrics() + + def test_plan_execute_list_trajectory(self, make_persisted_record): + rec = PersistedTrajectory.from_raw( + make_persisted_record( + trajectory=[ + {"step_number": 1, "task": "t", "server": "iot", "tool": "sites", "response": "ok"}, + {"step_number": 2, "task": "t2", "server": "iot", "tool": "assets", "response": "ok"}, + {"step_number": 3, "task": "t3", "server": "iot", "tool": "sites", "response": "ok"}, + ] + ) + ) + m = metrics_from_trajectory(rec) + assert m.turn_count == 3 + assert m.tool_call_count == 3 + assert m.unique_tools == ["assets", "sites"] + + +class TestAggregateOps: + def test_empty(self): + agg = aggregate_ops([]) + assert agg.tokens_in_total == 0 + assert agg.duration_ms_p50 is None + + def test_sums_and_percentiles(self): + results = [ + _result(ops=OpsMetrics(tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1)), + _result(ops=OpsMetrics(tokens_in=20, tokens_out=10, duration_ms=300.0, tool_call_count=2)), + _result(ops=OpsMetrics(tokens_in=30, tokens_out=15, duration_ms=500.0, tool_call_count=3)), + ] + agg = aggregate_ops(results) + assert agg.tokens_in_total == 60 + assert agg.tokens_out_total == 30 + assert agg.tool_calls_total == 6 + assert agg.duration_ms_p50 is not None + assert agg.duration_ms_p95 is not None + assert agg.duration_ms_p50 <= agg.duration_ms_p95 + + def test_cost_only_when_some_present(self): + results = [ + _result(ops=OpsMetrics(est_cost_usd=0.01)), + _result(ops=OpsMetrics(est_cost_usd=0.02)), + ] + agg = aggregate_ops(results) + assert agg.est_cost_usd_total == 0.03 + + +class TestNormalizeModel: + def test_strips_provider_prefix(self): + assert _normalize_model("litellm_proxy/anthropic/claude-opus-4-5") == "claude-opus-4-5" + assert _normalize_model("watsonx/ibm/granite-13b") == "granite-13b" + + def test_strips_long_numeric_suffix(self): + assert _normalize_model("claude-opus-4-5-20250101") == "claude-opus-4-5" + + def test_keeps_short_numeric_suffix(self): + # "4-5" suffix is the model version, not a date — leave it intact. 
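+        # _normalize_model only strips all-digit tails of length >= 6,
+        # i.e. date stamps, so "5" survives.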
+ assert _normalize_model("claude-opus-4-5") == "claude-opus-4-5" diff --git a/src/evaluation/tests/test_models.py b/src/evaluation/tests/test_models.py new file mode 100644 index 000000000..4aca4d551 --- /dev/null +++ b/src/evaluation/tests/test_models.py @@ -0,0 +1,45 @@ +"""Tests for evaluation Pydantic models.""" + +from evaluation.models import PersistedTrajectory, Scenario + + +def test_scenario_from_raw_coerces_int_id_to_str(): + s = Scenario.from_raw({"id": 301, "text": "Q"}) + assert s.id == "301" + assert isinstance(s.id, str) + + +def test_scenario_preserves_extra_fields(): + s = Scenario.from_raw({"id": "1", "text": "Q", "characteristic_form": "X", "tolerance": 0.01}) + extra = s.model_extra or {} + assert extra.get("tolerance") == 0.01 + + +def test_persisted_trajectory_coerces_scenario_id(): + t = PersistedTrajectory.from_raw( + { + "run_id": "r", + "scenario_id": 42, + "runner": "plan-execute", + "model": "m", + "question": "q", + "answer": "a", + "trajectory": None, + } + ) + assert t.scenario_id == "42" + + +def test_persisted_trajectory_allows_none_scenario_id(): + t = PersistedTrajectory.from_raw( + { + "run_id": "r", + "scenario_id": None, + "runner": "plan-execute", + "model": "m", + "question": "q", + "answer": "a", + "trajectory": None, + } + ) + assert t.scenario_id is None diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py new file mode 100644 index 000000000..14816832a --- /dev/null +++ b/src/evaluation/tests/test_report.py @@ -0,0 +1,74 @@ +"""Tests for EvalReport assembly and serialization.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.models import ( + GradeResult, + OpsMetrics, + ScenarioResult, +) +from evaluation.report import build_report, render_summary, write_report + + +def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: + return ScenarioResult( + scenario_id="x", + scenario_type=stype, + runner="plan-execute", + model="watsonx/ibm/granite", + question="q", + answer="a", + grade=GradeResult(grading_method="llm_judge", passed=passed, score=1.0 if passed else 0.0), + ops=OpsMetrics(**ops_kwargs), + ) + + +def test_build_report_totals_and_breakdown(): + results = [ + _result("iot", True, tokens_in=10, tokens_out=5), + _result("iot", False, tokens_in=8, tokens_out=4), + _result("tsfm", True, tokens_in=20, tokens_out=10), + ] + report = build_report(results) + + assert report.totals == { + "scenarios": 3, + "graded": 3, + "passed": 2, + "pass_rate": round(2 / 3, 4), + } + assert report.by_scenario_type["iot"].total == 2 + assert report.by_scenario_type["iot"].passed == 1 + assert report.by_scenario_type["tsfm"].pass_rate == 1.0 + assert report.ops.tokens_in_total == 38 + + +def test_build_report_handles_empty(): + report = build_report([]) + assert report.totals["scenarios"] == 0 + assert report.totals["pass_rate"] == 0.0 + assert report.by_scenario_type == {} + + +def test_write_report_round_trips(tmp_path: Path): + results = [_result("iot", True)] + report = build_report(results) + out = write_report(report, tmp_path / "nested" / "report.json") + assert out.exists() + data = json.loads(out.read_text(encoding="utf-8")) + assert data["totals"]["passed"] == 1 + assert data["by_scenario_type"]["iot"]["pass_rate"] == 1.0 + + +def test_render_summary_includes_headlines(): + results = [ + _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1), + _result("iot", False, tokens_in=8, tokens_out=4, duration_ms=200.0), + ] + text 
= render_summary(build_report(results)) + assert "Pass rate" in text + assert "iot" in text + assert "tokens_in_total" in text diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py new file mode 100644 index 000000000..ffab1688f --- /dev/null +++ b/src/evaluation/tests/test_runner.py @@ -0,0 +1,76 @@ +"""Smoke test for the end-to-end evaluation runner.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation.models import GradeResult, Scenario +from evaluation.runner import evaluate +from evaluation import graders as registry + + +def _always_pass_grader(scenario: Scenario, answer: str, trajectory_text: str) -> GradeResult: + return GradeResult(grading_method="stub", passed=True, score=1.0) + + +def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): + # Two trajectories, both joinable to scenarios. + rec_a = make_persisted_record(run_id="run-a", scenario_id=1, answer="A") + rec_b = make_persisted_record(run_id="run-b", scenario_id=2, answer="B") + (tmp_path / "run-a.json").write_text(json.dumps(rec_a), encoding="utf-8") + (tmp_path / "run-b.json").write_text(json.dumps(rec_b), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + {"id": 1, "text": "Q1", "type": "iot"}, + {"id": 2, "text": "Q2", "type": "tsfm"}, + ] + ), + encoding="utf-8", + ) + + registry.register("stub", _always_pass_grader) + + report = evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + default_grading_method="stub", + ) + + assert report.totals["scenarios"] == 2 + assert report.totals["passed"] == 2 + assert set(report.by_scenario_type.keys()) == {"iot", "tsfm"} + assert report.ops.tokens_in_total > 0 + + +def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): + rec = make_persisted_record(run_id="run-x", scenario_id=1) + (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + { + "id": 1, + "text": "Q", + "type": "iot", + "expected_answer": "A.", + "grading_method": "exact_string_match", + } + ] + ), + encoding="utf-8", + ) + + report = evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + default_grading_method="numeric_match", # would fail; per-scenario override wins + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.grading_method == "exact_string_match" From 079bf6ad11fa50f6d60d79b79f4c7d8ddb4818c4 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 11:59:58 -0400 Subject: [PATCH 2/8] refactor(evaluation): adopt MLflow Evaluator/Scorer concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer feedback (PR #280): align with MLflow's evaluator/scorer split. - Rename src/evaluation/graders/ -> src/evaluation/scorers/ and organise by family: code_based (exact/numeric), llm_judge (LLM-As-Judge), semantic (new). - Rename GradeResult -> ScorerResult with field `scorer` (Scenario input field `grading_method` unchanged — input contract preserved). - Add `Evaluator` class as the orchestration entry point; functional `evaluate()` now delegates to it. - Add Semantic-Score scorer using difflib.SequenceMatcher (stdlib only, no extra deps); threshold overridable via scenario.similarity_threshold. - CLI: add --scorer-default (keeps --grader-default as alias). 
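
A minimal sketch of the Semantic-Score idea (the registered name, the
0.8 default threshold, and the exact field handling here are
assumptions; the shipped scorers/semantic.py is authoritative):

    import difflib

    from evaluation.models import Scenario, ScorerResult

    def semantic(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult:
        extra = scenario.model_extra or {}
        # Per-scenario override; the 0.8 default is assumed for this sketch.
        threshold = float(extra.get("similarity_threshold", 0.8))
        expected = (scenario.expected_answer or "").strip().lower()
        ratio = difflib.SequenceMatcher(None, answer.strip().lower(), expected).ratio()
        return ScorerResult(
            scorer="semantic",
            passed=ratio >= threshold,
            score=round(ratio, 3),
            rationale=f"SequenceMatcher ratio {ratio:.3f} vs threshold {threshold}",
        )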
Tests: 7 new (4 semantic + 2 evaluator + 1 registry); 46 total in src/evaluation/, full suite 316 passed. Signed-off-by: Shuxin Lin --- src/evaluation/__init__.py | 14 ++- src/evaluation/cli.py | 31 +++---- src/evaluation/evaluator.py | 87 +++++++++++++++++++ src/evaluation/graders/__init__.py | 36 -------- src/evaluation/models.py | 13 ++- src/evaluation/runner.py | 59 ++----------- src/evaluation/scorers/__init__.py | 44 ++++++++++ .../code_based.py} | 28 +++--- .../{graders => scorers}/llm_judge.py | 32 +++---- src/evaluation/scorers/semantic.py | 51 +++++++++++ src/evaluation/tests/test_evaluator.py | 67 ++++++++++++++ src/evaluation/tests/test_metrics.py | 4 +- src/evaluation/tests/test_report.py | 4 +- src/evaluation/tests/test_runner.py | 12 +-- .../{test_graders.py => test_scorers.py} | 78 ++++++++++++----- 15 files changed, 393 insertions(+), 167 deletions(-) create mode 100644 src/evaluation/evaluator.py delete mode 100644 src/evaluation/graders/__init__.py create mode 100644 src/evaluation/scorers/__init__.py rename src/evaluation/{graders/deterministic.py => scorers/code_based.py} (76%) rename src/evaluation/{graders => scorers}/llm_judge.py (85%) create mode 100644 src/evaluation/scorers/semantic.py create mode 100644 src/evaluation/tests/test_evaluator.py rename src/evaluation/tests/{test_graders.py => test_scorers.py} (57%) diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index ca632ab1b..280da44d5 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -9,26 +9,36 @@ ``run`` (executes the agent — already exists) → ``evaluate`` (this module) → ``report.json``. Re-grading from saved trajectories is first-class. + +The evaluation concept follows MLflow's vocabulary: an +:class:`Evaluator` orchestrates one or more :data:`Scorer` callables +(:class:`ScorerResult` records the outcome). Scorers fall into three +families — Code-Based, LLM-As-Judge, and Semantic-Score — registered +under :mod:`evaluation.scorers`. """ +from .evaluator import Evaluator from .models import ( AggregateOps, EvalReport, - GradeResult, OpsMetrics, PersistedTrajectory, Scenario, ScenarioResult, + ScorerResult, TypeBreakdown, ) +from .scorers import Scorer __all__ = [ "AggregateOps", "EvalReport", - "GradeResult", + "Evaluator", "OpsMetrics", "PersistedTrajectory", "Scenario", "ScenarioResult", + "Scorer", + "ScorerResult", "TypeBreakdown", ] diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index 806452899..cc4ee87ed 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -1,4 +1,4 @@ -"""``uv run evaluate`` — offline grading + report generation.""" +"""``uv run evaluate`` — offline scoring + report generation.""" from __future__ import annotations @@ -7,16 +7,16 @@ import sys from pathlib import Path -from . import graders as grader_registry +from . import scorers as scorer_registry +from .evaluator import Evaluator from .report import render_summary, write_report -from .runner import evaluate def _build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="evaluate", description=( - "Grade saved agent trajectories against scenario files and " + "Score saved agent trajectories against scenario files and " "emit a JSON report." ), ) @@ -40,15 +40,17 @@ def _build_parser() -> argparse.ArgumentParser: help="Path to write the JSON report.", ) p.add_argument( + "--scorer-default", "--grader-default", + dest="scorer_default", default="llm_judge", - help="Grader name when scenario.grading_method is unset. 
" - "Default: llm_judge.", + help="Scorer name when scenario.grading_method is unset. " + "Default: llm_judge. (--grader-default is a legacy alias.)", ) p.add_argument( "--judge-model", default=None, - help="Model id for the LLM judge (e.g. " + help="Model id for the LLM-As-Judge scorer (e.g. " "litellm_proxy/anthropic/claude-opus-4-5). " "Required when any scenario routes to llm_judge.", ) @@ -64,18 +66,18 @@ def _build_parser() -> argparse.ArgumentParser: def _maybe_install_judge(judge_model: str | None) -> None: if not judge_model: return - # Imported lazily so the CLI works for deterministic-only runs even - # if the LiteLLM dep happens to be flaky in the dev environment. + # Imported lazily so the CLI works for code-based-only runs even if + # the LiteLLM dep happens to be flaky in the dev environment. from llm import LiteLLMBackend # type: ignore[import-not-found] - from .graders.llm_judge import install + from .scorers.llm_judge import install install(LiteLLMBackend(model=judge_model)) -def _validate_grader_default(name: str) -> None: +def _validate_scorer_default(name: str) -> None: try: - grader_registry.get(name) + scorer_registry.get(name) except KeyError as exc: raise SystemExit(str(exc)) @@ -88,12 +90,11 @@ def main(argv: list[str] | None = None) -> int: ) _maybe_install_judge(args.judge_model) - _validate_grader_default(args.grader_default) + _validate_scorer_default(args.scorer_default) - report = evaluate( + report = Evaluator(default_scorer=args.scorer_default).evaluate( trajectories_path=args.trajectories, scenarios_paths=list(args.scenarios), - default_grading_method=args.grader_default, ) out = write_report(report, args.output) diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py new file mode 100644 index 000000000..f7d151040 --- /dev/null +++ b/src/evaluation/evaluator.py @@ -0,0 +1,87 @@ +"""Evaluator — orchestrates a set of scorers over a batch of records. + +Mirrors MLflow's evaluator/scorer split: the :class:`Evaluator` owns +the loading + per-record dispatch, while each :data:`Scorer` is a small +callable that produces a single :class:`ScorerResult`. +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from . import scorers as scorer_registry +from .loader import join_records, load_scenarios, load_trajectories +from .metrics import metrics_from_trajectory +from .models import ( + EvalReport, + PersistedTrajectory, + Scenario, + ScenarioResult, + ScorerResult, +) +from .report import build_report +from .scorers import Scorer + +_log = logging.getLogger(__name__) + + +class Evaluator: + """Run a batch of scenarios against their saved trajectories. + + ``default_scorer`` names the registered scorer to use when a + scenario does not set ``grading_method``. Per-scenario overrides + take precedence. 
+ """ + + def __init__(self, default_scorer: str = "llm_judge") -> None: + self.default_scorer = default_scorer + + def evaluate( + self, + *, + trajectories_path: Path, + scenarios_paths: list[Path], + ) -> EvalReport: + scenarios = load_scenarios(scenarios_paths) + trajectories = load_trajectories(trajectories_path) + + results: list[ScenarioResult] = [] + for scenario, traj in join_records(scenarios, trajectories): + results.append(self._score_one(scenario, traj)) + + return build_report(results) + + def _score_one( + self, scenario: Scenario, traj: PersistedTrajectory + ) -> ScenarioResult: + name = scenario.grading_method or self.default_scorer + scorer = self._resolve(name) + trajectory_text = _trajectory_to_text(traj) + grade = scorer(scenario, traj.answer, trajectory_text) + + return ScenarioResult( + scenario_id=scenario.id, + scenario_type=scenario.type, + runner=traj.runner, + model=traj.model, + question=traj.question, + answer=traj.answer, + grade=grade, + ops=metrics_from_trajectory(traj), + ) + + @staticmethod + def _resolve(name: str) -> Scorer: + return scorer_registry.get(name) + + +def _trajectory_to_text(traj: PersistedTrajectory) -> str: + """Flatten a trajectory to a text blob for the LLM-As-Judge prompt.""" + if traj.trajectory is None: + return "" + try: + return json.dumps(traj.trajectory, indent=2, default=str) + except (TypeError, ValueError): + return str(traj.trajectory) diff --git a/src/evaluation/graders/__init__.py b/src/evaluation/graders/__init__.py deleted file mode 100644 index f58a074e0..000000000 --- a/src/evaluation/graders/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Pluggable grader registry. - -Each grader is a callable taking ``(scenario, answer, trajectory_text)`` -and returning a :class:`~evaluation.models.GradeResult`. Registration -happens via :func:`register`; the CLI looks up graders by name from -``scenario.grading_method`` (falling back to a CLI-supplied default). -""" - -from __future__ import annotations - -from typing import Callable - -from ..models import GradeResult, Scenario - -Grader = Callable[[Scenario, str, str], GradeResult] - -_REGISTRY: dict[str, Grader] = {} - - -def register(name: str, grader: Grader) -> None: - _REGISTRY[name] = grader - - -def get(name: str) -> Grader: - if name not in _REGISTRY: - raise KeyError( - f"unknown grader {name!r}; registered: {sorted(_REGISTRY)}" - ) - return _REGISTRY[name] - - -def names() -> list[str]: - return sorted(_REGISTRY) - - -from . import deterministic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/models.py b/src/evaluation/models.py index 846e98911..25fd0b2aa 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -66,8 +66,15 @@ class OpsMetrics(BaseModel): est_cost_usd: float | None = None -class GradeResult(BaseModel): - grading_method: str +class ScorerResult(BaseModel): + """Output of a single :class:`Scorer` invocation. + + ``scorer`` is the registered name of the scorer that produced this + result — distinct from ``Scenario.grading_method``, which is the + *requested* scorer on the input side. 
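+
+    For example, a scenario with ``grading_method="numeric_match"``
+    yields a result with ``scorer="numeric_match"``.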
+ """ + + scorer: str passed: bool score: float = 0.0 rationale: str = "" @@ -81,7 +88,7 @@ class ScenarioResult(BaseModel): model: str question: str answer: str - grade: GradeResult + grade: ScorerResult ops: OpsMetrics diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py index f87da2fa6..23df16f77 100644 --- a/src/evaluation/runner.py +++ b/src/evaluation/runner.py @@ -1,18 +1,11 @@ -"""Glue: load → grade → assemble report.""" +"""Backwards-friendly functional entry point delegating to :class:`Evaluator`.""" from __future__ import annotations -import json -import logging from pathlib import Path -from . import graders as grader_registry -from .loader import join_records, load_scenarios, load_trajectories -from .metrics import metrics_from_trajectory -from .models import EvalReport, PersistedTrajectory, Scenario, ScenarioResult -from .report import build_report - -_log = logging.getLogger(__name__) +from .evaluator import Evaluator +from .models import EvalReport def evaluate( @@ -21,48 +14,12 @@ def evaluate( scenarios_paths: list[Path], default_grading_method: str = "llm_judge", ) -> EvalReport: - """Load, grade, and aggregate. + """Load, score, and aggregate. - Per-scenario grader is picked from ``scenario.grading_method`` when + Per-scenario scorer is picked from ``scenario.grading_method`` when set, falling back to ``default_grading_method``. """ - scenarios = load_scenarios(scenarios_paths) - trajectories = load_trajectories(trajectories_path) - - results: list[ScenarioResult] = [] - for scenario, traj in join_records(scenarios, trajectories): - results.append(_grade_one(scenario, traj, default_grading_method)) - - return build_report(results) - - -def _grade_one( - scenario: Scenario, - traj: PersistedTrajectory, - default_grading_method: str, -) -> ScenarioResult: - method = scenario.grading_method or default_grading_method - grader = grader_registry.get(method) - trajectory_text = _trajectory_to_text(traj) - grade = grader(scenario, traj.answer, trajectory_text) - - return ScenarioResult( - scenario_id=scenario.id, - scenario_type=scenario.type, - runner=traj.runner, - model=traj.model, - question=traj.question, - answer=traj.answer, - grade=grade, - ops=metrics_from_trajectory(traj), + return Evaluator(default_scorer=default_grading_method).evaluate( + trajectories_path=trajectories_path, + scenarios_paths=scenarios_paths, ) - - -def _trajectory_to_text(traj: PersistedTrajectory) -> str: - """Flatten a trajectory to a text blob for the LLM judge prompt.""" - if traj.trajectory is None: - return "" - try: - return json.dumps(traj.trajectory, indent=2, default=str) - except (TypeError, ValueError): - return str(traj.trajectory) diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py new file mode 100644 index 000000000..00ff0c1e2 --- /dev/null +++ b/src/evaluation/scorers/__init__.py @@ -0,0 +1,44 @@ +"""Pluggable scorer registry. + +Each scorer is a callable taking ``(scenario, answer, trajectory_text)`` +and returning a :class:`~evaluation.models.ScorerResult`. The vocabulary +follows MLflow's evaluation concept: an ``Evaluator`` orchestrates one +or more ``Scorer`` s; scorers fall into three families: + +* **Code-Based** — deterministic, no model required (string/numeric + matchers in :mod:`evaluation.scorers.code_based`). +* **LLM-As-Judge** — model-graded against a rubric + (:mod:`evaluation.scorers.llm_judge`). +* **Semantic-Score** — similarity-based, no model call + (:mod:`evaluation.scorers.semantic`). 
+""" + +from __future__ import annotations + +from typing import Callable + +from ..models import Scenario, ScorerResult + +Scorer = Callable[[Scenario, str, str], ScorerResult] + +_REGISTRY: dict[str, Scorer] = {} + + +def register(name: str, scorer: Scorer) -> None: + _REGISTRY[name] = scorer + + +def get(name: str) -> Scorer: + if name not in _REGISTRY: + raise KeyError( + f"unknown scorer {name!r}; registered: {sorted(_REGISTRY)}" + ) + return _REGISTRY[name] + + +def names() -> list[str]: + return sorted(_REGISTRY) + + +from . import code_based # noqa: E402,F401 — register-on-import +from . import semantic # noqa: E402,F401 — register-on-import diff --git a/src/evaluation/graders/deterministic.py b/src/evaluation/scorers/code_based.py similarity index 76% rename from src/evaluation/graders/deterministic.py rename to src/evaluation/scorers/code_based.py index 35db1c299..929ef8bdf 100644 --- a/src/evaluation/graders/deterministic.py +++ b/src/evaluation/scorers/code_based.py @@ -1,20 +1,20 @@ -"""Pure deterministic graders — no LLM, no network.""" +"""Code-Based scorers — deterministic, no LLM, no network.""" from __future__ import annotations import math -from ..models import GradeResult, Scenario +from ..models import Scenario, ScorerResult from . import register def exact_string_match( scenario: Scenario, answer: str, trajectory_text: str -) -> GradeResult: +) -> ScorerResult: expected = scenario.expected_answer if expected is None: - return GradeResult( - grading_method="exact_string_match", + return ScorerResult( + scorer="exact_string_match", passed=False, score=0.0, rationale="scenario has no expected_answer", @@ -23,8 +23,8 @@ def exact_string_match( a = str(answer).strip().lower() e = str(expected).strip().lower() passed = a == e - return GradeResult( - grading_method="exact_string_match", + return ScorerResult( + scorer="exact_string_match", passed=passed, score=1.0 if passed else 0.0, rationale="" if passed else f"expected {expected!r}, got {answer!r}", @@ -34,14 +34,14 @@ def exact_string_match( def numeric_match( scenario: Scenario, answer: str, trajectory_text: str -) -> GradeResult: +) -> ScorerResult: expected_raw = scenario.expected_answer extra = scenario.model_extra or {} tolerance = float(extra.get("tolerance", 1e-6)) if expected_raw is None: - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=False, rationale="scenario has no expected_answer", ) @@ -50,16 +50,16 @@ def numeric_match( a = float(answer) e = float(expected_raw) except (TypeError, ValueError) as err: - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=False, rationale=f"could not parse numbers: {err}", details={"expected": expected_raw, "actual": answer}, ) passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) - return GradeResult( - grading_method="numeric_match", + return ScorerResult( + scorer="numeric_match", passed=passed, score=1.0 if passed else 0.0, rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", diff --git a/src/evaluation/graders/llm_judge.py b/src/evaluation/scorers/llm_judge.py similarity index 85% rename from src/evaluation/graders/llm_judge.py rename to src/evaluation/scorers/llm_judge.py index fb55bf73f..00518061a 100644 --- a/src/evaluation/graders/llm_judge.py +++ b/src/evaluation/scorers/llm_judge.py @@ -1,4 +1,4 @@ -"""LLM-judge grader. +"""LLM-As-Judge scorer. 
Free-form answers are scored against ``scenario.characteristic_form`` using a six-criterion rubric (task completion, data retrieval accuracy, @@ -16,7 +16,7 @@ from llm import LLMBackend -from ..models import GradeResult, Scenario +from ..models import Scenario, ScorerResult from . import register _log = logging.getLogger(__name__) @@ -61,8 +61,8 @@ The agent passes overall iff the first five are true AND hallucinations is false.""" -class LLMJudgeGrader: - """Closure-style grader that holds an :class:`LLMBackend`.""" +class LLMJudgeScorer: + """Closure-style scorer that holds an :class:`LLMBackend`.""" def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None: self._llm = llm @@ -70,11 +70,11 @@ def __init__(self, llm: LLMBackend, name: str = "llm_judge") -> None: def __call__( self, scenario: Scenario, answer: str, trajectory_text: str - ) -> GradeResult: + ) -> ScorerResult: characteristic = scenario.characteristic_form or scenario.expected_answer or "" if not characteristic: - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale="scenario has neither characteristic_form nor expected_answer", ) @@ -88,18 +88,18 @@ def __call__( try: raw = self._llm.generate(prompt) - except Exception as exc: # judge call failure is a grading failure, not a crash + except Exception as exc: # judge call failure is a scoring failure, not a crash _log.exception("llm_judge: backend error") - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale=f"judge backend error: {exc}", ) review = _parse_review(raw) if review is None: - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=False, rationale="judge returned unparseable JSON", details={"raw": raw[:2000]}, @@ -117,8 +117,8 @@ def __call__( if review.get("hallucinations") is True: score = max(0.0, score - 0.2) - return GradeResult( - grading_method=self.name, + return ScorerResult( + scorer=self.name, passed=passed, score=round(score, 3), rationale=str(review.get("reason", ""))[:500], @@ -140,5 +140,5 @@ def _parse_review(raw: str) -> dict | None: def install(llm: LLMBackend, name: str = "llm_judge") -> None: - """Register an LLM-judge grader bound to ``llm`` under ``name``.""" - register(name, LLMJudgeGrader(llm, name=name)) + """Register an LLM-As-Judge scorer bound to ``llm`` under ``name``.""" + register(name, LLMJudgeScorer(llm, name=name)) diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py new file mode 100644 index 000000000..56506eb10 --- /dev/null +++ b/src/evaluation/scorers/semantic.py @@ -0,0 +1,51 @@ +"""Semantic-Score scorer — similarity without an LLM call. + +Uses ``difflib.SequenceMatcher`` over normalised text so the scorer has +no external dependencies and is stable in CI. A scenario can override +the pass threshold via ``scenario.similarity_threshold`` (default 0.6). +""" + +from __future__ import annotations + +import re +from difflib import SequenceMatcher + +from ..models import Scenario, ScorerResult +from . 
import register + +_DEFAULT_THRESHOLD = 0.6 +_WS_RE = re.compile(r"\s+") + + +def _normalize(text: str) -> str: + return _WS_RE.sub(" ", str(text).strip().lower()) + + +def semantic_similarity( + scenario: Scenario, answer: str, trajectory_text: str +) -> ScorerResult: + reference = scenario.characteristic_form or scenario.expected_answer + if not reference: + return ScorerResult( + scorer="semantic_similarity", + passed=False, + rationale="scenario has neither characteristic_form nor expected_answer", + ) + + extra = scenario.model_extra or {} + threshold = float(extra.get("similarity_threshold", _DEFAULT_THRESHOLD)) + + score = SequenceMatcher(None, _normalize(reference), _normalize(answer)).ratio() + passed = score >= threshold + return ScorerResult( + scorer="semantic_similarity", + passed=passed, + score=round(score, 4), + rationale=( + "" if passed else f"similarity {score:.3f} below threshold {threshold}" + ), + details={"threshold": threshold, "reference": reference}, + ) + + +register("semantic_similarity", semantic_similarity) diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py new file mode 100644 index 000000000..efbb424b0 --- /dev/null +++ b/src/evaluation/tests/test_evaluator.py @@ -0,0 +1,67 @@ +"""Tests for the Evaluator class — the orchestration layer.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from evaluation import scorers as registry +from evaluation.evaluator import Evaluator +from evaluation.models import Scenario, ScorerResult + + +def _stub_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub-evaluator", passed=True, score=1.0) + + +def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_record): + rec = make_persisted_record(run_id="run-1", scenario_id=1) + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps([{"id": 1, "text": "Q", "type": "iot"}]), + encoding="utf-8", + ) + + registry.register("stub-evaluator", _stub_scorer) + + report = Evaluator(default_scorer="stub-evaluator").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.scorer == "stub-evaluator" + + +def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): + # Default scorer would crash on a missing registration; the + # scenario-level grading_method must win and route to a code-based + # scorer instead. 
+ rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="3.14") + (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") + + scenarios_path = tmp_path / "scenarios.json" + scenarios_path.write_text( + json.dumps( + [ + { + "id": 1, + "text": "Q", + "type": "tsfm", + "expected_answer": "3.14", + "grading_method": "numeric_match", + } + ] + ), + encoding="utf-8", + ) + + report = Evaluator(default_scorer="llm_judge").evaluate( + trajectories_path=tmp_path, + scenarios_paths=[scenarios_path], + ) + + assert report.totals["passed"] == 1 + assert report.results[0].grade.scorer == "numeric_match" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index 80cdef621..804624ffb 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -8,10 +8,10 @@ metrics_from_trajectory, ) from evaluation.models import ( - GradeResult, OpsMetrics, PersistedTrajectory, ScenarioResult, + ScorerResult, ) @@ -23,7 +23,7 @@ def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResul model="watsonx/ibm/granite", question="q", answer="a", - grade=GradeResult(grading_method="exact_string_match", passed=passed), + grade=ScorerResult(scorer="exact_string_match", passed=passed), ops=ops or OpsMetrics(), ) diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index 14816832a..7e2db642f 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -6,9 +6,9 @@ from pathlib import Path from evaluation.models import ( - GradeResult, OpsMetrics, ScenarioResult, + ScorerResult, ) from evaluation.report import build_report, render_summary, write_report @@ -21,7 +21,7 @@ def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: model="watsonx/ibm/granite", question="q", answer="a", - grade=GradeResult(grading_method="llm_judge", passed=passed, score=1.0 if passed else 0.0), + grade=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), ops=OpsMetrics(**ops_kwargs), ) diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index ffab1688f..1c22cc618 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -5,13 +5,13 @@ import json from pathlib import Path -from evaluation.models import GradeResult, Scenario +from evaluation.models import Scenario, ScorerResult from evaluation.runner import evaluate -from evaluation import graders as registry +from evaluation import scorers as registry -def _always_pass_grader(scenario: Scenario, answer: str, trajectory_text: str) -> GradeResult: - return GradeResult(grading_method="stub", passed=True, score=1.0) +def _always_pass_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub", passed=True, score=1.0) def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): @@ -32,7 +32,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): encoding="utf-8", ) - registry.register("stub", _always_pass_grader) + registry.register("stub", _always_pass_scorer) report = evaluate( trajectories_path=tmp_path, @@ -73,4 +73,4 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste ) assert report.totals["passed"] == 1 - assert report.results[0].grade.grading_method == "exact_string_match" + assert report.results[0].grade.scorer == "exact_string_match" diff --git a/src/evaluation/tests/test_graders.py 
b/src/evaluation/tests/test_scorers.py similarity index 57% rename from src/evaluation/tests/test_graders.py rename to src/evaluation/tests/test_scorers.py index 9002e5f95..2e72e61c4 100644 --- a/src/evaluation/tests/test_graders.py +++ b/src/evaluation/tests/test_scorers.py @@ -1,10 +1,11 @@ -"""Tests for deterministic + LLM-judge graders.""" +"""Tests for the three scorer families: code-based, LLM-as-judge, semantic.""" from __future__ import annotations -from evaluation import graders as registry -from evaluation.graders.deterministic import exact_string_match, numeric_match -from evaluation.graders.llm_judge import LLMJudgeGrader, install +from evaluation import scorers as registry +from evaluation.scorers.code_based import exact_string_match, numeric_match +from evaluation.scorers.llm_judge import LLMJudgeScorer, install +from evaluation.scorers.semantic import semantic_similarity from llm import LLMBackend @@ -53,10 +54,47 @@ def test_custom_tolerance(self, make_scenario): assert r.passed +class TestSemanticSimilarity: + def test_close_text_passes_default_threshold(self, make_scenario): + s = make_scenario( + characteristic_form="Lists temperature, pressure, and vibration sensors." + ) + r = semantic_similarity( + s, "lists temperature pressure and vibration sensors", "" + ) + assert r.passed + assert r.score >= 0.6 + + def test_unrelated_text_fails(self, make_scenario): + s = make_scenario(characteristic_form="lists three iot sensors") + r = semantic_similarity(s, "the chiller is operating normally", "") + assert not r.passed + assert "below threshold" in r.rationale + + def test_custom_threshold_override(self, make_scenario): + s = make_scenario( + characteristic_form="lists three iot sensors", + similarity_threshold=0.05, + ) + r = semantic_similarity(s, "completely different answer text", "") + # Threshold lowered enough that even weak overlap passes. + assert r.passed + + def test_missing_reference_short_circuits(self, make_scenario): + s = make_scenario(characteristic_form=None, expected_answer=None) + r = semantic_similarity(s, "anything", "") + assert not r.passed + assert "characteristic_form" in r.rationale + + class TestRegistry: - def test_deterministic_graders_registered(self): - assert "exact_string_match" in registry.names() - assert "numeric_match" in registry.names() + def test_code_based_scorers_registered(self): + names = registry.names() + assert "exact_string_match" in names + assert "numeric_match" in names + + def test_semantic_scorer_registered(self): + assert "semantic_similarity" in registry.names() def test_get_unknown_raises(self): try: @@ -67,7 +105,7 @@ def test_get_unknown_raises(self): raise AssertionError("expected KeyError") -class TestLLMJudgeGrader: +class TestLLMJudgeScorer: def _all_pass_response(self) -> str: return ( '{"task_completion": true, "data_retrieval_accuracy": true, ' @@ -77,8 +115,8 @@ def _all_pass_response(self) -> str: ) def test_passes_when_all_criteria_true(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) - r = grader(make_scenario(), "answer", "trajectory") + scorer = LLMJudgeScorer(_StubLLM(self._all_pass_response())) + r = scorer(make_scenario(), "answer", "trajectory") assert r.passed assert r.score == 1.0 assert r.rationale == "Looks good." 
@@ -87,34 +125,34 @@ def test_fails_on_hallucination(self, make_scenario): resp = self._all_pass_response().replace( '"hallucinations": false', '"hallucinations": true' ) - grader = LLMJudgeGrader(_StubLLM(resp)) - r = grader(make_scenario(), "answer", "trajectory") + scorer = LLMJudgeScorer(_StubLLM(resp)) + r = scorer(make_scenario(), "answer", "trajectory") assert not r.passed # Score is penalized but not zeroed when 5/5 criteria pass. assert r.score < 1.0 def test_handles_unparseable_response(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM("not json at all")) - r = grader(make_scenario(), "a", "t") + scorer = LLMJudgeScorer(_StubLLM("not json at all")) + r = scorer(make_scenario(), "a", "t") assert not r.passed assert "unparseable" in r.rationale def test_handles_markdown_fenced_response(self, make_scenario): wrapped = "Here you go:\n```json\n" + self._all_pass_response() + "\n```" - grader = LLMJudgeGrader(_StubLLM(wrapped)) - r = grader(make_scenario(), "a", "t") + scorer = LLMJudgeScorer(_StubLLM(wrapped)) + r = scorer(make_scenario(), "a", "t") assert r.passed def test_missing_characteristic_short_circuits(self, make_scenario): - grader = LLMJudgeGrader(_StubLLM(self._all_pass_response())) + scorer = LLMJudgeScorer(_StubLLM(self._all_pass_response())) s = make_scenario(characteristic_form=None, expected_answer=None) - r = grader(s, "a", "t") + r = scorer(s, "a", "t") assert not r.passed assert "characteristic_form" in r.rationale def test_install_registers_under_default_name(self, make_scenario): install(_StubLLM(self._all_pass_response())) assert "llm_judge" in registry.names() - grader = registry.get("llm_judge") - r = grader(make_scenario(), "a", "t") + scorer = registry.get("llm_judge") + r = scorer(make_scenario(), "a", "t") assert r.passed From 5272a4fd0d59b5cebc2355c4aa25ef4634f7aa70 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:02:37 -0400 Subject: [PATCH 3/8] refactor(evaluation): drop code-based scorer bodies, keep skeleton Code-Based scorers (exact_string_match, numeric_match) are now skeleton stubs raising NotImplementedError; the family slot in the Evaluator/Scorer taxonomy is preserved but implementations are deferred. Registry no longer auto-registers them on import. Tests for behavior removed; new TestCodeBasedSkeletons asserts NotImplementedError. test_runner and test_evaluator override tests re-pointed at semantic_similarity. 41 evaluation tests pass. Signed-off-by: Shuxin Lin --- src/evaluation/scorers/code_based.py | 64 +++----------------------- src/evaluation/tests/test_evaluator.py | 13 +++--- src/evaluation/tests/test_runner.py | 13 ++++-- src/evaluation/tests/test_scorers.py | 57 +++++++---------------- 4 files changed, 38 insertions(+), 109 deletions(-) diff --git a/src/evaluation/scorers/code_based.py b/src/evaluation/scorers/code_based.py index 929ef8bdf..c0d167116 100644 --- a/src/evaluation/scorers/code_based.py +++ b/src/evaluation/scorers/code_based.py @@ -1,71 +1,21 @@ -"""Code-Based scorers — deterministic, no LLM, no network.""" +"""Code-Based scorers — deterministic, no LLM, no network. -from __future__ import annotations +Skeleton only — fill in the implementations and re-register with the +scorer registry before use. +""" -import math +from __future__ import annotations from ..models import Scenario, ScorerResult -from . 
import register def exact_string_match( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - expected = scenario.expected_answer - if expected is None: - return ScorerResult( - scorer="exact_string_match", - passed=False, - score=0.0, - rationale="scenario has no expected_answer", - ) - - a = str(answer).strip().lower() - e = str(expected).strip().lower() - passed = a == e - return ScorerResult( - scorer="exact_string_match", - passed=passed, - score=1.0 if passed else 0.0, - rationale="" if passed else f"expected {expected!r}, got {answer!r}", - details={"expected": expected, "actual": answer}, - ) + raise NotImplementedError def numeric_match( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - expected_raw = scenario.expected_answer - extra = scenario.model_extra or {} - tolerance = float(extra.get("tolerance", 1e-6)) - - if expected_raw is None: - return ScorerResult( - scorer="numeric_match", - passed=False, - rationale="scenario has no expected_answer", - ) - - try: - a = float(answer) - e = float(expected_raw) - except (TypeError, ValueError) as err: - return ScorerResult( - scorer="numeric_match", - passed=False, - rationale=f"could not parse numbers: {err}", - details={"expected": expected_raw, "actual": answer}, - ) - - passed = math.isclose(a, e, rel_tol=tolerance, abs_tol=tolerance) - return ScorerResult( - scorer="numeric_match", - passed=passed, - score=1.0 if passed else 0.0, - rationale="" if passed else f"|{a} - {e}| > tol={tolerance}", - details={"expected": e, "actual": a, "tolerance": tolerance}, - ) - - -register("exact_string_match", exact_string_match) -register("numeric_match", numeric_match) + raise NotImplementedError diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index efbb424b0..ed0058e66 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -36,10 +36,9 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # Default scorer would crash on a missing registration; the - # scenario-level grading_method must win and route to a code-based - # scorer instead. - rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="3.14") + # The scenario-level grading_method must route around the default + # scorer, even when the default is a placeholder that would fail. 
+ rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" @@ -50,8 +49,8 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "expected_answer": "3.14", - "grading_method": "numeric_match", + "characteristic_form": "answer text", + "grading_method": "semantic_similarity", } ] ), @@ -64,4 +63,4 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "numeric_match" + assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index 1c22cc618..421f642e4 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -47,7 +47,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): - rec = make_persisted_record(run_id="run-x", scenario_id=1) + rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") scenarios_path = tmp_path / "scenarios.json" @@ -58,19 +58,22 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "expected_answer": "A.", - "grading_method": "exact_string_match", + "characteristic_form": "A.", + "similarity_threshold": 0.5, + "grading_method": "semantic_similarity", } ] ), encoding="utf-8", ) + registry.register("stub", _always_pass_scorer) + report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="numeric_match", # would fail; per-scenario override wins + default_grading_method="stub", # per-scenario override wins ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "exact_string_match" + assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_scorers.py b/src/evaluation/tests/test_scorers.py index 2e72e61c4..b76a924bc 100644 --- a/src/evaluation/tests/test_scorers.py +++ b/src/evaluation/tests/test_scorers.py @@ -1,4 +1,7 @@ -"""Tests for the three scorer families: code-based, LLM-as-judge, semantic.""" +"""Tests for the three scorer families: code-based, LLM-as-judge, semantic. + +Code-Based scorers are skeletons only and have no behaviour tests yet. 
+""" from __future__ import annotations @@ -17,41 +20,20 @@ def generate(self, prompt: str, temperature: float = 0.0) -> str: return self._response -class TestExactStringMatch: - def test_match_case_insensitive(self, make_scenario): - s = make_scenario(expected_answer="Hello World") - r = exact_string_match(s, "hello world", "") - assert r.passed and r.score == 1.0 - - def test_mismatch(self, make_scenario): - s = make_scenario(expected_answer="foo") - r = exact_string_match(s, "bar", "") - assert not r.passed - assert r.details["expected"] == "foo" - - def test_missing_expected(self, make_scenario): - s = make_scenario(expected_answer=None) - r = exact_string_match(s, "anything", "") - assert not r.passed - assert "expected_answer" in r.rationale - - -class TestNumericMatch: - def test_within_tolerance(self, make_scenario): - s = make_scenario(expected_answer="3.14159") - r = numeric_match(s, "3.141591", "") - assert r.passed - - def test_unparseable(self, make_scenario): - s = make_scenario(expected_answer="3.14") - r = numeric_match(s, "not a number", "") - assert not r.passed - assert "could not parse" in r.rationale +class TestCodeBasedSkeletons: + def test_exact_string_match_not_implemented(self, make_scenario): + try: + exact_string_match(make_scenario(expected_answer="x"), "x", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") - def test_custom_tolerance(self, make_scenario): - s = make_scenario(expected_answer="100", tolerance=0.05) - r = numeric_match(s, "104", "") - assert r.passed + def test_numeric_match_not_implemented(self, make_scenario): + try: + numeric_match(make_scenario(expected_answer="1.0"), "1.0", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") class TestSemanticSimilarity: @@ -88,11 +70,6 @@ def test_missing_reference_short_circuits(self, make_scenario): class TestRegistry: - def test_code_based_scorers_registered(self): - names = registry.names() - assert "exact_string_match" in names - assert "numeric_match" in names - def test_semantic_scorer_registered(self): assert "semantic_similarity" in registry.names() From 1d59c4b69ca8322bafec67a0e5f916c19a395315 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:21:16 -0400 Subject: [PATCH 4/8] fix(evaluation): align llm_judge prompt with reference; fix LiteLLM kwarg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - LLM-As-Judge prompt now mirrors src/tmp/evaluation_agent/result_evaluation_prompt.py: full 6-criterion rubric text, split Agent's Thinking vs Final Response, output schema uses `suggestions` (back-compat: `reason` still accepted), parser strips "(END OF RESPONSE)" sentinel. - CLI: LiteLLMBackend takes `model_id=`, not `model=`. Fixes: TypeError: LiteLLMBackend.__init__() got an unexpected keyword argument 'model' Verified end-to-end: claude-agent on groundtruth/101 ("List all failure modes of asset Chiller.") → uv run evaluate with --scorer-default llm_judge → 6/6 criteria pass. 
Signed-off-by: Shuxin Lin --- src/evaluation/cli.py | 2 +- src/evaluation/scorers/llm_judge.py | 86 +++++++++++++++++++---------- 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index cc4ee87ed..c9eb8793b 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -72,7 +72,7 @@ def _maybe_install_judge(judge_model: str | None) -> None: from .scorers.llm_judge import install - install(LiteLLMBackend(model=judge_model)) + install(LiteLLMBackend(model_id=judge_model)) def _validate_scorer_default(name: str) -> None: diff --git a/src/evaluation/scorers/llm_judge.py b/src/evaluation/scorers/llm_judge.py index 00518061a..e37ecc219 100644 --- a/src/evaluation/scorers/llm_judge.py +++ b/src/evaluation/scorers/llm_judge.py @@ -30,35 +30,58 @@ "hallucinations", ) -_PROMPT_TEMPLATE = """You are an evaluation judge for an industrial-asset-operations agent. - -Score the agent response against the expected characteristic answer using the six criteria below. Respond ONLY with a JSON object, no prose. - -QUESTION: -{question} - -EXPECTED CHARACTERISTIC: -{characteristic} - -AGENT RESPONSE: -{answer} - -AGENT TRAJECTORY (turns / tool calls / outputs): -{trajectory} - -Return JSON with these boolean fields plus a one-sentence reason: - +_PROMPT_TEMPLATE = """You are a critical reviewer tasked with evaluating the effectiveness and accuracy of an AI agent's response to a given task. Your goal is to determine whether the agent has successfully accomplished the task correctly based on the expected or characteristic behavior. + +Evaluation Criteria: +1. **Task Completion:** + - Verify if the agent executed all necessary actions (e.g., using the correct tools, retrieving data, performing the required analysis). + - The agent's response should align with the predefined expected behavior for task completion. + +2. **Data Retrieval & Accuracy:** + - Ensure that the correct asset, location, time period, and sensor (if applicable) were used. + - Verify if the task performed was related to the correct asset and sensor, and ensure the result corresponds to the correct time period. + - Check if the agent retrieved the required data and if the forecasting, anomaly detection, or other results are correct. + +3. **Generalized Result Verification:** + - **Task Type Verification:** Based on the task type (forecasting, anomaly detection, classification, etc.), verify if the agent has returned the expected results. + - For **forecasting** tasks: Ensure that the agent generated a forecast for the specified future period. + - For **anomaly detection** tasks: Verify that anomalies are detected as expected (if anomalies were anticipated). + - For other tasks (e.g., classification), ensure the task result matches the expected format and value. + - **Comparison with Expected Output:** Check if the result matches the expected format, values, or outcomes as outlined in the characteristic answer. + - **Data Integrity:** Ensure that the correct data (e.g., sensor, time period) was used in the task, and that it is consistent with the expected format and structure. + +4. **Agent Sequence & Order:** + - Ensure the agents (or tools) were called in the correct order and that all actions align with the expected behavior for agent interactions. + - If the characteristic answer specifies certain agents (e.g., IoTAgent, TSFMAgent, FMSRAgent), verify that these were used and in the correct sequence. + +5. 
**Clarity and Justification:**
+   - Ensure the agent's response is clear and justified with adequate explanations or evidence to support the claims made.
+   - There should be no contradictions between the agent's reasoning and the expected behavior outlined in the characteristic answer.
+
+6. **Hallucination Check:**
+   - Identify if the agent claims success without performing the necessary actions or without generating meaningful results.
+   - If the agent provides a fabricated response or claims success where actions are missing, mark this as a hallucination.
+
+Question: {question}
+Characteristic Answer (Expected Behavior): {characteristic}
+Agent's Thinking (turns / tool calls / outputs): {trajectory}
+Agent's Final Response: {answer}
+
+Output Format:
+Your review must always be in JSON format. Do not include any additional formatting or Markdown in your response.
 {{
-  "task_completion": <true|false>,
-  "data_retrieval_accuracy": <true|false>,
-  "generalized_result_verification": <true|false>,
-  "agent_sequence_correct": <true|false>,
-  "clarity_and_justification": <true|false>,
-  "hallucinations": <true|false>,
-  "reason": "<one-sentence reason>"
+  "task_completion": true/false,
+  "data_retrieval_accuracy": true/false,
+  "generalized_result_verification": true/false,
+  "agent_sequence_correct": true/false,
+  "clarity_and_justification": true/false,
+  "hallucinations": true/false,
+  "suggestions": "Optional. Actions or improvements for rectifying the response if applicable."
 }}
+(END OF RESPONSE)

-The agent passes overall iff the first five are true AND hallucinations is false."""
+Please provide your review based on the given criteria.
+"""


 class LLMJudgeScorer:
@@ -117,11 +140,14 @@ def __call__(
         if review.get("hallucinations") is True:
             score = max(0.0, score - 0.2)

+        rationale = str(
+            review.get("suggestions") or review.get("reason") or ""
+        )[:500]
         return ScorerResult(
             scorer=self.name,
             passed=passed,
             score=round(score, 3),
-            rationale=str(review.get("reason", ""))[:500],
+            rationale=rationale,
             details=review,
         )

@@ -129,8 +155,10 @@ def _parse_review(raw: str) -> dict | None:
     if not raw:
         return None
-    # Tolerate leading prose / markdown fences by extracting the first {...} block.
-    match = re.search(r"\{.*\}", raw, re.DOTALL)
+    # Strip the reference prompt's "(END OF RESPONSE)" sentinel + any
+    # leading prose / markdown fence before extracting the first {...}.
+    text = raw.split("(END OF RESPONSE)")[0]
+    match = re.search(r"\{.*\}", text, re.DOTALL)
     if not match:
         return None
     try:

From 5d0622f9c97bb962b9f4615d59a753d232b02f63 Mon Sep 17 00:00:00 2001
From: Shuxin Lin
Date: Wed, 13 May 2026 12:33:21 -0400
Subject: [PATCH 5/8] feat(evaluation): write per-run reports to
 reports/<run_id>.json

- CLI: --output FILE replaced with --reports-dir DIR (default reports/).
  Writes one JSON per result (named by trajectory run_id, which is a UUID)
  plus _aggregate.json for the rollup.
- ScenarioResult now carries run_id (propagated from PersistedTrajectory).
- New report.write_reports_dir(); falls back to scenario-<scenario_id>.json
  for legacy trajectories with no run_id.
- 2 new tests; 43 evaluation tests pass.

Verified: uv run evaluate against groundtruth/101.json wrote
reports/112c1b56-...json + reports/_aggregate.json.
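The file-naming rule, sketched standalone (report_name is a hypothetical
helper; the real logic lives inline in report.write_reports_dir below):

```python
def report_name(run_id: str, scenario_id: str, used: dict[str, int]) -> str:
    # run_id stem, scenario-<id> fallback, deterministic -1/-2/... suffixes
    # on collisions — mirrors the loop in write_reports_dir.
    stem = run_id or f"scenario-{scenario_id}"
    seen = used.get(stem, 0)
    used[stem] = seen + 1
    return f"{stem}.json" if seen == 0 else f"{stem}-{seen}.json"


used: dict[str, int] = {}
assert report_name("run-a", "1", used) == "run-a.json"
assert report_name("run-a", "1", used) == "run-a-1.json"  # duplicate run_id
assert report_name("", "7", used) == "scenario-7.json"    # legacy, no run_id
```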
Signed-off-by: Shuxin Lin
---
 src/evaluation/cli.py               | 17 ++++++++-----
 src/evaluation/evaluator.py         |  1 +
 src/evaluation/models.py            |  1 +
 src/evaluation/report.py            | 28 ++++++++++++++++++++++
 src/evaluation/tests/test_report.py | 37 +++++++++++++++++++++++++++--
 5 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py
index c9eb8793b..4ba5535a6 100644
--- a/src/evaluation/cli.py
+++ b/src/evaluation/cli.py
@@ -9,7 +9,7 @@
 from . import scorers as scorer_registry
 from .evaluator import Evaluator
-from .report import render_summary, write_report
+from .report import render_summary, write_reports_dir


 def _build_parser() -> argparse.ArgumentParser:
@@ -34,10 +34,14 @@ def _build_parser() -> argparse.ArgumentParser:
         help="One or more scenario JSON / JSONL files.",
     )
     p.add_argument(
-        "--output",
+        "--reports-dir",
         type=Path,
-        required=True,
-        help="Path to write the JSON report.",
+        default=Path("reports"),
+        help=(
+            "Directory to write per-run JSON reports (one file per run, "
+            "named '<run_id>.json'), plus '_aggregate.json' for the rollup. "
+            "Default: reports/."
+        ),
     )
     p.add_argument(
         "--scorer-default",
@@ -97,9 +101,10 @@ def main(argv: list[str] | None = None) -> int:
         scenarios_paths=list(args.scenarios),
     )

-    out = write_report(report, args.output)
+    out_dir = write_reports_dir(report, args.reports_dir)
     print(render_summary(report))
-    print(f"\nReport written: {out}")
+    print(f"\nReports written: {out_dir}/<run_id>.json ({len(report.results)} files)")
+    print(f"Aggregate: {out_dir}/_aggregate.json")
     return 0

diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
index f7d151040..b61673a68 100644
--- a/src/evaluation/evaluator.py
+++ b/src/evaluation/evaluator.py
@@ -64,6 +64,7 @@ def _score_one(
         return ScenarioResult(
             scenario_id=scenario.id,
             scenario_type=scenario.type,
+            run_id=traj.run_id,
             runner=traj.runner,
             model=traj.model,
             question=traj.question,
diff --git a/src/evaluation/models.py b/src/evaluation/models.py
index 25fd0b2aa..cdf1d202b 100644
--- a/src/evaluation/models.py
+++ b/src/evaluation/models.py
@@ -84,6 +84,7 @@ class ScorerResult(BaseModel):
 class ScenarioResult(BaseModel):
     scenario_id: str
     scenario_type: str = ""
+    run_id: str = ""
     runner: str
     model: str
     question: str
diff --git a/src/evaluation/report.py b/src/evaluation/report.py
index 72ff9b0e2..9f9ddc4a0 100644
--- a/src/evaluation/report.py
+++ b/src/evaluation/report.py
@@ -10,6 +10,8 @@
 from .metrics import aggregate_ops
 from .models import EvalReport, ScenarioResult, TypeBreakdown

+_AGGREGATE_FILENAME = "_aggregate.json"
+

 def build_report(results: list[ScenarioResult]) -> EvalReport:
     total = len(results)
@@ -52,6 +54,32 @@ def write_report(report: EvalReport, output: Path) -> Path:
     return output


+def write_reports_dir(report: EvalReport, reports_dir: Path) -> Path:
+    """Write one JSON file per result (``<run_id>.json``) plus an aggregate.
+
+    Results without a ``run_id`` fall back to ``scenario-<scenario_id>.json``
+    so nothing is dropped. Returns the directory path.
+    """
+    reports_dir = Path(reports_dir)
+    reports_dir.mkdir(parents=True, exist_ok=True)
+
+    used: dict[str, int] = {}
+    for r in report.results:
+        stem = r.run_id or f"scenario-{r.scenario_id}"
+        # Disambiguate any collisions deterministically.
+ suffix = used.get(stem, 0) + used[stem] = suffix + 1 + name = stem if suffix == 0 else f"{stem}-{suffix}" + (reports_dir / f"{name}.json").write_text( + r.model_dump_json(indent=2), encoding="utf-8" + ) + + (reports_dir / _AGGREGATE_FILENAME).write_text( + report.model_dump_json(indent=2), encoding="utf-8" + ) + return reports_dir + + def render_summary(report: EvalReport) -> str: lines: list[str] = [] t = report.totals diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index 7e2db642f..d342bdd2d 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -10,13 +10,19 @@ ScenarioResult, ScorerResult, ) -from evaluation.report import build_report, render_summary, write_report +from evaluation.report import ( + build_report, + render_summary, + write_report, + write_reports_dir, +) -def _result(stype: str, passed: bool, **ops_kwargs) -> ScenarioResult: +def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> ScenarioResult: return ScenarioResult( scenario_id="x", scenario_type=stype, + run_id=run_id, runner="plan-execute", model="watsonx/ibm/granite", question="q", @@ -63,6 +69,33 @@ def test_write_report_round_trips(tmp_path: Path): assert data["by_scenario_type"]["iot"]["pass_rate"] == 1.0 +def test_write_reports_dir_per_run_files(tmp_path: Path): + results = [ + _result("iot", True, run_id="run-a"), + _result("tsfm", False, run_id="run-b"), + ] + out_dir = write_reports_dir(build_report(results), tmp_path / "reports") + + assert (out_dir / "run-a.json").exists() + assert (out_dir / "run-b.json").exists() + assert (out_dir / "_aggregate.json").exists() + + per_run = json.loads((out_dir / "run-a.json").read_text()) + assert per_run["run_id"] == "run-a" + assert per_run["grade"]["passed"] is True + + agg = json.loads((out_dir / "_aggregate.json").read_text()) + assert agg["totals"]["scenarios"] == 2 + + +def test_write_reports_dir_falls_back_to_scenario_id(tmp_path: Path): + # ScenarioResult.run_id is empty when the trajectory pre-dates the + # run_id field; the writer must still produce a file. + results = [_result("iot", True)] + out_dir = write_reports_dir(build_report(results), tmp_path / "reports") + assert (out_dir / "scenario-x.json").exists() + + def test_render_summary_includes_headlines(): results = [ _result("iot", True, tokens_in=10, tokens_out=5, duration_ms=100.0, tool_call_count=1), From cf941e0d819f0cbd25f3cc0b0621aab41e12a635 Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 12:38:33 -0400 Subject: [PATCH 6/8] docs(evaluation): add docs/evaluation.md; semantic scorer is skeleton - Strip src/evaluation/scorers/semantic.py to a NotImplementedError skeleton; no longer auto-registered. Code-Based + Semantic-Score families now both ship as slot-only placeholders; LLM-As-Judge is the only working scorer in this branch. - Tests: TestSemanticSimilarity collapsed to a NotImplementedError assertion; runner/evaluator override tests pivot to local stub scorers (no skeleton dependency). - INSTRUCTIONS.md: new Evaluation section linking to the full doc. - docs/evaluation.md: scenario/trajectory schema, CLI reference, report layout, scorer families table, custom-scorer plug-in pattern, loop over groundtruth/*.json. 40 evaluation tests pass; full suite 310 passed. 
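For anyone re-enabling the Semantic-Score slot, the shortest fill-in is
essentially the difflib implementation this commit strips out — a sketch,
with a fixed 0.6 threshold instead of the old per-scenario override:

```python
from difflib import SequenceMatcher

from evaluation import scorers
from evaluation.models import Scenario, ScorerResult


def semantic_similarity(
    scenario: Scenario, answer: str, trajectory_text: str
) -> ScorerResult:
    # Compare normalised answer text against the scenario's reference.
    reference = scenario.characteristic_form or scenario.expected_answer or ""
    score = SequenceMatcher(
        None, str(reference).strip().lower(), str(answer).strip().lower()
    ).ratio()
    return ScorerResult(
        scorer="semantic_similarity", passed=score >= 0.6, score=round(score, 4)
    )


scorers.register("semantic_similarity", semantic_similarity)
```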
Signed-off-by: Shuxin Lin
---
 INSTRUCTIONS.md                        |  32 +++
 docs/evaluation.md                     | 264 +++++++++++++++++++++++++
 src/evaluation/scorers/__init__.py     |   8 +-
 src/evaluation/scorers/semantic.py     |  45 +----
 src/evaluation/tests/test_evaluator.py |  16 +-
 src/evaluation/tests/test_runner.py    |  16 +-
 src/evaluation/tests/test_scorers.py   |  46 ++---
 7 files changed, 341 insertions(+), 86 deletions(-)
 create mode 100644 docs/evaluation.md

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 8eede3877..36d8fb020 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -11,6 +11,7 @@ This directory contains the MCP servers and infrastructure for the AssetOpsBench
 - [Example queries](#example-queries)
 - [Agents](#agents)
 - [Observability](#observability)
+- [Evaluation](#evaluation)
 - [Running Tests](#running-tests)
 - [Architecture](#architecture)
@@ -220,6 +221,37 @@ See [docs/observability.md](docs/observability.md) for span attribute reference,

 ---

+## Evaluation
+
+Offline grading of saved trajectories against ground-truth scenarios. Three-stage flow:
+
+```
+agent run → trajectory (run_id) → uv run evaluate → reports/<run_id>.json
+```
+
+End-to-end against a ground-truth file:
+
+```bash
+# 1. Persist trajectories
+export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories
+uv run claude-agent "List all failure modes of asset Chiller." --scenario-id 101
+
+# 2. Score with LLM-As-Judge
+uv run evaluate \
+  --trajectories traces/trajectories \
+  --scenarios groundtruth/101.json \
+  --scorer-default llm_judge \
+  --judge-model litellm_proxy/aws/claude-opus-4-6
+```
+
+Output lands under `reports/` — one `<run_id>.json` per trajectory plus `_aggregate.json` for the rollup.
+
+Scorer families follow MLflow's evaluator/scorer split: `llm_judge` is wired up; `exact_string_match`, `numeric_match`, and `semantic_similarity` ship as skeletons (raise `NotImplementedError`).
+
+Full reference — scenario schema, report layout, custom scorers, looping over ground-truth: **[docs/evaluation.md](docs/evaluation.md)**.
+
+---
+
 ## Running Tests

 ```bash
diff --git a/docs/evaluation.md b/docs/evaluation.md
new file mode 100644
index 000000000..0d6f9a366
--- /dev/null
+++ b/docs/evaluation.md
@@ -0,0 +1,264 @@
+# Evaluation
+
+Offline grading of saved agent trajectories against ground-truth scenarios.
+
+The evaluation module follows the three-stage pattern used by SWE-bench,
+HELM, and τ-bench:
+
+```
+agent run → trajectory (run_id) → evaluate → reports/<run_id>.json
+```
+
+Re-grading from saved trajectories is first-class: re-run scoring with
+a different scorer or judge model without re-invoking the agent.
+
+## Concepts
+
+The vocabulary follows MLflow's evaluation split:
+
+- **Scenario** — a ground-truth record on disk. Carries `id`, `text`
+  (the utterance), `type`, `characteristic_form` (expected behaviour),
+  and optional `grading_method`.
+- **Trajectory** — a per-run JSON file persisted by the agent runners
+  when `AGENT_TRAJECTORY_DIR` is set. Carries `run_id`, `scenario_id`,
+  `question`, `answer`, and per-turn detail.
+- **Scorer** — a callable that takes
+  `(scenario, answer, trajectory_text)` and returns a `ScorerResult`.
+  Scorers fall into three families:
+  - **Code-Based** — deterministic, no LLM (e.g. `exact_string_match`,
+    `numeric_match`). *Skeleton only* in this branch.
+  - **LLM-As-Judge** — `llm_judge`. Six-criterion rubric, requires a
+    LiteLLM-routable model passed via `--judge-model`.
+  - **Semantic-Score** — similarity-based, no LLM call. *Skeleton only*
+    in this branch.
+- **Evaluator** — orchestrates a batch: loads scenarios + trajectories,
+  joins on `scenario_id`, dispatches to scorers, aggregates results.
+
+## Inputs
+
+### Scenario file
+
+JSON list, JSON object, or JSONL. Fields the scorer cares about:
+
+| Field                  | Used by              | Notes                                          |
+| ---------------------- | -------------------- | ---------------------------------------------- |
+| `id`                   | join                 | Coerced to string at load time                 |
+| `text`                 | all                  | The utterance the agent answered               |
+| `type`                 | reporting            | Scenario family (`iot`, `tsfm`, `FMSR`, …)     |
+| `characteristic_form`  | llm_judge, semantic  | Expected behaviour, free-form                  |
+| `expected_answer`      | code_based, semantic | Exact target string / number                   |
+| `grading_method`       | dispatch             | Registered scorer name; overrides CLI default  |
+| `tolerance`            | numeric_match        | Optional relative + absolute tolerance         |
+
+Ground-truth files under `groundtruth/` already match this schema —
+they're a drop-in scenarios input.
+
+### Trajectory file
+
+Written by the observability layer to `AGENT_TRAJECTORY_DIR` as one
+JSON per run. Fields the evaluator reads:
+
+```
+{
+  "run_id": "<uuid>",
+  "scenario_id": "<scenario id>",
+  "runner": "claude-agent" | "plan-execute" | …,
+  "model": "<model id>",
+  "question": "<utterance>",
+  "answer": "<final answer>",
+  "trajectory": {…}  // SDK Trajectory dict, or list[StepResult] for plan-execute
+}
+```
+
+`scenario_id` is critical — trajectories with `null` scenario_id are
+dropped at the join step. Pass `--scenario-id` to the agent CLI to set it.
+
+## End-to-end workflow
+
+```bash
+# 1. Persist trajectories under AGENT_TRAJECTORY_DIR
+export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories
+uv run claude-agent "List all failure modes of asset Chiller." --scenario-id 101
+
+# 2. Score with LLM-As-Judge against the ground-truth file
+uv run evaluate \
+  --trajectories traces/trajectories \
+  --scenarios groundtruth/101.json \
+  --scorer-default llm_judge \
+  --judge-model litellm_proxy/aws/claude-opus-4-6
+```
+
+Output:
+
+```
+Scenarios: 1  Passed: 1  Pass rate: 100.0%
+
+By scenario type:
+  FMSR  1/1  (100.0%)
+
+Operational metrics:
+  tokens_in_total:   7
+  tokens_out_total:  25
+  tool_calls_total:  1
+  duration_ms_p50:   14690.6
+
+Reports written: reports/<run_id>.json (1 files)
+Aggregate: reports/_aggregate.json
+```
+
+## Output layout
+
+```
+reports/
+├── <run_id>.json     # one ScenarioResult per trajectory
+├── <run_id>.json
+└── _aggregate.json   # EvalReport: totals, by_scenario_type, ops rollup
+```
+
+Per-run file (`reports/<run_id>.json`):
+
+```json
+{
+  "scenario_id": "101",
+  "scenario_type": "FMSR",
+  "run_id": "112c1b56-…",
+  "runner": "claude-agent",
+  "model": "litellm_proxy/aws/claude-opus-4-6",
+  "question": "List all failure modes of asset Chiller.",
+  "answer": "Here are the 7 failure modes for the Chiller asset: …",
+  "grade": {
+    "scorer": "llm_judge",
+    "passed": true,
+    "score": 1.0,
+    "rationale": "",
+    "details": {
+      "task_completion": true,
+      "data_retrieval_accuracy": true,
+      "generalized_result_verification": true,
+      "agent_sequence_correct": true,
+      "clarity_and_justification": true,
+      "hallucinations": false,
+      "suggestions": ""
+    }
+  },
+  "ops": {
+    "turn_count": 2,
+    "tool_call_count": 1,
+    "unique_tools": ["get_failure_modes"],
+    "tokens_in": 7,
+    "tokens_out": 25,
+    "duration_ms": 14690.6,
+    "est_cost_usd": 0.001959
+  }
+}
+```
+
+Aggregate (`reports/_aggregate.json`) is the full `EvalReport` (totals,
+runners, models, by-scenario-type breakdown, ops rollup, and the list
+of per-scenario results).
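+
+Every file is plain JSON, so post-processing needs no helper library. A
+minimal sketch (it assumes the aggregate serializes its per-scenario list
+under `results`, consistent with the examples above):
+
+```python
+import json
+from pathlib import Path
+
+agg = json.loads(Path("reports/_aggregate.json").read_text(encoding="utf-8"))
+print(agg["totals"])  # e.g. {"scenarios": 1, "passed": 1, ...}
+for r in agg.get("results", []):
+    # Per-scenario verdict plus one operational metric.
+    print(r["run_id"], r["grade"]["passed"], r["ops"]["duration_ms"])
+```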
+
+## CLI reference
+
+```
+uv run evaluate \
+  --trajectories DIR_OR_FILE    # required
+  --scenarios FILE [FILE ...]   # required, one or more
+  [--reports-dir DIR]           # default: reports/
+  [--scorer-default NAME]       # default: llm_judge
+  [--judge-model MODEL_ID]      # required when llm_judge runs
+  [-v]
+```
+
+`--grader-default` is accepted as a legacy alias for `--scorer-default`.
+
+## Available scorers in this branch
+
+| Family         | Registered name        | Status                                      |
+| -------------- | ---------------------- | ------------------------------------------- |
+| LLM-As-Judge   | `llm_judge`            | Works. Installed by passing `--judge-model` |
+| Code-Based     | `exact_string_match`   | **Skeleton — `NotImplementedError`**        |
+| Code-Based     | `numeric_match`        | **Skeleton — `NotImplementedError`**        |
+| Semantic-Score | `semantic_similarity`  | **Skeleton — `NotImplementedError`**        |
+
+Skeleton scorers don't auto-register; calling them raises
+`NotImplementedError`. Fill in the body and call
+`evaluation.scorers.register("<name>", <scorer>)` to enable.
+
+## LLM-As-Judge
+
+Six-criterion rubric, prompt mirrored from
+`src/tmp/evaluation_agent/result_evaluation_prompt.py`:
+
+- `task_completion`
+- `data_retrieval_accuracy`
+- `generalized_result_verification`
+- `agent_sequence_correct`
+- `clarity_and_justification`
+- `hallucinations`
+
+A run passes overall iff the first five are `true` **and**
+`hallucinations` is `false`. The score is the fraction of the first
+five satisfied, minus 0.2 if `hallucinations` is `true`. The judge's
+free-form `suggestions` (or legacy `reason`) lands in
+`grade.rationale`; the full review dict lands in `grade.details`.
+
+To customise: edit `_PROMPT_TEMPLATE` in
+`src/evaluation/scorers/llm_judge.py`.
+
+## Programmatic use
+
+```python
+from pathlib import Path
+from evaluation import Evaluator
+from evaluation.scorers.llm_judge import install
+from llm import LiteLLMBackend
+
+install(LiteLLMBackend(model_id="litellm_proxy/aws/claude-opus-4-6"))
+
+report = Evaluator(default_scorer="llm_judge").evaluate(
+    trajectories_path=Path("traces/trajectories"),
+    scenarios_paths=[Path("groundtruth/101.json")],
+)
+
+for r in report.results:
+    print(r.run_id, r.grade.passed, r.grade.score)
+```
+
+## Plug in a custom scorer
+
+```python
+from evaluation import scorers
+from evaluation.models import ScorerResult
+
+def keyword_hit(scenario, answer, trajectory_text) -> ScorerResult:
+    required = (scenario.model_extra or {}).get("required_keywords", [])
+    hits = [k for k in required if k.lower() in answer.lower()]
+    passed = len(hits) == len(required)
+    return ScorerResult(
+        scorer="keyword_hit",
+        passed=passed,
+        score=len(hits) / max(1, len(required)),
+        rationale="" if passed else f"missing: {set(required) - set(hits)}",
+    )
+
+scorers.register("keyword_hit", keyword_hit)
+# Any scenario with "grading_method": "keyword_hit" now routes here.
+``` + +## Loop over all ground-truth files + +```bash +export AGENT_TRAJECTORY_DIR=$(pwd)/traces/trajectories + +for f in groundtruth/*.json; do + utt=$(python3 -c "import json,sys;d=json.load(open(sys.argv[1]));print(d['text'])" "$f") + sid=$(python3 -c "import json,sys;d=json.load(open(sys.argv[1]));print(d['id'])" "$f") + uv run claude-agent "$utt" --scenario-id "$sid" +done + +uv run evaluate \ + --trajectories traces/trajectories \ + --scenarios groundtruth/*.json \ + --scorer-default llm_judge \ + --judge-model litellm_proxy/aws/claude-opus-4-6 +``` diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py index 00ff0c1e2..3bad68a8c 100644 --- a/src/evaluation/scorers/__init__.py +++ b/src/evaluation/scorers/__init__.py @@ -40,5 +40,9 @@ def names() -> list[str]: return sorted(_REGISTRY) -from . import code_based # noqa: E402,F401 — register-on-import -from . import semantic # noqa: E402,F401 — register-on-import +# Code-Based and Semantic-Score families ship as skeletons — their +# modules are importable but register no scorers until an +# implementation is filled in. LLM-As-Judge is registered explicitly +# via :func:`evaluation.scorers.llm_judge.install`. +from . import code_based # noqa: E402,F401 +from . import semantic # noqa: E402,F401 diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py index 56506eb10..c2a9bd61a 100644 --- a/src/evaluation/scorers/semantic.py +++ b/src/evaluation/scorers/semantic.py @@ -1,51 +1,16 @@ -"""Semantic-Score scorer — similarity without an LLM call. +"""Semantic-Score scorer — similarity-based grading without an LLM call. -Uses ``difflib.SequenceMatcher`` over normalised text so the scorer has -no external dependencies and is stable in CI. A scenario can override -the pass threshold via ``scenario.similarity_threshold`` (default 0.6). +Skeleton only — fill in the implementation (e.g. embedding cosine, BLEU, +sentence-transformers, or difflib ratio) and re-register with the +scorer registry before use. """ from __future__ import annotations -import re -from difflib import SequenceMatcher - from ..models import Scenario, ScorerResult -from . 
import register - -_DEFAULT_THRESHOLD = 0.6 -_WS_RE = re.compile(r"\s+") - - -def _normalize(text: str) -> str: - return _WS_RE.sub(" ", str(text).strip().lower()) def semantic_similarity( scenario: Scenario, answer: str, trajectory_text: str ) -> ScorerResult: - reference = scenario.characteristic_form or scenario.expected_answer - if not reference: - return ScorerResult( - scorer="semantic_similarity", - passed=False, - rationale="scenario has neither characteristic_form nor expected_answer", - ) - - extra = scenario.model_extra or {} - threshold = float(extra.get("similarity_threshold", _DEFAULT_THRESHOLD)) - - score = SequenceMatcher(None, _normalize(reference), _normalize(answer)).ratio() - passed = score >= threshold - return ScorerResult( - scorer="semantic_similarity", - passed=passed, - score=round(score, 4), - rationale=( - "" if passed else f"similarity {score:.3f} below threshold {threshold}" - ), - details={"threshold": threshold, "reference": reference}, - ) - - -register("semantic_similarity", semantic_similarity) + raise NotImplementedError diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index ed0058e66..6cc761e46 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -35,9 +35,13 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor assert report.results[0].grade.scorer == "stub-evaluator" +def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="fail-default", passed=False, score=0.0) + + def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): # The scenario-level grading_method must route around the default - # scorer, even when the default is a placeholder that would fail. + # scorer, even when the default scorer would reject the answer. 
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -49,18 +53,20 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "characteristic_form": "answer text", - "grading_method": "semantic_similarity", + "grading_method": "stub-evaluator", } ] ), encoding="utf-8", ) - report = Evaluator(default_scorer="llm_judge").evaluate( + registry.register("stub-evaluator", _stub_scorer) + registry.register("fail-default", _fail_scorer) + + report = Evaluator(default_scorer="fail-default").evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "semantic_similarity" + assert report.results[0].grade.scorer == "stub-evaluator" diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index 421f642e4..a9d558878 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -46,6 +46,10 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): assert report.ops.tokens_in_total > 0 +def _always_fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: + return ScorerResult(scorer="stub-fail", passed=False, score=0.0) + + def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") @@ -58,22 +62,22 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "characteristic_form": "A.", - "similarity_threshold": 0.5, - "grading_method": "semantic_similarity", + "grading_method": "stub-pass", } ] ), encoding="utf-8", ) - registry.register("stub", _always_pass_scorer) + registry.register("stub-pass", _always_pass_scorer) + registry.register("stub-fail", _always_fail_scorer) report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="stub", # per-scenario override wins + default_grading_method="stub-fail", # per-scenario override wins ) + # Override wins: scenario routed to the always-pass stub even though + # the default scorer would have failed it. assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "semantic_similarity" diff --git a/src/evaluation/tests/test_scorers.py b/src/evaluation/tests/test_scorers.py index b76a924bc..8f2ac6b69 100644 --- a/src/evaluation/tests/test_scorers.py +++ b/src/evaluation/tests/test_scorers.py @@ -36,42 +36,22 @@ def test_numeric_match_not_implemented(self, make_scenario): raise AssertionError("expected NotImplementedError") -class TestSemanticSimilarity: - def test_close_text_passes_default_threshold(self, make_scenario): - s = make_scenario( - characteristic_form="Lists temperature, pressure, and vibration sensors." 
- ) - r = semantic_similarity( - s, "lists temperature pressure and vibration sensors", "" - ) - assert r.passed - assert r.score >= 0.6 - - def test_unrelated_text_fails(self, make_scenario): - s = make_scenario(characteristic_form="lists three iot sensors") - r = semantic_similarity(s, "the chiller is operating normally", "") - assert not r.passed - assert "below threshold" in r.rationale - - def test_custom_threshold_override(self, make_scenario): - s = make_scenario( - characteristic_form="lists three iot sensors", - similarity_threshold=0.05, - ) - r = semantic_similarity(s, "completely different answer text", "") - # Threshold lowered enough that even weak overlap passes. - assert r.passed - - def test_missing_reference_short_circuits(self, make_scenario): - s = make_scenario(characteristic_form=None, expected_answer=None) - r = semantic_similarity(s, "anything", "") - assert not r.passed - assert "characteristic_form" in r.rationale +class TestSemanticSkeleton: + def test_semantic_similarity_not_implemented(self, make_scenario): + try: + semantic_similarity(make_scenario(), "a", "") + except NotImplementedError: + return + raise AssertionError("expected NotImplementedError") class TestRegistry: - def test_semantic_scorer_registered(self): - assert "semantic_similarity" in registry.names() + def test_skeleton_scorers_not_auto_registered(self): + # code_based and semantic ship as skeletons; only llm_judge is + # registered (lazily, via install()). + assert "exact_string_match" not in registry.names() + assert "numeric_match" not in registry.names() + assert "semantic_similarity" not in registry.names() def test_get_unknown_raises(self): try: From 0d523453e783a8fe6888c6b88055fe6372386b2c Mon Sep 17 00:00:00 2001 From: Shuxin Lin Date: Wed, 13 May 2026 15:54:23 -0400 Subject: [PATCH 7/8] refactor(evaluation): standardize grade/grader -> score/scorer Field and identifier renames so the module speaks a single vocabulary: - Scenario.grading_method -> Scenario.scoring_method - ScenarioResult.grade -> ScenarioResult.score (typed ScorerResult) - runner.default_grading_method -> runner.default_scoring_method - CLI: --grader-default legacy alias removed (only --scorer-default) - report.totals["graded"] -> report.totals["scored"] - Docstrings/comments/docs: "grading"/"graded"/"grader" -> "scoring"/"scored"/"scorer" Tests, INSTRUCTIONS.md, docs/evaluation.md updated. Note: the inner numeric ScorerResult.score is unchanged; access is result.score.score for the numeric, result.score.passed for the bool. 40 evaluation tests pass; full suite 310 passed; end-to-end against groundtruth/101.json still emits 6/6 rubric pass. 
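The renamed access path, as a minimal sketch (assumes only this branch's
`evaluation.models`; the constructor arguments mirror the ones the tests
already use):

```python
from evaluation.models import ScorerResult

# After the rename, ScenarioResult.score holds a ScorerResult, so the
# boolean verdict and the inner numeric score sit one level down.
s = ScorerResult(scorer="llm_judge", passed=True, score=1.0)

assert s.passed is True  # result.score.passed (the pass/fail verdict)
assert s.score == 1.0    # result.score.score (the inner numeric score)
```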
Signed-off-by: Shuxin Lin
---
 INSTRUCTIONS.md                        |  2 +-
 docs/evaluation.md                     | 19 +++++++++----------
 src/evaluation/__init__.py             |  4 ++--
 src/evaluation/cli.py                  |  5 ++---
 src/evaluation/evaluator.py            |  8 ++++----
 src/evaluation/models.py               |  6 +++---
 src/evaluation/report.py               |  8 ++++----
 src/evaluation/runner.py               |  8 ++++----
 src/evaluation/scorers/__init__.py     |  2 +-
 src/evaluation/scorers/semantic.py     |  2 +-
 src/evaluation/tests/test_evaluator.py |  8 ++++----
 src/evaluation/tests/test_metrics.py   |  2 +-
 src/evaluation/tests/test_report.py    |  6 +++---
 src/evaluation/tests/test_runner.py    |  8 ++++----
 14 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 36d8fb020..f4922c2ef 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -223,7 +223,7 @@ See [docs/observability.md](docs/observability.md) for span attribute reference,

## Evaluation

-Offline grading of saved trajectories against ground-truth scenarios. Three-stage flow:
+Offline scoring of saved trajectories against ground-truth scenarios. Three-stage flow:

```
agent run → trajectory (run_id) → uv run evaluate → reports/<run_id>.json

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 0d6f9a366..3b705b8a0 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -1,6 +1,6 @@
# Evaluation

-Offline grading of saved agent trajectories against ground-truth scenarios.
+Offline scoring of saved agent trajectories against ground-truth scenarios.

The evaluation module follows the three-stage pattern used by SWE-bench,
HELM, and τ-bench:

@@ -9,8 +9,8 @@ HELM, and τ-bench:

agent run → trajectory (run_id) → evaluate → reports/<run_id>.json
```

-Re-grading from saved trajectories is first-class: re-run scoring with
-a different scorer or judge model without re-invoking the agent.
+Re-scoring from saved trajectories is first-class: re-run with a
+different scorer or judge model without re-invoking the agent.

## Concepts

@@ -18,7 +18,7 @@ The vocabulary follows MLflow's evaluation split:

- **Scenario** — a ground-truth record on disk. Carries `id`, `text`
  (the utterance), `type`, `characteristic_form` (expected behaviour),
-  and optional `grading_method`.
+  and optional `scoring_method`.
- **Trajectory** — a per-run JSON file persisted by the agent runners
  when `AGENT_TRAJECTORY_DIR` is set. Carries `run_id`, `scenario_id`,
  `question`, `answer`, and per-turn detail.

@@ -47,7 +47,7 @@ JSON list, JSON object, or JSONL. Fields the scorer cares about:

| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
| `characteristic_form` | llm_judge, semantic | Expected behaviour, free-form |
| `expected_answer` | code_based, semantic | Exact target string / number |
-| `grading_method` | dispatch | Registered scorer name; overrides CLI default |
+| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
| `tolerance` | numeric_match | Optional relative + absolute tolerance |

Ground-truth files under `groundtruth/` already match this schema —
they're a drop-in scenarios input.

@@ -126,7 +126,7 @@ Per-run file (`reports/<run_id>.json`):

"model": "litellm_proxy/aws/claude-opus-4-6",
"question": "List all failure modes of asset Chiller.",
"answer": "Here are the 7 failure modes for the Chiller asset: …",
-  "grade": {
+  "score": {
"scorer": "llm_judge",
"passed": true,
"score": 1.0,

@@ -169,7 +169,6 @@ uv run evaluate \
[-v]
```

-`--grader-default` is accepted as a legacy alias for `--scorer-default`.

## Available scorers in this branch

@@ -200,7 +199,7 @@
A run passes overall iff the first five are `true` **and**
`hallucinations` is `false`.
The score is the fraction of the first five satisfied, minus 0.2 if `hallucinations` is `true`. The judge's free-form `suggestions` (or legacy `reason`) lands in -`grade.rationale`; the full review dict lands in `grade.details`. +`score.rationale`; the full review dict lands in `score.details`. To customise: edit `_PROMPT_TEMPLATE` in `src/evaluation/scorers/llm_judge.py`. @@ -221,7 +220,7 @@ report = Evaluator(default_scorer="llm_judge").evaluate( ) for r in report.results: - print(r.run_id, r.grade.passed, r.grade.score) + print(r.run_id, r.score.passed, r.score.score) ``` ## Plug in a custom scorer @@ -242,7 +241,7 @@ def keyword_hit(scenario, answer, trajectory_text) -> ScorerResult: ) scorers.register("keyword_hit", keyword_hit) -# Any scenario with "grading_method": "keyword_hit" now routes here. +# Any scenario with "scoring_method": "keyword_hit" now routes here. ``` ## Loop over all ground-truth files diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py index 280da44d5..39cdf0df4 100644 --- a/src/evaluation/__init__.py +++ b/src/evaluation/__init__.py @@ -3,11 +3,11 @@ Consumes saved trajectory files (written by :func:`observability.persistence.persist_trajectory`) and scenario files (under ``src/scenarios/``) and emits a structured JSON report combining -graded outcomes with operational metrics. +scored outcomes with operational metrics. The shape mirrors conventions from SWE-bench, HELM, and τ-bench: ``run`` (executes the agent — already exists) → ``evaluate`` (this -module) → ``report.json``. Re-grading from saved trajectories is +module) → ``report.json``. Re-scoring from saved trajectories is first-class. The evaluation concept follows MLflow's vocabulary: an diff --git a/src/evaluation/cli.py b/src/evaluation/cli.py index 4ba5535a6..faf369652 100644 --- a/src/evaluation/cli.py +++ b/src/evaluation/cli.py @@ -45,11 +45,10 @@ def _build_parser() -> argparse.ArgumentParser: ) p.add_argument( "--scorer-default", - "--grader-default", dest="scorer_default", default="llm_judge", - help="Scorer name when scenario.grading_method is unset. " - "Default: llm_judge. (--grader-default is a legacy alias.)", + help="Scorer name when scenario.scoring_method is unset. " + "Default: llm_judge.", ) p.add_argument( "--judge-model", diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index b61673a68..dedc82885 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -31,7 +31,7 @@ class Evaluator: """Run a batch of scenarios against their saved trajectories. ``default_scorer`` names the registered scorer to use when a - scenario does not set ``grading_method``. Per-scenario overrides + scenario does not set ``scoring_method``. Per-scenario overrides take precedence. 
""" @@ -56,10 +56,10 @@ def evaluate( def _score_one( self, scenario: Scenario, traj: PersistedTrajectory ) -> ScenarioResult: - name = scenario.grading_method or self.default_scorer + name = scenario.scoring_method or self.default_scorer scorer = self._resolve(name) trajectory_text = _trajectory_to_text(traj) - grade = scorer(scenario, traj.answer, trajectory_text) + score = scorer(scenario, traj.answer, trajectory_text) return ScenarioResult( scenario_id=scenario.id, @@ -69,7 +69,7 @@ def _score_one( model=traj.model, question=traj.question, answer=traj.answer, - grade=grade, + score=score, ops=metrics_from_trajectory(traj), ) diff --git a/src/evaluation/models.py b/src/evaluation/models.py index cdf1d202b..2f57e0b38 100644 --- a/src/evaluation/models.py +++ b/src/evaluation/models.py @@ -23,7 +23,7 @@ class Scenario(BaseModel): category: str = "" characteristic_form: str | None = None expected_answer: str | None = None - grading_method: str | None = None + scoring_method: str | None = None @classmethod def from_raw(cls, raw: dict) -> "Scenario": @@ -70,7 +70,7 @@ class ScorerResult(BaseModel): """Output of a single :class:`Scorer` invocation. ``scorer`` is the registered name of the scorer that produced this - result — distinct from ``Scenario.grading_method``, which is the + result — distinct from ``Scenario.scoring_method``, which is the *requested* scorer on the input side. """ @@ -89,7 +89,7 @@ class ScenarioResult(BaseModel): model: str question: str answer: str - grade: ScorerResult + score: ScorerResult ops: OpsMetrics diff --git a/src/evaluation/report.py b/src/evaluation/report.py index 9f9ddc4a0..6ff5a9d28 100644 --- a/src/evaluation/report.py +++ b/src/evaluation/report.py @@ -1,4 +1,4 @@ -"""Build an :class:`EvalReport` from graded scenario results.""" +"""Build an :class:`EvalReport` from scored scenario results.""" from __future__ import annotations @@ -15,7 +15,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: total = len(results) - passed = sum(1 for r in results if r.grade.passed) + passed = sum(1 for r in results if r.score.passed) by_type: dict[str, list[ScenarioResult]] = defaultdict(list) for r in results: @@ -24,7 +24,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: breakdown: dict[str, TypeBreakdown] = {} for stype, items in by_type.items(): n = len(items) - p = sum(1 for r in items if r.grade.passed) + p = sum(1 for r in items if r.score.passed) breakdown[stype] = TypeBreakdown( total=n, passed=p, @@ -37,7 +37,7 @@ def build_report(results: list[ScenarioResult]) -> EvalReport: models=sorted({r.model for r in results}), totals={ "scenarios": total, - "graded": total, + "scored": total, "passed": passed, "pass_rate": round(passed / total, 4) if total else 0.0, }, diff --git a/src/evaluation/runner.py b/src/evaluation/runner.py index 23df16f77..507cdaa25 100644 --- a/src/evaluation/runner.py +++ b/src/evaluation/runner.py @@ -12,14 +12,14 @@ def evaluate( *, trajectories_path: Path, scenarios_paths: list[Path], - default_grading_method: str = "llm_judge", + default_scoring_method: str = "llm_judge", ) -> EvalReport: """Load, score, and aggregate. - Per-scenario scorer is picked from ``scenario.grading_method`` when - set, falling back to ``default_grading_method``. + Per-scenario scorer is picked from ``scenario.scoring_method`` when + set, falling back to ``default_scoring_method``. 
""" - return Evaluator(default_scorer=default_grading_method).evaluate( + return Evaluator(default_scorer=default_scoring_method).evaluate( trajectories_path=trajectories_path, scenarios_paths=scenarios_paths, ) diff --git a/src/evaluation/scorers/__init__.py b/src/evaluation/scorers/__init__.py index 3bad68a8c..a2fa994e6 100644 --- a/src/evaluation/scorers/__init__.py +++ b/src/evaluation/scorers/__init__.py @@ -7,7 +7,7 @@ * **Code-Based** — deterministic, no model required (string/numeric matchers in :mod:`evaluation.scorers.code_based`). -* **LLM-As-Judge** — model-graded against a rubric +* **LLM-As-Judge** — model-scored against a rubric (:mod:`evaluation.scorers.llm_judge`). * **Semantic-Score** — similarity-based, no model call (:mod:`evaluation.scorers.semantic`). diff --git a/src/evaluation/scorers/semantic.py b/src/evaluation/scorers/semantic.py index c2a9bd61a..d56b401cb 100644 --- a/src/evaluation/scorers/semantic.py +++ b/src/evaluation/scorers/semantic.py @@ -1,4 +1,4 @@ -"""Semantic-Score scorer — similarity-based grading without an LLM call. +"""Semantic-Score scorer — similarity-based scoring without an LLM call. Skeleton only — fill in the implementation (e.g. embedding cosine, BLEU, sentence-transformers, or difflib ratio) and re-register with the diff --git a/src/evaluation/tests/test_evaluator.py b/src/evaluation/tests/test_evaluator.py index 6cc761e46..6d91e633b 100644 --- a/src/evaluation/tests/test_evaluator.py +++ b/src/evaluation/tests/test_evaluator.py @@ -32,7 +32,7 @@ def test_evaluator_routes_to_default_scorer(tmp_path: Path, make_persisted_recor ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "stub-evaluator" + assert report.results[0].score.scorer == "stub-evaluator" def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> ScorerResult: @@ -40,7 +40,7 @@ def _fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) -> Score def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_record): - # The scenario-level grading_method must route around the default + # The scenario-level scoring_method must route around the default # scorer, even when the default scorer would reject the answer. 
rec = make_persisted_record(run_id="run-1", scenario_id=1, answer="answer text") (tmp_path / "run-1.json").write_text(json.dumps(rec), encoding="utf-8") @@ -53,7 +53,7 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec "id": 1, "text": "Q", "type": "tsfm", - "grading_method": "stub-evaluator", + "scoring_method": "stub-evaluator", } ] ), @@ -69,4 +69,4 @@ def test_evaluator_per_scenario_override_wins(tmp_path: Path, make_persisted_rec ) assert report.totals["passed"] == 1 - assert report.results[0].grade.scorer == "stub-evaluator" + assert report.results[0].score.scorer == "stub-evaluator" diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index 804624ffb..21f097b1c 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -23,7 +23,7 @@ def _result(passed: bool = True, ops: OpsMetrics | None = None) -> ScenarioResul model="watsonx/ibm/granite", question="q", answer="a", - grade=ScorerResult(scorer="exact_string_match", passed=passed), + score=ScorerResult(scorer="exact_string_match", passed=passed), ops=ops or OpsMetrics(), ) diff --git a/src/evaluation/tests/test_report.py b/src/evaluation/tests/test_report.py index d342bdd2d..7c71788dc 100644 --- a/src/evaluation/tests/test_report.py +++ b/src/evaluation/tests/test_report.py @@ -27,7 +27,7 @@ def _result(stype: str, passed: bool, run_id: str = "", **ops_kwargs) -> Scenari model="watsonx/ibm/granite", question="q", answer="a", - grade=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), + score=ScorerResult(scorer="llm_judge", passed=passed, score=1.0 if passed else 0.0), ops=OpsMetrics(**ops_kwargs), ) @@ -42,7 +42,7 @@ def test_build_report_totals_and_breakdown(): assert report.totals == { "scenarios": 3, - "graded": 3, + "scored": 3, "passed": 2, "pass_rate": round(2 / 3, 4), } @@ -82,7 +82,7 @@ def test_write_reports_dir_per_run_files(tmp_path: Path): per_run = json.loads((out_dir / "run-a.json").read_text()) assert per_run["run_id"] == "run-a" - assert per_run["grade"]["passed"] is True + assert per_run["score"]["passed"] is True agg = json.loads((out_dir / "_aggregate.json").read_text()) assert agg["totals"]["scenarios"] == 2 diff --git a/src/evaluation/tests/test_runner.py b/src/evaluation/tests/test_runner.py index a9d558878..f8a936db0 100644 --- a/src/evaluation/tests/test_runner.py +++ b/src/evaluation/tests/test_runner.py @@ -37,7 +37,7 @@ def test_evaluate_end_to_end(tmp_path: Path, make_persisted_record): report = evaluate( trajectories_path=tmp_path, scenarios_paths=[scenarios_path], - default_grading_method="stub", + default_scoring_method="stub", ) assert report.totals["scenarios"] == 2 @@ -50,7 +50,7 @@ def _always_fail_scorer(scenario: Scenario, answer: str, trajectory_text: str) - return ScorerResult(scorer="stub-fail", passed=False, score=0.0) -def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persisted_record): +def test_evaluate_uses_per_scenario_scoring_method(tmp_path: Path, make_persisted_record): rec = make_persisted_record(run_id="run-x", scenario_id=1, answer="A.") (tmp_path / "run-x.json").write_text(json.dumps(rec), encoding="utf-8") @@ -62,7 +62,7 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste "id": 1, "text": "Q", "type": "iot", - "grading_method": "stub-pass", + "scoring_method": "stub-pass", } ] ), @@ -75,7 +75,7 @@ def test_evaluate_uses_per_scenario_grading_method(tmp_path: Path, make_persiste report = evaluate( 
trajectories_path=tmp_path,
scenarios_paths=[scenarios_path],
-        default_grading_method="stub-fail",  # per-scenario override wins
+        default_scoring_method="stub-fail",  # per-scenario override wins
)

# Override wins: scenario routed to the always-pass stub even though

From 3af08e6fb3c041375cf2d4eea5cbd783b097f304 Mon Sep 17 00:00:00 2001
From: Shuxin Lin
Date: Wed, 13 May 2026 15:59:33 -0400
Subject: [PATCH 8/8] docs(evaluation): clarify skeleton scorers in
 scenario-field table; add aggregate JSON shape

Signed-off-by: Shuxin Lin
---
 docs/evaluation.md | 50 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 3b705b8a0..12ce4e1e2 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -40,15 +40,17 @@ The vocabulary follows MLflow's evaluation split:

JSON list, JSON object, or JSONL. Fields the scorer cares about:

-| Field | Used by | Notes |
-| --------------------- | -------------------- | ------------------------------------------------ |
-| `id` | join | Coerced to string at load time |
-| `text` | all | The utterance the agent answered |
-| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
-| `characteristic_form` | llm_judge, semantic | Expected behaviour, free-form |
-| `expected_answer` | code_based, semantic | Exact target string / number |
-| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
-| `tolerance` | numeric_match | Optional relative + absolute tolerance |
+| Field | Used by | Notes |
+| --------------------- | ------------------------------------------------ | ---------------------------------------------- |
+| `id` | join | Coerced to string at load time |
+| `text` | all | The utterance the agent answered |
+| `type` | reporting | Scenario family (`iot`, `tsfm`, `FMSR`, …) |
+| `characteristic_form` | `llm_judge`, `semantic_similarity`* | Expected behaviour, free-form |
+| `expected_answer` | `exact_string_match`*, `numeric_match`* | Exact target string / number |
+| `scoring_method` | dispatch | Registered scorer name; overrides CLI default |
+| `tolerance` | `numeric_match`* | Optional relative + absolute tolerance |
+
+\* Skeleton in this branch — see [Available scorers](#available-scorers-in-this-branch).

Ground-truth files under `groundtruth/` already match this schema —
they're a drop-in scenarios input.

@@ -153,9 +155,33 @@ Per-run file (`reports/<run_id>.json`):
}
```

-Aggregate (`reports/_aggregate.json`) is the full `EvalReport` (totals,
-runners, models, by-scenario-type breakdown, ops rollup, and the list
-of per-scenario results).
+Aggregate (`reports/_aggregate.json`) is the full `EvalReport`:
+
+```json
+{
+  "generated_at": "<timestamp>",
+  "runners": ["claude-agent"],
+  "models": ["litellm_proxy/aws/claude-opus-4-6"],
+  "totals": {
+    "scenarios": 1,
+    "scored": 1,
+    "passed": 1,
+    "pass_rate": 1.0
+  },
+  "by_scenario_type": {
+    "FMSR": {"total": 1, "passed": 1, "pass_rate": 1.0}
+  },
+  "ops": {
+    "tokens_in_total": 7,
+    "tokens_out_total": 25,
+    "tool_calls_total": 1,
+    "duration_ms_p50": 14690.6,
+    "duration_ms_p95": 14690.6,
+    "est_cost_usd_total": 0.001959
+  },
+  "results": [ /* one ScenarioResult per run, same shape as the per-run files */ ]
+}
+```

## CLI reference