diff --git a/.gitignore b/.gitignore
index d758b077..520c6db7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,7 +53,8 @@ frontend/yarn-error.log*
 .docker/
 
 # Eval (stale)
-eval/
+/eval/
+/eval_runs/
 
 # Project-specific
 session_logs/
diff --git a/README.md b/README.md
index 29fe439b..a60dfd45 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,34 @@ ml-intern --max-iterations 100 "your prompt"
 ml-intern --no-stream "your prompt"
 ```
 
+## Evaluation
+
+Run a local baseline-vs-candidate benchmark on GLUE SST-2:
+
+```bash
+python eval.py \
+  --task glue_sst2 \
+  --baseline-model distilbert/distilbert-base-uncased-finetuned-sst-2-english \
+  --candidate-model your-org/your-sst2-model \
+  --output-dir eval_runs \
+  --limit 100
+```
+
+Optional cost metadata file:
+
+```json
+{
+  "training_cost": 12.5,
+  "eval_cost": 0.25
+}
+```
+
+Pass the file with `--cost-file path/to/costs.json`; a cost that is not
+provided is recorded as `null`. Use `--split` to override the task's default
+dataset split and `--notes` to attach free-form context to the run record.
+The command writes one JSON run record named `<run_id>.json` and appends one
+row to `leaderboard.jsonl` in the output directory.
+
 ## Architecture
 
 ### Component Overview
diff --git a/agent/eval/__init__.py b/agent/eval/__init__.py
new file mode 100644
index 00000000..3b46a138
--- /dev/null
+++ b/agent/eval/__init__.py
@@ -0,0 +1,5 @@
+"""Public evaluation registry interface."""
+
+from agent.eval.registry import EvalTask, get_task
+
+__all__ = ["EvalTask", "get_task"]
diff --git a/agent/eval/artifacts.py b/agent/eval/artifacts.py
new file mode 100644
index 00000000..79e6d561
--- /dev/null
+++ b/agent/eval/artifacts.py
@@ -0,0 +1,101 @@
+"""Artifact builders and writers for evaluation runs."""
+
+from datetime import UTC, datetime
+from copy import deepcopy
+import json
+from pathlib import Path
+
+from agent.eval.compare import ComparisonResult
+
+
+def build_run_record(
+    run_id: str,
+    comparison: ComparisonResult,
+    dataset: str,
+    split: str,
+    parameters: dict,
+    training_cost: float | None,
+    eval_cost: float | None,
+    notes: str | None,
+) -> dict:
+    """Build the per-run record for one baseline-vs-candidate comparison.
+
+    Metrics and ``parameters`` are deep-copied so later mutation of the
+    caller's objects cannot change the record. ``created_at`` is the current
+    UTC time as an ISO 8601 string.
+    """
+    return {
+        "run_id": run_id,
+        "created_at": datetime.now(UTC).isoformat(),
+        "task": comparison.task_id,
+        "dataset": dataset,
+        "split": split,
+        "baseline_model": comparison.baseline.model_id,
+        "candidate_model": comparison.candidate.model_id,
+        "baseline_metrics": deepcopy(comparison.baseline.metrics),
+        "candidate_metrics": deepcopy(comparison.candidate.metrics),
+        "primary_metric": comparison.primary_metric,
+        "primary_delta": comparison.delta,
+        "training_cost": training_cost,
+        "eval_cost": eval_cost,
+        "notes": notes,
+        "parameters": deepcopy(parameters),
+    }
+
+
+def build_leaderboard_row(record: dict) -> dict:
+    """Project a run record onto the compact leaderboard row.
+
+    Raises:
+        KeyError: If ``record`` lacks a copied field or either metrics dict
+            lacks the primary metric.
+    """
+    metric = record["primary_metric"]
+    return {
+        "run_id": record["run_id"],
+        "created_at": record["created_at"],
+        "task": record["task"],
+        "baseline_model": record["baseline_model"],
+        "candidate_model": record["candidate_model"],
+        "primary_metric": metric,
+        "baseline_score": record["baseline_metrics"][metric],
+        "candidate_score": record["candidate_metrics"][metric],
+        "delta": record["primary_delta"],
+        "training_cost": record["training_cost"],
+        "eval_cost": record["eval_cost"],
+        "notes": record["notes"],
+    }
+
+
+def write_run_artifact(output_dir: Path, record: dict) -> Path:
+    """Write ``record`` to ``<output_dir>/<run_id>.json`` and return the path.
+
+    The file is opened in exclusive-creation mode, so an existing artifact is
+    never overwritten even if it appears between a check and the write.
+
+    Raises:
+        FileExistsError: If an artifact for this ``run_id`` already exists.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    path = output_dir / f"{record['run_id']}.json"
+    # Serialize before opening so a non-serializable record leaves no file.
+    payload = json.dumps(record, indent=2, sort_keys=True) + "\n"
+    try:
+        with path.open("x", encoding="utf-8") as handle:
+            handle.write(payload)
+    except FileExistsError as exc:
+        raise FileExistsError(f"Run artifact already exists: {path}") from exc
+    return path
+
+
+def append_leaderboard_row(output_dir: Path, row: dict) -> Path:
+    """Append ``row`` as one JSON line to ``<output_dir>/leaderboard.jsonl``.
+
+    Creates the directory and the file if they do not exist yet and returns
+    the leaderboard path.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    path = output_dir / "leaderboard.jsonl"
+    with path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(row, sort_keys=True) + "\n")
+    return path
diff --git a/agent/eval/compare.py b/agent/eval/compare.py
new file mode 100644
index 00000000..81693850
--- /dev/null
+++ b/agent/eval/compare.py
@@ -0,0 +1,51 @@
+"""Comparison logic for baseline-vs-candidate evaluation results."""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ModelResult:
+    """Metrics recorded for a single evaluated model."""
+
+    model_id: str
+    metrics: dict[str, float]
+
+
+@dataclass(frozen=True)
+class ComparisonResult:
+    """Baseline and candidate results compared on one primary metric."""
+
+    task_id: str
+    primary_metric: str
+    baseline: ModelResult
+    candidate: ModelResult
+
+    def _primary_score(self, result: ModelResult) -> float:
+        """Return ``result``'s value for the primary metric."""
+        return result.metrics[self.primary_metric]
+
+    @property
+    def baseline_score(self) -> float:
+        """Primary-metric value of the baseline model."""
+        return self._primary_score(self.baseline)
+
+    @property
+    def candidate_score(self) -> float:
+        """Primary-metric value of the candidate model."""
+        return self._primary_score(self.candidate)
+
+    @property
+    def delta(self) -> float:
+        """Candidate score minus baseline score."""
+        after, before = self.candidate_score, self.baseline_score
+        return after - before
+
+
+def compare_results(
+    task_id: str,
+    primary_metric: str,
+    baseline: ModelResult,
+    candidate: ModelResult,
+) -> ComparisonResult:
+    """Bundle two model results into a ``ComparisonResult`` for ``task_id``."""
+    return ComparisonResult(task_id, primary_metric, baseline, candidate)
diff --git a/agent/eval/registry.py b/agent/eval/registry.py
new file mode 100644
index 00000000..8680c2d2
--- /dev/null
+++ b/agent/eval/registry.py
@@ -0,0 +1,43 @@
+"""Task registry for local evaluation workflows."""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class EvalTask:
+    """Static description of one benchmark task."""
+
+    task_id: str
+    dataset_name: str
+    dataset_config: str | None
+    default_split: str
+    text_column: str
+    label_column: str
+    primary_metric: str
+
+
+_GLUE_SST2 = EvalTask(
+    task_id="glue_sst2",
+    dataset_name="glue",
+    dataset_config="sst2",
+    default_split="validation",
+    text_column="sentence",
+    label_column="label",
+    primary_metric="accuracy",
+)
+
+# Built-in tasks keyed by their ``task_id``.
+_TASKS: dict[str, EvalTask] = {task.task_id: task for task in (_GLUE_SST2,)}
+
+
+def get_task(task_id: str) -> EvalTask:
+    """Return the registered task for ``task_id``.
+
+    Raises:
+        ValueError: If no task with that id is registered.
+    """
+    try:
+        task = _TASKS[task_id]
+    except KeyError as exc:
+        raise ValueError(f"Unknown evaluation task: {task_id}") from exc
+    return task
diff --git a/agent/eval/runner.py b/agent/eval/runner.py
new file mode 100644
index 00000000..3b2e4ba4
--- /dev/null
+++ b/agent/eval/runner.py
@@ -0,0 +1,119 @@
+"""GLUE SST-2 dataset loading and evaluation helpers."""
+
+from itertools import islice
+import sys
+
+from datasets import load_dataset
+from huggingface_hub import InferenceClient
+
+from agent.eval.compare import ModelResult
+from agent.eval.registry import EvalTask
+
+
+def normalize_label(label: str | int) -> int:
+    """Map a classifier label to the binary class id ``0`` or ``1``.
+
+    Integers must already be ``0`` or ``1``; strings are matched
+    case-insensitively against the known aliases below.
+
+    Raises:
+        ValueError: If the label is not a recognized alias.
+    """
+    mapping = {
+        "NEGATIVE": 0,
+        "POSITIVE": 1,
+        "LABEL_0": 0,
+        "LABEL_1": 1,
+        "0": 0,
+        "1": 1,
+    }
+    if isinstance(label, int):
+        if label in (0, 1):
+            return label
+        raise ValueError(f"Unsupported label from inference API: {label}")
+    try:
+        return mapping[str(label).upper()]
+    except KeyError as exc:
+        raise ValueError(f"Unsupported label from inference API: {label}") from exc
+
+
+def load_examples(
+    task: EvalTask,
+    split: str | None = None,
+    limit: int | None = None,
+) -> list[dict]:
+    """Load rows of ``task``'s dataset as a list.
+
+    ``split`` falls back to the task's default split. When ``limit`` is given
+    at most that many leading rows are returned.
+
+    Raises:
+        ValueError: If ``limit`` is negative.
+    """
+    if limit is not None and limit < 0:
+        raise ValueError(f"limit must be non-negative, got {limit}")
+    selected_split = split or task.default_split
+    dataset_args = [task.dataset_config] if task.dataset_config else []
+    dataset = load_dataset(task.dataset_name, *dataset_args, split=selected_split)
+    # Stop iterating after ``limit`` rows instead of materializing the whole
+    # split and slicing it afterwards; a ``None`` limit yields every row.
+    return list(islice(dataset, limit))
+
+
+def extract_label(response) -> str | int:
+    """Pull the predicted label out of a text-classification response.
+
+    Accepts a mapping, an object with a ``label`` attribute, or a list of
+    either, in which case only the first element is used.
+
+    Raises:
+        ValueError: If the response is an empty list or carries no label.
+    """
+    if isinstance(response, list):
+        if not response:
+            raise ValueError("Empty response from inference API")
+        response = response[0]
+
+    if isinstance(response, dict):
+        # Use .get so a dict without a label fails with the same ValueError as
+        # an object without one, rather than a bare KeyError.
+        label = response.get("label")
+    else:
+        label = getattr(response, "label", None)
+    if label is None:
+        raise ValueError("Inference response does not contain a label")
+    return label
+
+
+def evaluate_model(
+    task: EvalTask,
+    model_id: str,
+    examples: list[dict],
+    client: InferenceClient | None = None,
+) -> ModelResult:
+    """Classify every example with ``model_id`` and report accuracy.
+
+    A default ``InferenceClient`` is created when ``client`` is not supplied.
+    Any failure is logged to stderr with the task, model and example index
+    and then re-raised, so no result is produced for a partial run. With no
+    examples the accuracy is reported as ``0.0``.
+    """
+    client = client or InferenceClient()
+    correct = 0
+
+    for index, example in enumerate(examples):
+        try:
+            response = client.text_classification(example[task.text_column], model=model_id)
+            predicted = normalize_label(extract_label(response))
+        except Exception as exc:
+            print(
+                "Evaluation failed "
+                f"(task={task.task_id}, model={model_id}, example_index={index}): {exc}",
+                file=sys.stderr,
+            )
+            raise
+        if predicted == example[task.label_column]:
+            correct += 1
+
+    accuracy = correct / len(examples) if examples else 0.0
+    return ModelResult(model_id=model_id, metrics={"accuracy": accuracy})
diff --git a/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md b/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md
new file mode 100644
index 00000000..834029dc
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md
@@ -0,0 +1,317 @@
+# Evaluation And Benchmarking Design
+
+Date: 2026-04-23
+Issue: `#84`
+Scope: First PR only
+
+## Goal
+
+Add a reproducible local evaluation pipeline for comparing a baseline model
+against a trained model on a task-specific benchmark. The first PR should
+produce machine-readable artifacts and a leaderboard-style summary without
+changing the live agent runtime or the web UI.
+
+## Problem
+
+The repo can help users train and iterate on models, but it does not yet offer
+a standard way to answer a basic question: did the trained model improve over
+the baseline, and was the gain worth the cost?
+
+Issue `#84` asks for:
+
+- task-specific benchmarks
+- baseline vs agent-generated model comparison
+- training cost vs performance gain tracking
+- a reproducible `eval.py`
+- leaderboard-style logging
+
+This design narrows that request into a first contribution that is small enough
+for one PR and reusable by later work.
+
+## First-PR Scope
+
+This PR will add:
+
+- a local CLI entrypoint, `eval.py`
+- a small evaluation utility layer separate from `agent/core/*`
+- support for comparing exactly two models in one run:
+  - baseline model
+  - candidate model
+- a task registry for explicit benchmark definitions
+- machine-readable run artifacts
+- a leaderboard-style artifact for cross-run comparison
+- unit tests for registry resolution, comparison logic, and artifact writing
+- contributor documentation for how to run the evaluation pipeline
+
+This PR will not add:
+
+- frontend leaderboard views
+- backend endpoints for evaluation storage
+- automatic agent-triggered evaluation
+- broad benchmark support for every task family
+- heavyweight end-to-end remote evaluation tests
+
+## Design Summary
+
+The new feature should live as a standalone evaluation workflow that can be run
+by contributors from the command line. The evaluation logic must be isolated
+from the live chat session path so that the first PR stays low-risk and easy to
+review.
+
+The system will:
+
+1. parse CLI arguments
+2. resolve a named evaluation task
+3. evaluate the baseline model
+4. evaluate the candidate model
+5. compute metric deltas and cost metadata
+6. write a reproducible run record
+7. update a leaderboard-style artifact
+
+## Proposed File Layout
+
+Exact names may shift slightly to match repo conventions, but the structure
+should stay close to this:
+
+```text
+eval.py
+agent/
+  eval/
+    __init__.py
+    registry.py
+    runner.py
+    compare.py
+    artifacts.py
+tests/
+  unit/
+    test_eval_registry.py
+    test_eval_compare.py
+    test_eval_artifacts.py
+```
+
+Rationale:
+
+- `eval.py` stays as the thin executable entrypoint requested by the issue
+- `agent/eval/` keeps the logic reusable without coupling it to `agent/core/*`
+- tests focus on deterministic behavior instead of real remote inference
+
+## CLI Contract
+
+The script should accept explicit, reproducible inputs instead of hidden
+defaults. The minimal interface should include:
+
+- `--task`: task id from the registry
+- `--baseline-model`: baseline model id
+- `--candidate-model`: model id to compare against the baseline
+- `--output-dir`: where artifacts are written
+
+Optional but useful first-PR inputs:
+
+- `--limit`: cap number of evaluation samples for quick runs
+- `--split`: override the configured dataset split when supported
+- `--cost-file`: path to cost metadata provided by the user
+- `--notes`: free-form text stored in the run artifact
+
+The script should print a concise terminal summary and write the canonical
+result to disk.
+
+## Task Registry
+
+The first PR should use an explicit task registry instead of a generic
+"benchmark anything" abstraction.
+
+Each task definition should declare:
+
+- task id
+- dataset or benchmark source
+- default split
+- primary metric name
+- any secondary metrics
+- any evaluation parameters needed by the runner
+
+Why a registry:
+
+- keeps supported benchmarks obvious
+- gives each task one clear configuration source
+- makes tests deterministic
+- provides a clean extension point for future tasks
+
+The initial implementation should support exactly one built-in task:
+`glue_sst2`.
+
+Why `glue_sst2` first:
+
+- it matches the issue's request for task-specific benchmarks
+- it is small enough for a focused first PR
+- it uses a well-known benchmark
+- the primary metric is straightforward accuracy
+- it can be evaluated without inventing a broad benchmark framework first
+
+## Evaluation Boundaries
+
+The evaluator should be structured around small interfaces:
+
+- task resolution
+- model evaluation
+- result comparison
+- artifact writing
+
+This separation matters because the first PR should prove the data model and
+workflow before taking on complex provider-specific evaluation plumbing.
+
+The core comparison layer should not depend on how metrics were produced. That
+lets unit tests inject stub metric outputs while future integrations can plug
+in real benchmark runners.
+
+## Artifact Model
+
+Two artifact types should be written.
+
+### 1. Per-run artifact
+
+This is the source of truth for one evaluation run. It should be machine
+readable and detailed enough to reproduce the setup.
+
+Format: one JSON file per successful run.
+
+Recommended fields:
+
+- `run_id`
+- `created_at`
+- `task`
+- `dataset`
+- `split`
+- `baseline_model`
+- `candidate_model`
+- `baseline_metrics`
+- `candidate_metrics`
+- `primary_metric`
+- `primary_delta`
+- `training_cost`
+- `eval_cost`
+- `notes`
+- `parameters`
+
+The `parameters` object should capture the exact evaluation configuration used,
+such as sample limit, split, and any task-specific settings.
+
+### 2. Leaderboard artifact
+
+This is a compact comparison view across runs.
+
+Format: append-only JSONL, one row per successful run.
+
+Recommended fields:
+
+- `run_id`
+- `task`
+- `baseline_model`
+- `candidate_model`
+- `primary_metric`
+- `baseline_score`
+- `candidate_score`
+- `delta`
+- `training_cost`
+- `eval_cost`
+- `created_at`
+- `notes`
+
+The leaderboard should favor readability and scanning over completeness.
+Detailed debugging and reproducibility should stay in the per-run artifact.
+
+## Cost Tracking
+
+Issue `#84` explicitly asks for training cost vs performance gain tracking.
+For the first PR, cost data should be accepted as explicit metadata rather than
+calculated automatically from platform logs.
+
+This keeps scope under control while still establishing the schema we need for
+future automation.
+
+Behavior:
+
+- if a cost file or explicit cost metadata is provided, store it in the run
+  artifact and leaderboard row
+- if no cost metadata is provided, write `null` values rather than guessing
+
+This makes the data model honest and avoids silently incorrect cost numbers.
+
+## Error Handling
+
+The script should fail early and clearly for invalid setup.
+
+Expected early validation errors:
+
+- unknown task id
+- missing model ids
+- unsupported output directory state
+- malformed cost metadata
+
+Runtime evaluation failures should:
+
+- surface enough context to debug which model and task failed
+- avoid producing a misleading "successful" leaderboard entry
+- avoid partially written artifacts that look complete
+
+If one model evaluation fails, the command should exit non-zero and skip both
+the per-run artifact and the leaderboard update. The first PR should not invent
+a failed-run artifact format.
+
+## Testing Strategy
+
+This PR should focus on deterministic unit tests.
+
+Required coverage:
+
+- registry resolution for known and unknown tasks
+- comparison logic for baseline vs candidate metrics
+- derived delta calculation
+- per-run artifact writing
+- leaderboard append behavior (the leaderboard is append-only JSONL)
+- invalid input handling
+
+Tests should use stubbed evaluation outputs rather than real model calls.
+That keeps the test suite fast, reliable, and suitable for CI.
+
+Real remote evaluation can be added later once the core data flow is proven.
+
+## Why This Slice First
+
+This slice lands the most valuable infrastructure from the issue without
+mixing multiple risky changes together.
+
+It gives the project:
+
+- a reproducible evaluation entrypoint
+- a standard comparison record
+- a leaderboard-ready data format
+- a clean extension point for future tasks and UI work
+
+It avoids:
+
+- changing live agent behavior
+- shipping partial backend storage design
+- taking on UI requirements before the underlying data model is stable
+
+## Follow-up Work After This PR
+
+Natural next steps once this lands:
+
+- add more task definitions
+- integrate real benchmark runners where appropriate
+- let the agent trigger the evaluation pipeline
+- surface leaderboard results in the web UI
+- automate cost collection from training/eval jobs
+
+## Acceptance Criteria
+
+This first PR is successful if:
+
+- contributors can run `eval.py` locally with a task, baseline model, and
+  candidate model
+- the script writes a reproducible run artifact
+- the script writes or updates a leaderboard-style artifact
+- the comparison includes primary metric deltas
+- optional cost metadata is recorded without guessing
+- unit tests cover the core workflow logic
+- no live agent, backend, or frontend behavior changes are required
diff --git a/eval.py b/eval.py
new file mode 100644
index 00000000..97f62796
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,125 @@
+"""Local evaluation CLI for baseline-vs-candidate model comparisons."""
+
+import argparse
+import json
+from pathlib import Path
+import uuid
+
+from agent.eval.artifacts import (
+    append_leaderboard_row,
+    build_leaderboard_row,
+    build_run_record,
+    write_run_artifact,
+)
+from agent.eval.compare import compare_results
+from agent.eval.registry import get_task
+from agent.eval.runner import evaluate_model, load_examples
+
+
+def positive_int(value: str) -> int:
+    """Argparse type for ``--limit``: parse ``value`` as an integer above zero."""
+    parsed = int(value)
+    if parsed <= 0:
+        raise argparse.ArgumentTypeError("limit must be a positive integer")
+    return parsed
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse CLI arguments; ``argv=None`` makes argparse read ``sys.argv``."""
+    parser = argparse.ArgumentParser(
+        description="Evaluate a baseline model against a candidate model.",
+    )
+    parser.add_argument("--task", required=True)
+    parser.add_argument("--baseline-model", required=True)
+    parser.add_argument("--candidate-model", required=True)
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument("--limit", type=positive_int, default=None)
+    parser.add_argument("--split", default=None)
+    parser.add_argument("--cost-file", default=None)
+    parser.add_argument("--notes", default=None)
+    return parser.parse_args(argv)
+
+
+def _read_cost(payload: dict, key: str) -> float | None:
+    """Return the numeric cost stored under ``key``, or ``None`` if absent."""
+    value = payload.get(key)
+    if value is None:
+        return None
+    # bool is an int subclass; reject it so ``true`` is not stored as a cost.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise ValueError(f"Cost metadata field {key!r} must be a number, got {value!r}")
+    return value
+
+
+def load_cost_metadata(path: str | None) -> tuple[float | None, float | None]:
+    """Load ``(training_cost, eval_cost)`` from an optional JSON cost file.
+
+    Returns ``(None, None)`` when no path is given. A key that is missing or
+    ``null`` yields ``None`` for that cost.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+        ValueError: If the file is not valid JSON, is not a JSON object, or a
+            cost value is not a number.
+    """
+    if path is None:
+        return None, None
+    payload = json.loads(Path(path).read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"Cost metadata must be a JSON object: {path}")
+    return _read_cost(payload, "training_cost"), _read_cost(payload, "eval_cost")
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run one baseline-vs-candidate evaluation and write its artifacts.
+
+    Setup is validated before any model call, and artifacts are written only
+    after both evaluations succeed. Returns ``0`` on success.
+    """
+    args = parse_args(argv)
+    task = get_task(args.task)
+    training_cost, eval_cost = load_cost_metadata(args.cost_file)
+    output_dir = Path(args.output_dir)
+    # Fail before the expensive evaluation if artifacts could not be written.
+    if output_dir.exists() and not output_dir.is_dir():
+        raise NotADirectoryError(f"Output path is not a directory: {output_dir}")
+    examples = load_examples(task, split=args.split, limit=args.limit)
+    if not examples:
+        raise ValueError("No evaluation examples loaded")
+
+    baseline = evaluate_model(task, args.baseline_model, examples)
+    candidate = evaluate_model(task, args.candidate_model, examples)
+
+    comparison = compare_results(
+        task_id=task.task_id,
+        primary_metric=task.primary_metric,
+        baseline=baseline,
+        candidate=candidate,
+    )
+    # Omit the config segment when the task has none, instead of "name/None".
+    dataset = "/".join(
+        part for part in (task.dataset_name, task.dataset_config) if part
+    )
+    run_record = build_run_record(
+        run_id=str(uuid.uuid4()),
+        comparison=comparison,
+        dataset=dataset,
+        split=args.split or task.default_split,
+        parameters={"limit": args.limit},
+        training_cost=training_cost,
+        eval_cost=eval_cost,
+        notes=args.notes,
+    )
+    write_run_artifact(output_dir, run_record)
+    append_leaderboard_row(output_dir, build_leaderboard_row(run_record))
+
+    print(
+        f"{comparison.primary_metric}: "
+        f"{comparison.baseline_score:.4f} -> {comparison.candidate_score:.4f} "
+        f"(delta {comparison.delta:+.4f})"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/unit/test_eval_artifacts.py b/tests/unit/test_eval_artifacts.py
new file mode 100644
index 00000000..ea289575
--- /dev/null
+++ b/tests/unit/test_eval_artifacts.py
@@ -0,0 +1,116 @@
+import json
+
+import pytest
+
+from agent.eval.artifacts import (
+    append_leaderboard_row,
+    build_leaderboard_row,
+    build_run_record,
+    write_run_artifact,
+)
+from agent.eval.compare import ModelResult, compare_results
+
+
+def _minimal_record() -> dict:
+    """Return a fresh minimal run record for the writer tests."""
+    return {
+        "run_id": "run-123",
+        "task": "glue_sst2",
+        "primary_metric": "accuracy",
+        "primary_delta": 0.05,
+    }
+
+
+def _accuracy_comparison(baseline_metrics: dict, candidate_metrics: dict):
+    """Compare ``baseline`` and ``candidate`` models on ``glue_sst2`` accuracy."""
+    return compare_results(
+        task_id="glue_sst2",
+        primary_metric="accuracy",
+        baseline=ModelResult("baseline", baseline_metrics),
+        candidate=ModelResult("candidate", candidate_metrics),
+    )
+
+
+def test_write_run_artifact_creates_json_file(tmp_path):
+    """The artifact is named after the run id and round-trips through JSON."""
+    record = _minimal_record()
+
+    written = write_run_artifact(tmp_path, record)
+
+    assert written.name == "run-123.json"
+    assert json.loads(written.read_text()) == record
+
+
+def test_write_run_artifact_rejects_duplicate_run_id(tmp_path):
+    """Writing the same run id twice raises instead of overwriting."""
+    record = _minimal_record()
+    write_run_artifact(tmp_path, record)
+
+    with pytest.raises(FileExistsError, match="Run artifact already exists:"):
+        write_run_artifact(tmp_path, record)
+
+
+def test_append_leaderboard_row_appends_jsonl_line(tmp_path):
+    """Each append adds exactly one JSON line, in call order."""
+    rows = [
+        {"run_id": "run-123", "task": "glue_sst2", "delta": 0.05},
+        {"run_id": "run-456", "task": "glue_sst2", "delta": -0.01},
+    ]
+
+    path = append_leaderboard_row(tmp_path, rows[0])
+    append_leaderboard_row(tmp_path, rows[1])
+
+    parsed = [json.loads(line) for line in path.read_text().strip().splitlines()]
+    assert parsed == rows
+
+
+def test_build_record_and_leaderboard_row_include_cost_metadata():
+    """Cost metadata and scores flow from the record into the leaderboard row."""
+    comparison = _accuracy_comparison({"accuracy": 0.84}, {"accuracy": 0.89})
+
+    record = build_run_record(
+        run_id="run-123",
+        comparison=comparison,
+        dataset="glue/sst2",
+        split="validation",
+        parameters={"limit": 100},
+        training_cost=12.5,
+        eval_cost=0.25,
+        notes="first pass",
+    )
+    row = build_leaderboard_row(record)
+
+    assert record["run_id"] == "run-123"
+    assert (record["training_cost"], record["eval_cost"]) == (12.5, 0.25)
+    assert record["parameters"] == {"limit": 100}
+    assert (row["baseline_score"], row["candidate_score"]) == (0.84, 0.89)
+    assert row["delta"] == pytest.approx(0.05)
+    assert (row["training_cost"], row["eval_cost"]) == (12.5, 0.25)
+
+
+def test_build_run_record_snapshots_optional_metadata_and_metrics():
+    """The record is a snapshot: mutating the inputs afterwards changes nothing."""
+    baseline_metrics = {"accuracy": 0.5}
+    candidate_metrics = {"accuracy": 0.6}
+    parameters = {"limit": 10}
+
+    record = build_run_record(
+        run_id="run-789",
+        comparison=_accuracy_comparison(baseline_metrics, candidate_metrics),
+        dataset="glue/sst2",
+        split="validation",
+        parameters=parameters,
+        training_cost=None,
+        eval_cost=None,
+        notes=None,
+    )
+
+    baseline_metrics["accuracy"] = 0.0
+    candidate_metrics["accuracy"] = 0.0
+    parameters["limit"] = 1
+
+    assert record["baseline_metrics"] == {"accuracy": 0.5}
+    assert record["candidate_metrics"] == {"accuracy": 0.6}
+    assert record["parameters"] == {"limit": 10}
+    for optional_field in ("training_cost", "eval_cost", "notes"):
+        assert record[optional_field] is None
diff --git a/tests/unit/test_eval_cli.py b/tests/unit/test_eval_cli.py
new file mode 100644
index 00000000..dd01ab93
--- /dev/null
+++ b/tests/unit/test_eval_cli.py
@@ -0,0 +1,85 @@
+import json
+
+import pytest
+
+
+def test_eval_cli_writes_artifacts(monkeypatch, tmp_path):
+    """A successful run exits 0 and appends one leaderboard row."""
+    from eval import main
+    from agent.eval.compare import ModelResult
+
+    monkeypatch.setattr(
+        "eval.load_examples",
+        lambda task, split, limit: [{"sentence": "great", "label": 1}],
+    )
+    monkeypatch.setattr(
+        "eval.evaluate_model",
+        lambda task, model_id, examples, client=None: ModelResult(
+            model_id=model_id,
+            metrics={"accuracy": 1.0 if "candidate" in model_id else 0.0},
+        ),
+    )
+
+    exit_code = main(
+        [
+            "--task", "glue_sst2",
+            "--baseline-model", "baseline-model",
+            "--candidate-model", "candidate-model",
+            "--output-dir", str(tmp_path),
+            "--limit", "1",
+        ]
+    )
+
+    assert exit_code == 0
+    leaderboard = (tmp_path / "leaderboard.jsonl").read_text().strip().splitlines()
+    assert len(leaderboard) == 1
+    row = json.loads(leaderboard[0])
+    assert row["task"] == "glue_sst2"
+    assert row["delta"] == 1.0
+
+
+def test_eval_cli_validates_cost_file_before_model_evaluation(monkeypatch, tmp_path):
+    """A missing cost file fails before any model is evaluated."""
+    from eval import main
+
+    calls = {"count": 0}
+
+    def fail_if_called(*args, **kwargs):
+        calls["count"] += 1
+        raise AssertionError("evaluate_model should not be called")
+
+    monkeypatch.setattr("eval.load_examples", lambda task, split, limit: [{"sentence": "great", "label": 1}])
+    monkeypatch.setattr("eval.evaluate_model", fail_if_called)
+
+    missing_cost_file = tmp_path / "missing-costs.json"
+
+    with pytest.raises(FileNotFoundError):
+        main(
+            [
+                "--task", "glue_sst2",
+                "--baseline-model", "baseline-model",
+                "--candidate-model", "candidate-model",
+                "--output-dir", str(tmp_path),
+                "--cost-file", str(missing_cost_file),
+            ]
+        )
+
+    assert calls["count"] == 0
+
+
+def test_eval_cli_rejects_non_positive_limit():
+    """``--limit 0`` makes the CLI exit with status code 2."""
+    from eval import main
+
+    with pytest.raises(SystemExit) as exc_info:
+        main(
+            [
+                "--task", "glue_sst2",
+                "--baseline-model", "baseline-model",
+                "--candidate-model", "candidate-model",
+                "--output-dir", "eval_runs",
+                "--limit", "0",
+            ]
+        )
+
+    assert exc_info.value.code == 2
diff --git a/tests/unit/test_eval_compare.py b/tests/unit/test_eval_compare.py
new file mode 100644
index 00000000..f4c8e570
--- /dev/null
+++ b/tests/unit/test_eval_compare.py
@@ -0,0 +1,20 @@
+import pytest
+
+from agent.eval.compare import ModelResult, compare_results
+
+
+def test_compare_results_computes_primary_metric_delta():
+    """Scores come from the primary metric; delta is candidate minus baseline."""
+    baseline_result = ModelResult("baseline-model", {"accuracy": 0.84})
+    candidate_result = ModelResult("candidate-model", {"accuracy": 0.89})
+
+    outcome = compare_results(
+        "glue_sst2",
+        "accuracy",
+        baseline_result,
+        candidate_result,
+    )
+
+    assert outcome.primary_metric == "accuracy"
+    assert (outcome.baseline_score, outcome.candidate_score) == (0.84, 0.89)
+    assert outcome.delta == pytest.approx(0.05)
diff --git a/tests/unit/test_eval_registry.py b/tests/unit/test_eval_registry.py
new file mode 100644
index 00000000..9a9489b7
--- /dev/null
+++ b/tests/unit/test_eval_registry.py
@@ -0,0 +1,27 @@
+import pytest
+
+from agent.eval import EvalTask, get_task
+
+
+def test_get_task_returns_glue_sst2_definition():
+    """The built-in ``glue_sst2`` task exposes the expected configuration."""
+    task = get_task("glue_sst2")
+
+    assert isinstance(task, EvalTask)
+    expected_fields = {
+        "task_id": "glue_sst2",
+        "dataset_name": "glue",
+        "dataset_config": "sst2",
+        "default_split": "validation",
+        "text_column": "sentence",
+        "label_column": "label",
+        "primary_metric": "accuracy",
+    }
+    for field_name, expected in expected_fields.items():
+        assert getattr(task, field_name) == expected
+
+
+def test_get_task_raises_for_unknown_task():
+    """An unregistered id is rejected with a ``ValueError`` naming the id."""
+    with pytest.raises(ValueError, match="^Unknown evaluation task: unknown_task$"):
+        get_task("unknown_task")
diff --git a/tests/unit/test_eval_runner.py b/tests/unit/test_eval_runner.py
new file mode 100644
index 00000000..88c9c6a4
--- /dev/null
+++ b/tests/unit/test_eval_runner.py
@@ -0,0 +1,168 @@
+import pytest
+
+from agent.eval.registry import EvalTask
+from agent.eval.registry import get_task
+from agent.eval.runner import evaluate_model, load_examples, normalize_label
+
+
+class FakeInferenceClient:
+    """Stub client that replays scripted labels as list-of-dict responses."""
+
+    def __init__(self, predictions):
+        self._predictions = predictions
+        self.calls = []
+
+    def text_classification(self, text, model):
+        self.calls.append((text, model))
+        label = self._predictions[len(self.calls) - 1]
+        return [{"label": label, "score": 0.99}]
+
+
+class FakeLabelObject:
+    """Response stand-in exposing the label as an attribute."""
+
+    def __init__(self, label):
+        self.label = label
+
+
+class FakeObjectInferenceClient(FakeInferenceClient):
+    """Stub client that returns a single ``FakeLabelObject`` per call."""
+
+    def text_classification(self, text, model):
+        self.calls.append((text, model))
+        label = self._predictions[len(self.calls) - 1]
+        return FakeLabelObject(label)
+
+
+def test_normalize_label_supports_hf_text_classification_aliases():
+    """String aliases and the integers 0/1 all normalize to 0 or 1."""
+    assert normalize_label("NEGATIVE") == 0
+    assert normalize_label("POSITIVE") == 1
+    assert normalize_label("LABEL_0") == 0
+    assert normalize_label("LABEL_1") == 1
+    assert normalize_label("0") == 0
+    assert normalize_label("1") == 1
+    assert normalize_label(0) == 0
+    assert normalize_label(1) == 1
+
+
+def test_evaluate_model_computes_accuracy_from_examples():
+    """Two correct predictions out of three give an accuracy of 2/3."""
+    task = get_task("glue_sst2")
+    examples = [
+        {"sentence": "bad film", "label": 0},
+        {"sentence": "great film", "label": 1},
+        {"sentence": "fine film", "label": 1},
+    ]
+    client = FakeInferenceClient(["NEGATIVE", "POSITIVE", "NEGATIVE"])
+
+    result = evaluate_model(
+        task=task,
+        model_id="candidate-model",
+        examples=examples,
+        client=client,
+    )
+
+    assert result.model_id == "candidate-model"
+    assert result.metrics == {"accuracy": 2 / 3}
+    assert len(client.calls) == 3
+
+
+def test_evaluate_model_accepts_object_label_responses():
+    """A response object with a ``label`` attribute is scored like a dict."""
+    task = get_task("glue_sst2")
+    examples = [{"sentence": "great film", "label": 1}]
+    client = FakeObjectInferenceClient(["POSITIVE"])
+
+    result = evaluate_model(
+        task=task,
+        model_id="candidate-model",
+        examples=examples,
+        client=client,
+    )
+
+    assert result.metrics == {"accuracy": 1.0}
+
+
+def test_load_examples_respects_limit(monkeypatch):
+    """Only the first ``limit`` rows of the loaded split are returned."""
+    class FakeSplit(list):
+        pass
+
+    def fake_load_dataset(name, config, split):
+        assert name == "glue"
+        assert config == "sst2"
+        assert split == "validation"
+        return FakeSplit(
+            [
+                {"sentence": "a", "label": 0},
+                {"sentence": "b", "label": 1},
+                {"sentence": "c", "label": 0},
+            ]
+        )
+
+    monkeypatch.setattr("agent.eval.runner.load_dataset", fake_load_dataset)
+
+    task = get_task("glue_sst2")
+    examples = load_examples(task, split="validation", limit=2)
+
+    assert examples == [
+        {"sentence": "a", "label": 0},
+        {"sentence": "b", "label": 1},
+    ]
+
+
+def test_load_examples_omits_missing_dataset_config(monkeypatch):
+    """A task without a dataset config calls ``load_dataset`` with the name only."""
+    calls = []
+
+    def fake_load_dataset(*args, **kwargs):
+        calls.append((args, kwargs))
+        return [{"text": "a", "label": 0}]
+
+    monkeypatch.setattr("agent.eval.runner.load_dataset", fake_load_dataset)
+
+    task = EvalTask(
+        task_id="custom",
+        dataset_name="custom_dataset",
+        dataset_config=None,
+        default_split="test",
+        text_column="text",
+        label_column="label",
+        primary_metric="accuracy",
+    )
+
+    assert load_examples(task) == [{"text": "a", "label": 0}]
+    assert calls == [(("custom_dataset",), {"split": "test"})]
+
+
+def test_evaluate_model_reports_failed_example_index(capsys):
+    """A client error is re-raised after task, model and index go to stderr."""
+    class FailingInferenceClient:
+        def __init__(self):
+            self.calls = 0
+
+        def text_classification(self, text, model):
+            self.calls += 1
+            if self.calls == 2:
+                raise RuntimeError("rate limited")
+            return [{"label": "NEGATIVE", "score": 0.99}]
+
+    task = get_task("glue_sst2")
+    examples = [
+        {"sentence": "bad film", "label": 0},
+        {"sentence": "great film", "label": 1},
+    ]
+
+    with pytest.raises(RuntimeError, match="rate limited"):
+        evaluate_model(
+            task=task,
+            model_id="candidate-model",
+            examples=examples,
+            client=FailingInferenceClient(),
+        )
+
+    captured = capsys.readouterr()
+    assert "task=glue_sst2" in captured.err
+    assert "model=candidate-model" in captured.err
+    assert "example_index=1" in captured.err