diff --git a/.gitignore b/.gitignore index d758b077..520c6db7 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,8 @@ frontend/yarn-error.log* .docker/ # Eval (stale) -eval/ +/eval/ +/eval_runs/ # Project-specific session_logs/ diff --git a/README.md b/README.md index 29fe439b..a60dfd45 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,34 @@ ml-intern --max-iterations 100 "your prompt" ml-intern --no-stream "your prompt" ``` +## Evaluation + +Run a local baseline-vs-candidate benchmark on GLUE SST-2: + +```bash +python eval.py \ + --task glue_sst2 \ + --baseline-model distilbert/distilbert-base-uncased-finetuned-sst-2-english \ + --candidate-model your-org/your-sst2-model \ + --output-dir eval_runs \ + --limit 100 +``` + +Optional cost metadata file: + +```json +{ + "training_cost": 12.5, + "eval_cost": 0.25 +} +``` + +Pass the file with `--cost-file path/to/costs.json`. +Use `--split` to override the default dataset split and `--notes` to attach +free-form context to the saved run record. +The command writes one JSON run record and appends one row to +`leaderboard.jsonl` in the output directory. 
def build_run_record(
    run_id: str,
    comparison: ComparisonResult,
    dataset: str,
    split: str,
    parameters: dict,
    training_cost: float | None,
    eval_cost: float | None,
    notes: str | None,
) -> dict:
    """Assemble the detailed, JSON-ready record for one evaluation run.

    Args:
        run_id: Identifier stored under ``"run_id"``.
        comparison: Baseline-vs-candidate comparison; supplies the task id,
            both model ids, both metric dicts, the primary metric name and
            its delta.
        dataset: Dataset label stored verbatim.
        split: Dataset split stored verbatim.
        parameters: Evaluation settings; deep-copied into the record.
        training_cost: Optional training cost, stored as given.
        eval_cost: Optional evaluation cost, stored as given.
        notes: Optional free-form text, stored as given.

    Returns:
        A new dict. ``"created_at"`` holds the current UTC time as an ISO 8601
        string. The metric dicts and ``parameters`` are deep copies, so later
        mutation of the caller's objects does not alter the record.
    """
    timestamp = datetime.now(UTC).isoformat()
    baseline = comparison.baseline
    candidate = comparison.candidate

    record: dict = {}
    record["run_id"] = run_id
    record["created_at"] = timestamp
    record["task"] = comparison.task_id
    record["dataset"] = dataset
    record["split"] = split
    record["baseline_model"] = baseline.model_id
    record["candidate_model"] = candidate.model_id
    # Snapshot the metric dicts so the record is independent of the inputs.
    record["baseline_metrics"] = deepcopy(baseline.metrics)
    record["candidate_metrics"] = deepcopy(candidate.metrics)
    record["primary_metric"] = comparison.primary_metric
    record["primary_delta"] = comparison.delta
    record["training_cost"] = training_cost
    record["eval_cost"] = eval_cost
    record["notes"] = notes
    record["parameters"] = deepcopy(parameters)
    return record
def write_run_artifact(output_dir: Path, record: dict) -> Path:
    """Write *record* as ``<run_id>.json`` inside *output_dir*.

    The directory is created if needed. The file is created exclusively, so
    an existing artifact with the same run id is never overwritten, even if
    another process creates it between a check and the write.

    Args:
        output_dir: Directory that receives the artifact.
        record: JSON-serializable run record; must contain ``"run_id"``.

    Returns:
        Path of the newly written artifact.

    Raises:
        FileExistsError: An artifact for this run id already exists.
        TypeError: *record* is not JSON-serializable (nothing is written).
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / f"{record['run_id']}.json"
    # Serialize before touching the filesystem so a bad record cannot leave
    # an empty or truncated file that looks like a finished run.
    payload = json.dumps(record, indent=2, sort_keys=True) + "\n"
    try:
        # Mode "x" fails if the file exists, which makes the existence check
        # and the creation a single atomic step.
        with path.open("x", encoding="utf-8") as handle:
            handle.write(payload)
    except FileExistsError as exc:
        raise FileExistsError(f"Run artifact already exists: {path}") from exc
    return path
registry for local evaluation workflows.""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class EvalTask: + task_id: str + dataset_name: str + dataset_config: str | None + default_split: str + text_column: str + label_column: str + primary_metric: str + + +_TASKS: dict[str, EvalTask] = { + "glue_sst2": EvalTask( + task_id="glue_sst2", + dataset_name="glue", + dataset_config="sst2", + default_split="validation", + text_column="sentence", + label_column="label", + primary_metric="accuracy", + ) +} + + +def get_task(task_id: str) -> EvalTask: + try: + return _TASKS[task_id] + except KeyError as exc: + raise ValueError(f"Unknown evaluation task: {task_id}") from exc diff --git a/agent/eval/runner.py b/agent/eval/runner.py new file mode 100644 index 00000000..3b2e4ba4 --- /dev/null +++ b/agent/eval/runner.py @@ -0,0 +1,84 @@ +"""GLUE SST-2 dataset loading and evaluation helpers.""" + +import sys + +from datasets import load_dataset +from huggingface_hub import InferenceClient + +from agent.eval.compare import ModelResult +from agent.eval.registry import EvalTask + + +def normalize_label(label: str | int) -> int: + mapping = { + "NEGATIVE": 0, + "POSITIVE": 1, + "LABEL_0": 0, + "LABEL_1": 1, + "0": 0, + "1": 1, + } + if isinstance(label, int): + if label in (0, 1): + return label + raise ValueError(f"Unsupported label from inference API: {label}") + try: + return mapping[str(label).upper()] + except KeyError as exc: + raise ValueError(f"Unsupported label from inference API: {label}") from exc + + +def load_examples( + task: EvalTask, + split: str | None = None, + limit: int | None = None, +) -> list[dict]: + selected_split = split or task.default_split + dataset_args = [task.dataset_config] if task.dataset_config else [] + dataset = load_dataset(task.dataset_name, *dataset_args, split=selected_split) + records = list(dataset) + if limit is not None: + records = records[:limit] + return records + + +def extract_label(response) -> str | int: + if 
isinstance(response, list): + if not response: + raise ValueError("Empty response from inference API") + response = response[0] + + if isinstance(response, dict): + return response["label"] + + label = getattr(response, "label", None) + if label is None: + raise ValueError("Inference response does not contain a label") + return label + + +def evaluate_model( + task: EvalTask, + model_id: str, + examples: list[dict], + client: InferenceClient | None = None, +) -> ModelResult: + client = client or InferenceClient() + correct = 0 + + for index, example in enumerate(examples): + try: + response = client.text_classification(example[task.text_column], model=model_id) + predicted = normalize_label(extract_label(response)) + except Exception as exc: + print( + "Evaluation failed " + f"(task={task.task_id}, model={model_id}, example_index={index}): {exc}", + file=sys.stderr, + ) + raise + if predicted == example[task.label_column]: + correct += 1 + + accuracy = correct / len(examples) if examples else 0.0 + return ModelResult(model_id=model_id, metrics={"accuracy": accuracy}) diff --git a/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md b/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md new file mode 100644 index 00000000..834029dc --- /dev/null +++ b/docs/superpowers/specs/2026-04-23-evaluation-benchmarking-design.md @@ -0,0 +1,317 @@ +# Evaluation And Benchmarking Design + +Date: 2026-04-23 +Issue: `#84` +Scope: First PR only + +## Goal + +Add a reproducible local evaluation pipeline for comparing a baseline model +against a trained model on a task-specific benchmark. The first PR should +produce machine-readable artifacts and a leaderboard-style summary without +changing the live agent runtime or the web UI. + +## Problem + +The repo can help users train and iterate on models, but it does not yet offer +a standard way to answer a basic question: did the trained model improve over +the baseline, and was the gain worth the cost? 
+ +Issue `#84` asks for: + +- task-specific benchmarks +- baseline vs agent-generated model comparison +- training cost vs performance gain tracking +- a reproducible `eval.py` +- leaderboard-style logging + +This design narrows that request into a first contribution that is small enough +for one PR and reusable by later work. + +## First-PR Scope + +This PR will add: + +- a local CLI entrypoint, `eval.py` +- a small evaluation utility layer separate from `agent/core/*` +- support for comparing exactly two models in one run: + - baseline model + - candidate model +- a task registry for explicit benchmark definitions +- machine-readable run artifacts +- a leaderboard-style artifact for cross-run comparison +- unit tests for registry resolution, comparison logic, and artifact writing +- contributor documentation for how to run the evaluation pipeline + +This PR will not add: + +- frontend leaderboard views +- backend endpoints for evaluation storage +- automatic agent-triggered evaluation +- broad benchmark support for every task family +- heavyweight end-to-end remote evaluation tests + +## Design Summary + +The new feature should live as a standalone evaluation workflow that can be run +by contributors from the command line. The evaluation logic must be isolated +from the live chat session path so that the first PR stays low-risk and easy to +review. + +The system will: + +1. parse CLI arguments +2. resolve a named evaluation task +3. evaluate the baseline model +4. evaluate the candidate model +5. compute metric deltas and cost metadata +6. write a reproducible run record +7. 
update a leaderboard-style artifact + +## Proposed File Layout + +Exact names may shift slightly to match repo conventions, but the structure +should stay close to this: + +```text +eval.py +agent/ + eval/ + __init__.py + registry.py + runner.py + compare.py + artifacts.py +tests/ + unit/ + test_eval_registry.py + test_eval_compare.py + test_eval_artifacts.py +``` + +Rationale: + +- `eval.py` stays as the thin executable entrypoint requested by the issue +- `agent/eval/` keeps the logic reusable without coupling it to `agent/core/*` +- tests focus on deterministic behavior instead of real remote inference + +## CLI Contract + +The script should accept explicit, reproducible inputs instead of hidden +defaults. The minimal interface should include: + +- `--task`: task id from the registry +- `--baseline-model`: baseline model id +- `--candidate-model`: model id to compare against the baseline +- `--output-dir`: where artifacts are written + +Optional but useful first-PR inputs: + +- `--limit`: cap number of evaluation samples for quick runs +- `--split`: override the configured dataset split when supported +- `--cost-file`: path to cost metadata provided by the user +- `--notes`: free-form text stored in the run artifact + +The script should print a concise terminal summary and write the canonical +result to disk. + +## Task Registry + +The first PR should use an explicit task registry instead of a generic +"benchmark anything" abstraction. + +Each task definition should declare: + +- task id +- dataset or benchmark source +- default split +- primary metric name +- any secondary metrics +- any evaluation parameters needed by the runner + +Why a registry: + +- keeps supported benchmarks obvious +- gives each task one clear configuration source +- makes tests deterministic +- provides a clean extension point for future tasks + +The initial implementation should support exactly one built-in task: +`glue_sst2`. 
+ +Why `glue_sst2` first: + +- it matches the issue's request for task-specific benchmarks +- it is small enough for a focused first PR +- it uses a well-known benchmark +- the primary metric is straightforward accuracy +- it can be evaluated without inventing a broad benchmark framework first + +## Evaluation Boundaries + +The evaluator should be structured around small interfaces: + +- task resolution +- model evaluation +- result comparison +- artifact writing + +This separation matters because the first PR should prove the data model and +workflow before taking on complex provider-specific evaluation plumbing. + +The core comparison layer should not depend on how metrics were produced. That +lets unit tests inject stub metric outputs while future integrations can plug +in real benchmark runners. + +## Artifact Model + +Two artifact types should be written. + +### 1. Per-run artifact + +This is the source of truth for one evaluation run. It should be machine +readable and detailed enough to reproduce the setup. + +Format: one JSON file per successful run. + +Recommended fields: + +- `run_id` +- `created_at` +- `task` +- `dataset` +- `split` +- `baseline_model` +- `candidate_model` +- `baseline_metrics` +- `candidate_metrics` +- `primary_metric` +- `primary_delta` +- `training_cost` +- `eval_cost` +- `notes` +- `parameters` + +The `parameters` object should capture the exact evaluation configuration used, +such as sample limit, split, and any task-specific settings. + +### 2. Leaderboard artifact + +This is a compact comparison view across runs. + +Format: append-only JSONL, one row per successful run. + +Recommended fields: + +- `run_id` +- `task` +- `baseline_model` +- `candidate_model` +- `primary_metric` +- `baseline_score` +- `candidate_score` +- `delta` +- `training_cost` +- `eval_cost` +- `created_at` +- `notes` + +The leaderboard should favor readability and scanning over completeness. 
+Detailed debugging and reproducibility should stay in the per-run artifact. + +## Cost Tracking + +Issue `#84` explicitly asks for training cost vs performance gain tracking. +For the first PR, cost data should be accepted as explicit metadata rather than +calculated automatically from platform logs. + +This keeps scope under control while still establishing the schema we need for +future automation. + +Behavior: + +- if a cost file or explicit cost metadata is provided, store it in the run + artifact and leaderboard row +- if no cost metadata is provided, write `null` values rather than guessing + +This makes the data model honest and avoids silently incorrect cost numbers. + +## Error Handling + +The script should fail early and clearly for invalid setup. + +Expected early validation errors: + +- unknown task id +- missing model ids +- unsupported output directory state +- malformed cost metadata + +Runtime evaluation failures should: + +- surface enough context to debug which model and task failed +- avoid producing a misleading "successful" leaderboard entry +- avoid partially written artifacts that look complete + +If one model evaluation fails, the command should exit non-zero and skip both +the per-run artifact and the leaderboard update. The first PR should not invent +a failed-run artifact format. + +## Testing Strategy + +This PR should focus on deterministic unit tests. + +Required coverage: + +- registry resolution for known and unknown tasks +- comparison logic for baseline vs candidate metrics +- derived delta calculation +- per-run artifact writing +- leaderboard append or rewrite behavior +- invalid input handling + +Tests should use stubbed evaluation outputs rather than real model calls. +That keeps the test suite fast, reliable, and suitable for CI. + +Real remote evaluation can be added later once the core data flow is proven. 
+ +## Why This Slice First + +This slice lands the most valuable infrastructure from the issue without +mixing multiple risky changes together. + +It gives the project: + +- a reproducible evaluation entrypoint +- a standard comparison record +- a leaderboard-ready data format +- a clean extension point for future tasks and UI work + +It avoids: + +- changing live agent behavior +- shipping partial backend storage design +- taking on UI requirements before the underlying data model is stable + +## Follow-up Work After This PR + +Natural next steps once this lands: + +- add more task definitions +- integrate real benchmark runners where appropriate +- let the agent trigger the evaluation pipeline +- surface leaderboard results in the web UI +- automate cost collection from training/eval jobs + +## Acceptance Criteria + +This first PR is successful if: + +- contributors can run `eval.py` locally with a task, baseline model, and + candidate model +- the script writes a reproducible run artifact +- the script writes or updates a leaderboard-style artifact +- the comparison includes primary metric deltas +- optional cost metadata is recorded without guessing +- unit tests cover the core workflow logic +- no live agent, backend, or frontend behavior changes are required diff --git a/eval.py b/eval.py new file mode 100644 index 00000000..97f62796 --- /dev/null +++ b/eval.py @@ -0,0 +1,88 @@ +"""Local evaluation CLI for baseline-vs-candidate model comparisons.""" + +import argparse +import json +from pathlib import Path +import uuid + +from agent.eval.artifacts import ( + append_leaderboard_row, + build_leaderboard_row, + build_run_record, + write_run_artifact, +) +from agent.eval.compare import compare_results +from agent.eval.registry import get_task +from agent.eval.runner import evaluate_model, load_examples + + +def positive_int(value: str) -> int: + parsed = int(value) + if parsed <= 0: + raise argparse.ArgumentTypeError("limit must be a positive integer") + return 
parsed + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Evaluate a baseline model against a candidate model.", + ) + parser.add_argument("--task", required=True) + parser.add_argument("--baseline-model", required=True) + parser.add_argument("--candidate-model", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--limit", type=positive_int, default=None) + parser.add_argument("--split", default=None) + parser.add_argument("--cost-file", default=None) + parser.add_argument("--notes", default=None) + return parser.parse_args(argv) + + +def load_cost_metadata(path: str | None) -> tuple[float | None, float | None]: + if path is None: + return None, None + payload = json.loads(Path(path).read_text(encoding="utf-8")) + return payload.get("training_cost"), payload.get("eval_cost") + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + task = get_task(args.task) + training_cost, eval_cost = load_cost_metadata(args.cost_file) + examples = load_examples(task, split=args.split, limit=args.limit) + if not examples: + raise ValueError("No evaluation examples loaded") + + baseline = evaluate_model(task, args.baseline_model, examples) + candidate = evaluate_model(task, args.candidate_model, examples) + + comparison = compare_results( + task_id=task.task_id, + primary_metric=task.primary_metric, + baseline=baseline, + candidate=candidate, + ) + output_dir = Path(args.output_dir) + run_record = build_run_record( + run_id=str(uuid.uuid4()), + comparison=comparison, + dataset=f"{task.dataset_name}/{task.dataset_config}", + split=args.split or task.default_split, + parameters={"limit": args.limit}, + training_cost=training_cost, + eval_cost=eval_cost, + notes=args.notes, + ) + write_run_artifact(output_dir, run_record) + append_leaderboard_row(output_dir, build_leaderboard_row(run_record)) + + print( + f"{comparison.primary_metric}: " + 
f"{comparison.baseline_score:.4f} -> {comparison.candidate_score:.4f} " + f"(delta {comparison.delta:+.4f})" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/unit/test_eval_artifacts.py b/tests/unit/test_eval_artifacts.py new file mode 100644 index 00000000..ea289575 --- /dev/null +++ b/tests/unit/test_eval_artifacts.py @@ -0,0 +1,125 @@ +import json + +import pytest + +from agent.eval.artifacts import ( + append_leaderboard_row, + build_leaderboard_row, + build_run_record, + write_run_artifact, +) +from agent.eval.compare import ModelResult, compare_results + + +def test_write_run_artifact_creates_json_file(tmp_path): + record = { + "run_id": "run-123", + "task": "glue_sst2", + "primary_metric": "accuracy", + "primary_delta": 0.05, + } + + path = write_run_artifact(tmp_path, record) + + assert path.name == "run-123.json" + assert json.loads(path.read_text()) == record + + +def test_write_run_artifact_rejects_duplicate_run_id(tmp_path): + record = { + "run_id": "run-123", + "task": "glue_sst2", + "primary_metric": "accuracy", + "primary_delta": 0.05, + } + + write_run_artifact(tmp_path, record) + + with pytest.raises(FileExistsError, match="Run artifact already exists:"): + write_run_artifact(tmp_path, record) + + +def test_append_leaderboard_row_appends_jsonl_line(tmp_path): + first_row = { + "run_id": "run-123", + "task": "glue_sst2", + "delta": 0.05, + } + second_row = { + "run_id": "run-456", + "task": "glue_sst2", + "delta": -0.01, + } + + path = append_leaderboard_row(tmp_path, first_row) + append_leaderboard_row(tmp_path, second_row) + + lines = path.read_text().strip().splitlines() + assert len(lines) == 2 + assert json.loads(lines[0]) == first_row + assert json.loads(lines[1]) == second_row + + +def test_build_record_and_leaderboard_row_include_cost_metadata(): + comparison = compare_results( + task_id="glue_sst2", + primary_metric="accuracy", + baseline=ModelResult("baseline", {"accuracy": 0.84}), + 
candidate=ModelResult("candidate", {"accuracy": 0.89}), + ) + + record = build_run_record( + run_id="run-123", + comparison=comparison, + dataset="glue/sst2", + split="validation", + parameters={"limit": 100}, + training_cost=12.5, + eval_cost=0.25, + notes="first pass", + ) + row = build_leaderboard_row(record) + + assert record["run_id"] == "run-123" + assert record["training_cost"] == 12.5 + assert record["eval_cost"] == 0.25 + assert record["parameters"] == {"limit": 100} + assert row["baseline_score"] == 0.84 + assert row["candidate_score"] == 0.89 + assert row["delta"] == pytest.approx(0.05) + assert row["training_cost"] == 12.5 + assert row["eval_cost"] == 0.25 + + +def test_build_run_record_snapshots_optional_metadata_and_metrics(): + baseline_metrics = {"accuracy": 0.5} + candidate_metrics = {"accuracy": 0.6} + parameters = {"limit": 10} + comparison = compare_results( + task_id="glue_sst2", + primary_metric="accuracy", + baseline=ModelResult("baseline", baseline_metrics), + candidate=ModelResult("candidate", candidate_metrics), + ) + + record = build_run_record( + run_id="run-789", + comparison=comparison, + dataset="glue/sst2", + split="validation", + parameters=parameters, + training_cost=None, + eval_cost=None, + notes=None, + ) + + baseline_metrics["accuracy"] = 0.0 + candidate_metrics["accuracy"] = 0.0 + parameters["limit"] = 1 + + assert record["baseline_metrics"] == {"accuracy": 0.5} + assert record["candidate_metrics"] == {"accuracy": 0.6} + assert record["parameters"] == {"limit": 10} + assert record["training_cost"] is None + assert record["eval_cost"] is None + assert record["notes"] is None diff --git a/tests/unit/test_eval_cli.py b/tests/unit/test_eval_cli.py new file mode 100644 index 00000000..dd01ab93 --- /dev/null +++ b/tests/unit/test_eval_cli.py @@ -0,0 +1,82 @@ +import json + +import pytest + + +def test_eval_cli_writes_artifacts(monkeypatch, tmp_path): + from eval import main + from agent.eval.compare import ModelResult + + 
monkeypatch.setattr( + "eval.load_examples", + lambda task, split, limit: [{"sentence": "great", "label": 1}], + ) + monkeypatch.setattr( + "eval.evaluate_model", + lambda task, model_id, examples, client=None: ModelResult( + model_id=model_id, + metrics={"accuracy": 1.0 if "candidate" in model_id else 0.0}, + ), + ) + + exit_code = main( + [ + "--task", "glue_sst2", + "--baseline-model", "baseline-model", + "--candidate-model", "candidate-model", + "--output-dir", str(tmp_path), + "--limit", "1", + ] + ) + + assert exit_code == 0 + leaderboard = (tmp_path / "leaderboard.jsonl").read_text().strip().splitlines() + assert len(leaderboard) == 1 + row = json.loads(leaderboard[0]) + assert row["task"] == "glue_sst2" + assert row["delta"] == 1.0 + + +def test_eval_cli_validates_cost_file_before_model_evaluation(monkeypatch, tmp_path): + from eval import main + + calls = {"count": 0} + + def fail_if_called(*args, **kwargs): + calls["count"] += 1 + raise AssertionError("evaluate_model should not be called") + + monkeypatch.setattr("eval.load_examples", lambda task, split, limit: [{"sentence": "great", "label": 1}]) + monkeypatch.setattr("eval.evaluate_model", fail_if_called) + + missing_cost_file = tmp_path / "missing-costs.json" + + with pytest.raises(FileNotFoundError): + main( + [ + "--task", "glue_sst2", + "--baseline-model", "baseline-model", + "--candidate-model", "candidate-model", + "--output-dir", str(tmp_path), + "--cost-file", str(missing_cost_file), + ] + ) + + assert calls["count"] == 0 + + +def test_eval_cli_rejects_non_positive_limit(): + from eval import main + + with pytest.raises(SystemExit) as exc_info: + main( + [ + "--task", "glue_sst2", + "--baseline-model", "baseline-model", + "--candidate-model", "candidate-model", + "--output-dir", "eval_runs", + "--limit", "0", + ] + ) + + assert exc_info.value.code == 2 diff --git a/tests/unit/test_eval_compare.py b/tests/unit/test_eval_compare.py new file mode 100644 index 00000000..f4c8e570 --- /dev/null +++ 
b/tests/unit/test_eval_compare.py @@ -0,0 +1,26 @@ +import pytest + +from agent.eval.compare import ModelResult, compare_results + + +def test_compare_results_computes_primary_metric_delta(): + baseline = ModelResult( + model_id="baseline-model", + metrics={"accuracy": 0.84}, + ) + candidate = ModelResult( + model_id="candidate-model", + metrics={"accuracy": 0.89}, + ) + + comparison = compare_results( + task_id="glue_sst2", + primary_metric="accuracy", + baseline=baseline, + candidate=candidate, + ) + + assert comparison.primary_metric == "accuracy" + assert comparison.baseline_score == 0.84 + assert comparison.candidate_score == 0.89 + assert comparison.delta == pytest.approx(0.05) diff --git a/tests/unit/test_eval_registry.py b/tests/unit/test_eval_registry.py new file mode 100644 index 00000000..9a9489b7 --- /dev/null +++ b/tests/unit/test_eval_registry.py @@ -0,0 +1,21 @@ +import pytest + +from agent.eval import EvalTask, get_task + + +def test_get_task_returns_glue_sst2_definition(): + task = get_task("glue_sst2") + + assert isinstance(task, EvalTask) + assert task.task_id == "glue_sst2" + assert task.dataset_name == "glue" + assert task.dataset_config == "sst2" + assert task.default_split == "validation" + assert task.text_column == "sentence" + assert task.label_column == "label" + assert task.primary_metric == "accuracy" + + +def test_get_task_raises_for_unknown_task(): + with pytest.raises(ValueError, match="^Unknown evaluation task: unknown_task$"): + get_task("unknown_task") diff --git a/tests/unit/test_eval_runner.py b/tests/unit/test_eval_runner.py new file mode 100644 index 00000000..88c9c6a4 --- /dev/null +++ b/tests/unit/test_eval_runner.py @@ -0,0 +1,156 @@ +import pytest + +from agent.eval.registry import EvalTask +from agent.eval.registry import get_task +from agent.eval.runner import evaluate_model, load_examples, normalize_label + + +class FakeInferenceClient: + def __init__(self, predictions): + self._predictions = predictions + self.calls = 
[] + + def text_classification(self, text, model): + self.calls.append((text, model)) + label = self._predictions[len(self.calls) - 1] + return [{"label": label, "score": 0.99}] + + +class FakeLabelObject: + def __init__(self, label): + self.label = label + + +class FakeObjectInferenceClient(FakeInferenceClient): + def text_classification(self, text, model): + self.calls.append((text, model)) + label = self._predictions[len(self.calls) - 1] + return FakeLabelObject(label) + + +def test_normalize_label_supports_hf_text_classification_aliases(): + assert normalize_label("NEGATIVE") == 0 + assert normalize_label("POSITIVE") == 1 + assert normalize_label("LABEL_0") == 0 + assert normalize_label("LABEL_1") == 1 + assert normalize_label("0") == 0 + assert normalize_label("1") == 1 + assert normalize_label(0) == 0 + assert normalize_label(1) == 1 + + +def test_evaluate_model_computes_accuracy_from_examples(): + task = get_task("glue_sst2") + examples = [ + {"sentence": "bad film", "label": 0}, + {"sentence": "great film", "label": 1}, + {"sentence": "fine film", "label": 1}, + ] + client = FakeInferenceClient(["NEGATIVE", "POSITIVE", "NEGATIVE"]) + + result = evaluate_model( + task=task, + model_id="candidate-model", + examples=examples, + client=client, + ) + + assert result.model_id == "candidate-model" + assert result.metrics == {"accuracy": 2 / 3} + assert len(client.calls) == 3 + + +def test_evaluate_model_accepts_object_label_responses(): + task = get_task("glue_sst2") + examples = [{"sentence": "great film", "label": 1}] + client = FakeObjectInferenceClient(["POSITIVE"]) + + result = evaluate_model( + task=task, + model_id="candidate-model", + examples=examples, + client=client, + ) + + assert result.metrics == {"accuracy": 1.0} + + +def test_load_examples_respects_limit(monkeypatch): + class FakeSplit(list): + pass + + def fake_load_dataset(name, config, split): + assert name == "glue" + assert config == "sst2" + assert split == "validation" + return FakeSplit( + 
[ + {"sentence": "a", "label": 0}, + {"sentence": "b", "label": 1}, + {"sentence": "c", "label": 0}, + ] + ) + + monkeypatch.setattr("agent.eval.runner.load_dataset", fake_load_dataset) + + task = get_task("glue_sst2") + examples = load_examples(task, split="validation", limit=2) + + assert examples == [ + {"sentence": "a", "label": 0}, + {"sentence": "b", "label": 1}, + ] + + +def test_load_examples_omits_missing_dataset_config(monkeypatch): + calls = [] + + def fake_load_dataset(*args, **kwargs): + calls.append((args, kwargs)) + return [{"text": "a", "label": 0}] + + monkeypatch.setattr("agent.eval.runner.load_dataset", fake_load_dataset) + + task = EvalTask( + task_id="custom", + dataset_name="custom_dataset", + dataset_config=None, + default_split="test", + text_column="text", + label_column="label", + primary_metric="accuracy", + ) + + assert load_examples(task) == [{"text": "a", "label": 0}] + assert calls == [(("custom_dataset",), {"split": "test"})] + + +def test_evaluate_model_reports_failed_example_index(capsys): + class FailingInferenceClient: + def __init__(self): + self.calls = 0 + + def text_classification(self, text, model): + self.calls += 1 + if self.calls == 2: + raise RuntimeError("rate limited") + return [{"label": "NEGATIVE", "score": 0.99}] + + task = get_task("glue_sst2") + examples = [ + {"sentence": "bad film", "label": 0}, + {"sentence": "great film", "label": 1}, + ] + + with pytest.raises(RuntimeError, match="rate limited"): + evaluate_model( + task=task, + model_id="candidate-model", + examples=examples, + client=FailingInferenceClient(), + ) + + captured = capsys.readouterr() + assert "task=glue_sst2" in captured.err + assert "model=candidate-model" in captured.err + assert "example_index=1" in captured.err