3 changes: 2 additions & 1 deletion .gitignore
@@ -53,7 +53,8 @@ frontend/yarn-error.log*
.docker/

# Eval (stale)
eval/
/eval/
/eval_runs/

# Project-specific
session_logs/
28 changes: 28 additions & 0 deletions README.md
@@ -54,6 +54,34 @@ ml-intern --max-iterations 100 "your prompt"
ml-intern --no-stream "your prompt"
```

## Evaluation

Run a local baseline-vs-candidate benchmark on GLUE SST-2:

```bash
python eval.py \
--task glue_sst2 \
--baseline-model distilbert/distilbert-base-uncased-finetuned-sst-2-english \
--candidate-model your-org/your-sst2-model \
--output-dir eval_runs \
--limit 100
```

Optional cost metadata file:

```json
{
"training_cost": 12.5,
"eval_cost": 0.25
}
```

Pass the file with `--cost-file path/to/costs.json`.
Use `--split` to override the default dataset split and `--notes` to attach
free-form context to the saved run record.
The command writes one JSON run record and appends one row to
`leaderboard.jsonl` in the output directory.
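
For reference, each leaderboard row carries the run id, models, primary-metric scores, delta, and cost metadata. A row looks roughly like this (values are illustrative, keys sorted as written by the tool):

```json
{
  "baseline_model": "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  "baseline_score": 0.91,
  "candidate_model": "your-org/your-sst2-model",
  "candidate_score": 0.93,
  "created_at": "2024-05-01T12:34:56+00:00",
  "delta": 0.02,
  "eval_cost": 0.25,
  "notes": null,
  "primary_metric": "accuracy",
  "run_id": "a1b2c3d4",
  "task": "glue_sst2",
  "training_cost": 12.5
}
```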

## Architecture

### Component Overview
5 changes: 5 additions & 0 deletions agent/eval/__init__.py
@@ -0,0 +1,5 @@
"""Public evaluation registry interface."""

from agent.eval.registry import EvalTask, get_task

__all__ = ["EvalTask", "get_task"]
72 changes: 72 additions & 0 deletions agent/eval/artifacts.py
@@ -0,0 +1,72 @@
"""Artifact builders and writers for evaluation runs."""

from datetime import UTC, datetime
from copy import deepcopy
import json
from pathlib import Path

from agent.eval.compare import ComparisonResult


def build_run_record(
run_id: str,
comparison: ComparisonResult,
dataset: str,
split: str,
parameters: dict,
training_cost: float | None,
eval_cost: float | None,
notes: str | None,
) -> dict:
return {
"run_id": run_id,
"created_at": datetime.now(UTC).isoformat(),
"task": comparison.task_id,
"dataset": dataset,
"split": split,
"baseline_model": comparison.baseline.model_id,
"candidate_model": comparison.candidate.model_id,
"baseline_metrics": deepcopy(comparison.baseline.metrics),
"candidate_metrics": deepcopy(comparison.candidate.metrics),
"primary_metric": comparison.primary_metric,
"primary_delta": comparison.delta,
"training_cost": training_cost,
"eval_cost": eval_cost,
"notes": notes,
"parameters": deepcopy(parameters),
}


def build_leaderboard_row(record: dict) -> dict:
metric = record["primary_metric"]
return {
"run_id": record["run_id"],
"created_at": record["created_at"],
"task": record["task"],
"baseline_model": record["baseline_model"],
"candidate_model": record["candidate_model"],
"primary_metric": metric,
"baseline_score": record["baseline_metrics"][metric],
"candidate_score": record["candidate_metrics"][metric],
"delta": record["primary_delta"],
"training_cost": record["training_cost"],
"eval_cost": record["eval_cost"],
"notes": record["notes"],
}


def write_run_artifact(output_dir: Path, record: dict) -> Path:
output_dir.mkdir(parents=True, exist_ok=True)
path = output_dir / f"{record['run_id']}.json"
if path.exists():
raise FileExistsError(f"Run artifact already exists: {path}")
path.write_text(json.dumps(record, indent=2, sort_keys=True) + "\n", encoding="utf-8")
return path


def append_leaderboard_row(output_dir: Path, row: dict) -> Path:
output_dir.mkdir(parents=True, exist_ok=True)
path = output_dir / "leaderboard.jsonl"
with path.open("a", encoding="utf-8") as handle:
handle.write(json.dumps(row, sort_keys=True) + "\n")
return path
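
For context, a minimal sketch of how these builders and writers compose, assuming two model results are already in hand. Scores are illustrative, the model ids are the placeholders from the README, and `uuid4().hex` as a run id is an assumption (the CLI may generate ids differently).

```python
# Sketch only: illustrative scores, placeholder model ids, assumed run-id scheme.
from pathlib import Path
from uuid import uuid4

from agent.eval.artifacts import (
    append_leaderboard_row,
    build_leaderboard_row,
    build_run_record,
    write_run_artifact,
)
from agent.eval.compare import ModelResult, compare_results

comparison = compare_results(
    task_id="glue_sst2",
    primary_metric="accuracy",
    baseline=ModelResult(
        "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
        {"accuracy": 0.91},
    ),
    candidate=ModelResult("your-org/your-sst2-model", {"accuracy": 0.93}),
)

record = build_run_record(
    run_id=uuid4().hex,  # assumption: any unique string works as a run id
    comparison=comparison,
    dataset="glue",
    split="validation",
    parameters={"limit": 100},
    training_cost=12.5,
    eval_cost=0.25,
    notes="smoke test",
)

output_dir = Path("eval_runs")
write_run_artifact(output_dir, record)                              # writes <run_id>.json
append_leaderboard_row(output_dir, build_leaderboard_row(record))   # appends to leaderboard.jsonl
```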
43 changes: 43 additions & 0 deletions agent/eval/compare.py
@@ -0,0 +1,43 @@
"""Comparison logic for baseline-vs-candidate evaluation results."""

from dataclasses import dataclass


@dataclass(frozen=True)
class ModelResult:
model_id: str
metrics: dict[str, float]


@dataclass(frozen=True)
class ComparisonResult:
task_id: str
primary_metric: str
baseline: ModelResult
candidate: ModelResult

@property
def baseline_score(self) -> float:
return self.baseline.metrics[self.primary_metric]

@property
def candidate_score(self) -> float:
return self.candidate.metrics[self.primary_metric]

@property
def delta(self) -> float:
return self.candidate_score - self.baseline_score


def compare_results(
task_id: str,
primary_metric: str,
baseline: ModelResult,
candidate: ModelResult,
) -> ComparisonResult:
return ComparisonResult(
task_id=task_id,
primary_metric=primary_metric,
baseline=baseline,
candidate=candidate,
)
34 changes: 34 additions & 0 deletions agent/eval/registry.py
@@ -0,0 +1,34 @@
"""Task registry for local evaluation workflows."""

from dataclasses import dataclass


@dataclass(frozen=True)
class EvalTask:
task_id: str
dataset_name: str
dataset_config: str | None
default_split: str
text_column: str
label_column: str
primary_metric: str


_TASKS: dict[str, EvalTask] = {
"glue_sst2": EvalTask(
task_id="glue_sst2",
dataset_name="glue",
dataset_config="sst2",
default_split="validation",
text_column="sentence",
label_column="label",
primary_metric="accuracy",
)
}


def get_task(task_id: str) -> EvalTask:
try:
return _TASKS[task_id]
except KeyError as exc:
raise ValueError(f"Unknown evaluation task: {task_id}") from exc
84 changes: 84 additions & 0 deletions agent/eval/runner.py
@@ -0,0 +1,84 @@
"""GLUE SST-2 dataset loading and evaluation helpers."""

import sys

from datasets import load_dataset
from huggingface_hub import InferenceClient

from agent.eval.compare import ModelResult
from agent.eval.registry import EvalTask


def normalize_label(label: str | int) -> int:
mapping = {
"NEGATIVE": 0,
"POSITIVE": 1,
"LABEL_0": 0,
"LABEL_1": 1,
"0": 0,
"1": 1,
}
if isinstance(label, int):
if label in (0, 1):
return label
raise ValueError(f"Unsupported label from inference API: {label}")
try:
return mapping[str(label).upper()]
except KeyError as exc:
raise ValueError(f"Unsupported label from inference API: {label}") from exc


def load_examples(
task: EvalTask,
split: str | None = None,
limit: int | None = None,
) -> list[dict]:
selected_split = split or task.default_split
dataset_args = [task.dataset_config] if task.dataset_config else []
dataset = load_dataset(task.dataset_name, *dataset_args, split=selected_split)
records = list(dataset)
if limit is not None:
records = records[:limit]
return records


def extract_label(response) -> str | int:
if isinstance(response, list):
if not response:
raise ValueError("Empty response from inference API")
response = response[0]

if isinstance(response, dict):
return response["label"]

label = getattr(response, "label", None)
if label is None:
raise ValueError("Inference response does not contain a label")
return label


def evaluate_model(
task: EvalTask,
model_id: str,
examples: list[dict],
client: InferenceClient | None = None,
) -> ModelResult:
client = client or InferenceClient()
correct = 0

for index, example in enumerate(examples):
try:
response = client.text_classification(example[task.text_column], model=model_id)
predicted = normalize_label(extract_label(response))
except Exception as exc:
print(
"Evaluation failed "
f"(task={task.task_id}, model={model_id}, example_index={index}): {exc}",
file=sys.stderr,
)
raise
if predicted == example[task.label_column]:
correct += 1

accuracy = correct / len(examples) if examples else 0.0
return ModelResult(model_id=model_id, metrics={"accuracy": accuracy})