Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions REFERENCE.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ clean for piping). `--llm-reduce` is repeatable, each prompt running on the
previous one's output; for a single source it extends the `--llm` chain over
that transcript.

`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits a single
JSON object (not NDJSON): `--llm` runs a chain over each transcript and attaches
`{"model","steps"}` under the row's `llm` key (the WER score still uses the raw
transcript), and `--llm-reduce` runs one prompt over every item's result and
adds a top-level `reduce` (`{"model","prompts","output"}`) to the object.
`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits one JSON
object per dataset (not NDJSON; a single dataset is therefore one object):
`--llm` runs a chain over each transcript and attaches `{"model","steps"}` under
the row's `llm` key (the WER score still uses the raw transcript), and
`--llm-reduce` runs one prompt over every item's result and adds a top-level
`reduce` (`{"model","prompts","output"}`) to the object.
16 changes: 12 additions & 4 deletions aai_cli/commands/evaluate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
"Score a model on 10 rows of a benchmark",
"assembly eval tedlium",
),
(
"Score several benchmarks in one run",
"assembly eval tedlium librispeech earnings22",
),
(
"Compare models on your own audio",
"assembly eval calls.csv --speech-model universal-3-pro",
Expand All @@ -55,9 +59,9 @@
)
def evaluate(
ctx: typer.Context,
dataset: str = typer.Argument(
datasets: list[str] = typer.Argument(
...,
help="Hugging Face dataset id, or a local .csv/.jsonl manifest with audio + text columns",
help="Hugging Face dataset ids, or local .csv/.jsonl manifests with audio + text columns",
),
split: str | None = typer.Option(
None, "--split", help="Hugging Face split to score (default: test)"
Expand Down Expand Up @@ -114,14 +118,18 @@ def evaluate(
),
json_out: bool = options.json_option("Output the rows and summary as one JSON object"),
) -> None:
"""Transcribe a dataset and score WER against its reference texts
"""Transcribe one or more datasets and score WER against their reference texts

Each row's audio is transcribed, then scored against the row's reference
text; both are normalized first (lowercased, punctuation stripped) so style
differences don't count as errors, and the summary pools total errors over
total reference words. Handy for picking a model: run once per
--speech-model and compare.

Pass several datasets to score them in one run; each is loaded, scored, and
reported separately (under --json, one JSON object per dataset). The
--limit/--split/--subset/--column flags apply to every dataset.

Datasets come from the Hugging Face Hub (any public dataset its viewer
serves with audio + reference columns; gated ones need HF_TOKEN), a local
.csv/.jsonl manifest with audio + text columns, or a built-in benchmark
Expand All @@ -138,7 +146,7 @@ def evaluate(
item's result to summarize patterns across the run.
"""
opts = evaluate_exec.EvalOptions(
dataset=dataset,
datasets=datasets,
split=split,
subset=subset,
limit=limit,
Expand Down
32 changes: 22 additions & 10 deletions aai_cli/commands/evaluate/_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class EvalOptions:
"""Every `assembly eval` flag as plain data (``--json`` excluded: run_command
resolves it into the ``json_mode`` argument)."""

dataset: str
datasets: list[str]
split: str | None
subset: str | None
limit: int
Expand Down Expand Up @@ -423,13 +423,12 @@ def _render(payload: dict[str, object]) -> RenderableType:
)


def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None:
"""Transcribe an evaluation dataset and score WER against its reference texts."""
# Resolve credentials before any dataset download: a signed-out user must
# not pull the whole dataset only to fail at the first transcription.
api_key = state.resolve_api_key()
def _evaluate_one(
dataset: str, api_key: str, opts: EvalOptions, state: AppState, *, json_mode: bool
) -> dict[str, object]:
"""Score one dataset end to end and return its emitted payload."""
data = eval_data.load(
opts.dataset,
dataset,
split=opts.split,
subset=opts.subset,
audio_column=opts.audio_column,
Expand Down Expand Up @@ -466,11 +465,24 @@ def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None
reduce = _run_reduce(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet)
if reduce is not None:
payload["reduce"] = reduce
output.emit(payload, _render, json_mode=json_mode)
failed = jsonshape.as_int(payload.get("failed"))
return payload


def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None:
    """Transcribe one or more evaluation datasets and score WER against references.

    Every entry of ``opts.datasets`` is scored in argument order and its
    payload is emitted as soon as that dataset finishes. The ``failed`` and
    ``items`` counts of each payload are summed across datasets.

    Raises:
        CLIError: with ``error_type="eval_failed"`` once all datasets have
            been emitted, if the pooled ``failed`` count is non-zero.
    """
    # Resolve credentials before any dataset download: a signed-out user must
    # not pull the whole dataset only to fail at the first transcription.
    api_key = state.resolve_api_key()
    failed = 0
    total = 0
    for dataset in opts.datasets:
        payload = _evaluate_one(dataset, api_key, opts, state, json_mode=json_mode)
        output.emit(payload, _render, json_mode=json_mode)
        # Pool the tallies so the final error reports the whole run, not just
        # the last dataset.
        failed += jsonshape.as_int(payload.get("failed"))
        total += jsonshape.as_int(payload.get("items"))
    if failed:
        # Only the pooled message is raised; the per-dataset ``results`` list
        # is not available in this scope.
        raise CLIError(
            f"{failed} of {total} items failed to transcribe.",
            error_type="eval_failed",
            suggestion="The summary covers only the items that transcribed.",
        )
4 changes: 2 additions & 2 deletions tests/__snapshots__/test_snapshots_help_root.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@
│ time range │
│ dub [sandbox] Dub a video or audio file into another language │
│ caption Burn always-visible captions into a video │
│ eval Transcribe a dataset and score WER against its reference
texts
│ eval Transcribe one or more datasets and score WER against their
reference texts
│ webhooks Receive webhook deliveries on a public dev URL │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮
Expand Down
17 changes: 12 additions & 5 deletions tests/__snapshots__/test_snapshots_help_run.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -480,16 +480,20 @@
# name: test_command_help_matches_snapshot[eval]
'''

Usage: assembly eval [OPTIONS] DATASET
Usage: assembly eval [OPTIONS] DATASETS...

Transcribe a dataset and score WER against its reference texts
Transcribe one or more datasets and score WER against their reference texts

Each row's audio is transcribed, then scored against the row's reference
text; both are normalized first (lowercased, punctuation stripped) so style
differences don't count as errors, and the summary pools total errors over
total reference words. Handy for picking a model: run once per
--speech-model and compare.

Pass several datasets to score them in one run; each is loaded, scored, and
reported separately (under --json, one JSON object per dataset). The
--limit/--split/--subset/--column flags apply to every dataset.

Datasets come from the Hugging Face Hub (any public dataset its viewer
serves with audio + reference columns; gated ones need HF_TOKEN), a local
.csv/.jsonl manifest with audio + text columns, or a built-in benchmark
Expand All @@ -506,9 +510,10 @@
item's result to summarize patterns across the run.

╭─ Arguments ──────────────────────────────────────────────────────────────────╮
│ * dataset TEXT Hugging Face dataset id, or a local .csv/.jsonl │
│ manifest with audio + text columns │
│ [required] │
│ * datasets DATASETS... Hugging Face dataset ids, or local │
│ .csv/.jsonl manifests with audio + text │
│ columns │
│ [required] │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --split TEXT Hugging Face split to │
Expand Down Expand Up @@ -554,6 +559,8 @@
Examples
Score a model on 10 rows of a benchmark
$ assembly eval tedlium
Score several benchmarks in one run
$ assembly eval tedlium librispeech earnings22
Compare models on your own audio
$ assembly eval calls.csv --speech-model universal-3-pro
More rows, transcribed four at a time
Expand Down
60 changes: 60 additions & 0 deletions tests/test_eval_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ def _payload_of(result):
)


def _payloads_of(result):
    """Return every per-dataset JSON payload found in ``result.output``.

    Only output lines that start with ``{"dataset"`` are parsed; every other
    line is ignored. Payloads are returned in the order they were printed.
    """
    return [
        json.loads(line) for line in result.output.splitlines() if line.startswith('{"dataset"')
    ]


def _without_latency(row):
    """Return a copy of ``row`` with the ``"latency"`` key left out.

    The input mapping is not modified; the order of the remaining keys is kept.
    """
    return {key: value for key, value in row.items() if key != "latency"}

Expand Down Expand Up @@ -121,6 +127,60 @@ def test_json_payload_shape(tmp_path, mocker):
assert "failed" not in payload # only present when a row failed


def _write_two_manifests(tmp_path):
    """Create two one-row CSV manifests and their audio files in ``tmp_path``.

    ``one.csv`` points at ``a.wav`` with reference text "hello there" and
    ``two.csv`` points at ``b.wav`` with reference text "goodbye now". Both
    audio files hold the same placeholder bytes.
    """
    (tmp_path / "a.wav").write_bytes(b"fake-audio")
    (tmp_path / "b.wav").write_bytes(b"fake-audio")
    (tmp_path / "one.csv").write_text("audio,text\na.wav,hello there\n", encoding="utf-8")
    (tmp_path / "two.csv").write_text("audio,text\nb.wav,goodbye now\n", encoding="utf-8")


def test_multiple_datasets_emit_one_payload_each(tmp_path, mocker):
    """With ``--json``, two dataset arguments yield two payloads in argument order."""
    _auth()
    _write_two_manifests(tmp_path)
    # First dataset transcribes perfectly; second gets one of two words wrong.
    _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye cow")])
    result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"])
    assert result.exit_code == 0
    payloads = _payloads_of(result)
    # One self-describing JSON object per dataset, in argument order.
    assert [payload["dataset"] for payload in payloads] == ["one.csv", "two.csv"]
    # Each payload carries its own WER rather than a score pooled across datasets.
    assert payloads[0]["wer"] == 0.0
    assert payloads[1]["wer"] == 0.5


def test_multiple_datasets_render_a_block_per_dataset(tmp_path, mocker):
    """Without ``--json``, the human-readable output names every dataset passed."""
    _auth()
    _write_two_manifests(tmp_path)
    _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye now")])
    result = runner.invoke(app, ["eval", "one.csv", "two.csv"])
    assert result.exit_code == 0
    # Each dataset gets its own header line in the human output.
    assert "one.csv" in result.output
    assert "two.csv" in result.output


def test_multiple_datasets_aggregate_failures_in_exit_message(tmp_path, mocker):
    """Failures from every dataset are summed into the single final error message."""
    from aai_cli.core.errors import APIError

    _auth()
    _write_two_manifests(tmp_path)
    # A row in each dataset fails: the tally pools (sums) across both datasets.
    _mock_transcribe(mocker, [APIError("boom one"), APIError("boom two")])
    result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"])
    assert result.exit_code == 1
    # The error object is the output line that starts with the "error" key.
    err = next(
        json.loads(line) for line in result.output.splitlines() if line.startswith('{"error"')
    )
    # Exact message (not a substring) so a summed-vs-subtracted tally is caught:
    # "-2 of 2 …" would still contain "2 of 2 …".
    assert err["error"]["message"] == "2 of 2 items failed to transcribe."


def test_at_least_one_dataset_is_required():
    """Invoking ``eval`` with no dataset argument exits with status 2."""
    result = runner.invoke(app, ["eval"])
    # NOTE(review): 2 is presumably Click/Typer's usage-error exit code for a
    # missing required argument; confirm against the CLI framework docs.
    assert result.exit_code == 2


@pytest.mark.parametrize("model", ["universal-3-pro", "universal-2"])
def test_speech_model_flag_reaches_config_and_output(tmp_path, mocker, model):
_auth()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eval_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def fake_status(message, *, json_mode, quiet):

def test_llm_options_maps_fields():
opts = evaluate_exec.EvalOptions(
dataset="d", split=None, subset=None, limit=10, audio_column=None,
datasets=["d"], split=None, subset=None, limit=10, audio_column=None,
text_column=None, speech_model=None, language_code=None, concurrency=1,
llm_prompt=["a"], llm_reduce=["b", "c"], model="m", max_tokens=5,
) # fmt: skip
Expand All @@ -238,7 +238,7 @@ def test_llm_options_maps_fields():

def test_llm_options_default_to_empty_chains():
opts = evaluate_exec.EvalOptions(
dataset="d", split=None, subset=None, limit=10, audio_column=None,
datasets=["d"], split=None, subset=None, limit=10, audio_column=None,
text_column=None, speech_model=None, language_code=None, concurrency=1,
llm_prompt=None, llm_reduce=None, model="m", max_tokens=5,
) # fmt: skip
Expand Down
Loading