diff --git a/REFERENCE.md b/REFERENCE.md index 9329af4e..6a89200b 100644 --- a/REFERENCE.md +++ b/REFERENCE.md @@ -92,8 +92,9 @@ clean for piping). `--llm-reduce` is repeatable, each prompt running on the previous one's output; for a single source it extends the `--llm` chain over that transcript. -`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits a single -JSON object (not NDJSON): `--llm` runs a chain over each transcript and attaches -`{"model","steps"}` under the row's `llm` key (the WER score still uses the raw -transcript), and `--llm-reduce` runs one prompt over every item's result and -adds a top-level `reduce` (`{"model","prompts","output"}`) to the object. +`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits one JSON +object per dataset (not NDJSON; a single dataset is therefore one object): +`--llm` runs a chain over each transcript and attaches `{"model","steps"}` under +the row's `llm` key (the WER score still uses the raw transcript), and +`--llm-reduce` runs one prompt over every item's result and adds a top-level +`reduce` (`{"model","prompts","output"}`) to that dataset's object. 
diff --git a/aai_cli/commands/evaluate/__init__.py b/aai_cli/commands/evaluate/__init__.py index dac5c3ea..291bb6ad 100644 --- a/aai_cli/commands/evaluate/__init__.py +++ b/aai_cli/commands/evaluate/__init__.py @@ -34,6 +34,10 @@ "Score a model on 10 rows of a benchmark", "assembly eval tedlium", ), + ( + "Score several benchmarks in one run", + "assembly eval tedlium librispeech earnings22", + ), ( "Compare models on your own audio", "assembly eval calls.csv --speech-model universal-3-pro", @@ -55,9 +59,9 @@ ) def evaluate( ctx: typer.Context, - dataset: str = typer.Argument( + datasets: list[str] = typer.Argument( ..., - help="Hugging Face dataset id, or a local .csv/.jsonl manifest with audio + text columns", + help="Hugging Face dataset ids, or local .csv/.jsonl manifests with audio + text columns", ), split: str | None = typer.Option( None, "--split", help="Hugging Face split to score (default: test)" @@ -114,7 +118,7 @@ def evaluate( ), json_out: bool = options.json_option("Output the rows and summary as one JSON object"), ) -> None: - """Transcribe a dataset and score WER against its reference texts + """Transcribe one or more datasets and score WER against their reference texts Each row's audio is transcribed, then scored against the row's reference text; both are normalized first (lowercased, punctuation stripped) so style @@ -122,6 +126,10 @@ def evaluate( total reference words. Handy for picking a model: run once per --speech-model and compare. + Pass several datasets to score them in one run; each is loaded, scored, and + reported separately (under --json, one JSON object per dataset). The + --limit/--split/--subset/--column flags apply to every dataset. + Datasets come from the Hugging Face Hub (any public dataset its viewer serves with audio + reference columns; gated ones need HF_TOKEN), a local .csv/.jsonl manifest with audio + text columns, or a built-in benchmark @@ -138,7 +146,7 @@ def evaluate( item's result to summarize patterns across the run. 
""" opts = evaluate_exec.EvalOptions( - dataset=dataset, + datasets=datasets, split=split, subset=subset, limit=limit, diff --git a/aai_cli/commands/evaluate/_exec.py b/aai_cli/commands/evaluate/_exec.py index e525723c..2984f6d4 100644 --- a/aai_cli/commands/evaluate/_exec.py +++ b/aai_cli/commands/evaluate/_exec.py @@ -43,7 +43,7 @@ class EvalOptions: """Every `assembly eval` flag as plain data (``--json`` excluded: run_command resolves it into the ``json_mode`` argument).""" - dataset: str + datasets: list[str] split: str | None subset: str | None limit: int @@ -423,13 +423,12 @@ def _render(payload: dict[str, object]) -> RenderableType: ) -def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None: - """Transcribe an evaluation dataset and score WER against its reference texts.""" - # Resolve credentials before any dataset download: a signed-out user must - # not pull the whole dataset only to fail at the first transcription. - api_key = state.resolve_api_key() +def _evaluate_one( + dataset: str, api_key: str, opts: EvalOptions, state: AppState, *, json_mode: bool +) -> dict[str, object]: + """Score one dataset end to end and return its emitted payload.""" data = eval_data.load( - opts.dataset, + dataset, split=opts.split, subset=opts.subset, audio_column=opts.audio_column, @@ -466,11 +465,24 @@ def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None reduce = _run_reduce(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet) if reduce is not None: payload["reduce"] = reduce - output.emit(payload, _render, json_mode=json_mode) - failed = jsonshape.as_int(payload.get("failed")) + return payload + + +def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None: + """Transcribe one or more evaluation datasets and score WER against references.""" + # Resolve credentials before any dataset download: a signed-out user must + # not pull the whole dataset only to fail at the first 
transcription. + api_key = state.resolve_api_key() + failed = 0 + total = 0 + for dataset in opts.datasets: + payload = _evaluate_one(dataset, api_key, opts, state, json_mode=json_mode) + output.emit(payload, _render, json_mode=json_mode) + failed += jsonshape.as_int(payload.get("failed")) + total += jsonshape.as_int(payload.get("items")) if failed: raise CLIError( - f"{failed} of {len(results)} items failed to transcribe.", + f"{failed} of {total} items failed to transcribe.", error_type="eval_failed", suggestion="The summary covers only the items that transcribed.", ) diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr index 4c9de116..e8935ece 100644 --- a/tests/__snapshots__/test_snapshots_help_root.ambr +++ b/tests/__snapshots__/test_snapshots_help_root.ambr @@ -57,8 +57,8 @@ │ time range │ │ dub [sandbox] Dub a video or audio file into another language │ │ caption Burn always-visible captions into a video │ - │ eval Transcribe a dataset and score WER against its reference │ - │ texts │ + │ eval Transcribe one or more datasets and score WER against their │ + │ reference texts │ │ webhooks Receive webhook deliveries on a public dev URL │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮ diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 1d2d4cd6..5b19f1df 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -480,9 +480,9 @@ # name: test_command_help_matches_snapshot[eval] ''' - Usage: assembly eval [OPTIONS] DATASET + Usage: assembly eval [OPTIONS] DATASETS... 
- Transcribe a dataset and score WER against its reference texts + Transcribe one or more datasets and score WER against their reference texts Each row's audio is transcribed, then scored against the row's reference text; both are normalized first (lowercased, punctuation stripped) so style @@ -490,6 +490,10 @@ total reference words. Handy for picking a model: run once per --speech-model and compare. + Pass several datasets to score them in one run; each is loaded, scored, and + reported separately (under --json, one JSON object per dataset). The + --limit/--split/--subset/--column flags apply to every dataset. + Datasets come from the Hugging Face Hub (any public dataset its viewer serves with audio + reference columns; gated ones need HF_TOKEN), a local .csv/.jsonl manifest with audio + text columns, or a built-in benchmark @@ -506,9 +510,10 @@ item's result to summarize patterns across the run. ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ - │ * dataset TEXT Hugging Face dataset id, or a local .csv/.jsonl │ - │ manifest with audio + text columns │ - │ [required] │ + │ * datasets DATASETS... 
Hugging Face dataset ids, or local │ + │ .csv/.jsonl manifests with audio + text │ + │ columns │ + │ [required] │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --split TEXT Hugging Face split to │ @@ -554,6 +559,8 @@ Examples Score a model on 10 rows of a benchmark $ assembly eval tedlium + Score several benchmarks in one run + $ assembly eval tedlium librispeech earnings22 Compare models on your own audio $ assembly eval calls.csv --speech-model universal-3-pro More rows, transcribed four at a time diff --git a/tests/test_eval_command.py b/tests/test_eval_command.py index 695a7454..3e873414 100644 --- a/tests/test_eval_command.py +++ b/tests/test_eval_command.py @@ -55,6 +55,12 @@ def _payload_of(result): ) +def _payloads_of(result): + return [ + json.loads(line) for line in result.output.splitlines() if line.startswith('{"dataset"') + ] + + def _without_latency(row): return {key: value for key, value in row.items() if key != "latency"} @@ -121,6 +127,60 @@ def test_json_payload_shape(tmp_path, mocker): assert "failed" not in payload # only present when a row failed +def _write_two_manifests(tmp_path): + (tmp_path / "a.wav").write_bytes(b"fake-audio") + (tmp_path / "b.wav").write_bytes(b"fake-audio") + (tmp_path / "one.csv").write_text("audio,text\na.wav,hello there\n", encoding="utf-8") + (tmp_path / "two.csv").write_text("audio,text\nb.wav,goodbye now\n", encoding="utf-8") + + +def test_multiple_datasets_emit_one_payload_each(tmp_path, mocker): + _auth() + _write_two_manifests(tmp_path) + # First dataset transcribes perfectly; second gets one of two words wrong. + _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye cow")]) + result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"]) + assert result.exit_code == 0 + payloads = _payloads_of(result) + # One self-describing JSON object per dataset, in argument order. 
+ assert [payload["dataset"] for payload in payloads] == ["one.csv", "two.csv"] + assert payloads[0]["wer"] == 0.0 + assert payloads[1]["wer"] == 0.5 + + +def test_multiple_datasets_render_a_block_per_dataset(tmp_path, mocker): + _auth() + _write_two_manifests(tmp_path) + _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye now")]) + result = runner.invoke(app, ["eval", "one.csv", "two.csv"]) + assert result.exit_code == 0 + # Each dataset gets its own header line in the human output. + assert "one.csv" in result.output + assert "two.csv" in result.output + + +def test_multiple_datasets_aggregate_failures_in_exit_message(tmp_path, mocker): + from aai_cli.core.errors import APIError + + _auth() + _write_two_manifests(tmp_path) + # A row in each dataset fails: the tally pools (sums) across both datasets. + _mock_transcribe(mocker, [APIError("boom one"), APIError("boom two")]) + result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"]) + assert result.exit_code == 1 + err = next( + json.loads(line) for line in result.output.splitlines() if line.startswith('{"error"') + ) + # Exact message (not a substring) so a summed-vs-subtracted tally is caught: + # "-2 of 2 …" would still contain "2 of 2 …". + assert err["error"]["message"] == "2 of 2 items failed to transcribe." 
+ + +def test_at_least_one_dataset_is_required(): + result = runner.invoke(app, ["eval"]) + assert result.exit_code == 2 + + @pytest.mark.parametrize("model", ["universal-3-pro", "universal-2"]) def test_speech_model_flag_reaches_config_and_output(tmp_path, mocker, model): _auth() diff --git a/tests/test_eval_llm.py b/tests/test_eval_llm.py index a0c0c7aa..d451125f 100644 --- a/tests/test_eval_llm.py +++ b/tests/test_eval_llm.py @@ -225,7 +225,7 @@ def fake_status(message, *, json_mode, quiet): def test_llm_options_maps_fields(): opts = evaluate_exec.EvalOptions( - dataset="d", split=None, subset=None, limit=10, audio_column=None, + datasets=["d"], split=None, subset=None, limit=10, audio_column=None, text_column=None, speech_model=None, language_code=None, concurrency=1, llm_prompt=["a"], llm_reduce=["b", "c"], model="m", max_tokens=5, ) # fmt: skip @@ -238,7 +238,7 @@ def test_llm_options_maps_fields(): def test_llm_options_default_to_empty_chains(): opts = evaluate_exec.EvalOptions( - dataset="d", split=None, subset=None, limit=10, audio_column=None, + datasets=["d"], split=None, subset=None, limit=10, audio_column=None, text_column=None, speech_model=None, language_code=None, concurrency=1, llm_prompt=None, llm_reduce=None, model="m", max_tokens=5, ) # fmt: skip