AssemblyAI · alexkroman · Jun 16, 2026 · Jun 16, 2026
diff --git a/REFERENCE.md b/REFERENCE.md
@@ -92,8 +92,9 @@ clean for piping). `--llm-reduce` is repeatable, each prompt running on the
 previous one's output; for a single source it extends the `--llm` chain over
 that transcript.
 
-`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits a single
-JSON object (not NDJSON): `--llm` runs a chain over each transcript and attaches
-`{"model","steps"}` under the row's `llm` key (the WER score still uses the raw
-transcript), and `--llm-reduce` runs one prompt over every item's result and
-adds a top-level `reduce` (`{"model","prompts","output"}`) to the object.
+`assembly eval` takes the same `--llm`/`--llm-reduce` flags but emits one JSON
+object per dataset (not NDJSON; a single dataset still yields one object):
+`--llm` runs a chain over each transcript and attaches `{"model","steps"}` under
+the row's `llm` key (the WER score still uses the raw transcript), and
+`--llm-reduce` runs one prompt over every item's result in a dataset and adds a
+top-level `reduce` (`{"model","prompts","output"}`) to that dataset's object.
diff --git a/aai_cli/commands/evaluate/__init__.py b/aai_cli/commands/evaluate/__init__.py
@@ -34,6 +34,10 @@
                 "Score a model on 10 rows of a benchmark",
                 "assembly eval tedlium",
             ),
+            (
+                "Score several benchmarks in one run",
+                "assembly eval tedlium librispeech earnings22",
+            ),
             (
                 "Compare models on your own audio",
                 "assembly eval calls.csv --speech-model universal-3-pro",
@@ -55,9 +59,9 @@
 )
 def evaluate(
     ctx: typer.Context,
-    dataset: str = typer.Argument(
+    datasets: list[str] = typer.Argument(
         ...,
-        help="Hugging Face dataset id, or a local .csv/.jsonl manifest with audio + text columns",
+        help="Hugging Face dataset ids, or local .csv/.jsonl manifests with audio + text columns",
     ),
     split: str | None = typer.Option(
         None, "--split", help="Hugging Face split to score (default: test)"
@@ -114,14 +118,18 @@ def evaluate(
     ),
     json_out: bool = options.json_option("Output the rows and summary as one JSON object"),
 ) -> None:
-    """Transcribe a dataset and score WER against its reference texts
+    """Transcribe one or more datasets and score WER against their reference texts
 
     Each row's audio is transcribed, then scored against the row's reference
     text; both are normalized first (lowercased, punctuation stripped) so style
     differences don't count as errors, and the summary pools total errors over
     total reference words. Handy for picking a model: run once per
     --speech-model and compare.
 
+    Pass several datasets to score them in one run; each is loaded, scored, and
+    reported separately (under --json, one JSON object per dataset). The
+    --limit/--split/--subset/--column flags apply to every dataset.
+
     Datasets come from the Hugging Face Hub (any public dataset its viewer
     serves with audio + reference columns; gated ones need HF_TOKEN), a local
     .csv/.jsonl manifest with audio + text columns, or a built-in benchmark
@@ -138,7 +146,7 @@ def evaluate(
     item's result to summarize patterns across the run.
     """
     opts = evaluate_exec.EvalOptions(
-        dataset=dataset,
+        datasets=datasets,
         split=split,
         subset=subset,
         limit=limit,

diff --git a/aai_cli/commands/evaluate/_exec.py b/aai_cli/commands/evaluate/_exec.py
@@ -43,7 +43,7 @@ class EvalOptions:
     """Every `assembly eval` flag as plain data (``--json`` excluded: run_command
     resolves it into the ``json_mode`` argument)."""
 
-    dataset: str
+    datasets: list[str]
     split: str | None
     subset: str | None
     limit: int
@@ -423,13 +423,12 @@ def _render(payload: dict[str, object]) -> RenderableType:
     )
 
 
-def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None:
-    """Transcribe an evaluation dataset and score WER against its reference texts."""
-    # Resolve credentials before any dataset download: a signed-out user must
-    # not pull the whole dataset only to fail at the first transcription.
-    api_key = state.resolve_api_key()
+def _evaluate_one(
+    dataset: str, api_key: str, opts: EvalOptions, state: AppState, *, json_mode: bool
+) -> dict[str, object]:
+    """Score one dataset end to end and return its emitted payload."""
     data = eval_data.load(
-        opts.dataset,
+        dataset,
         split=opts.split,
         subset=opts.subset,
         audio_column=opts.audio_column,
@@ -466,11 +465,24 @@ def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None
         reduce = _run_reduce(api_key, results, llm_opts, json_mode=json_mode, quiet=state.quiet)
         if reduce is not None:
             payload["reduce"] = reduce
-    output.emit(payload, _render, json_mode=json_mode)
-    failed = jsonshape.as_int(payload.get("failed"))
+    return payload
+
+
+def run_evaluate(opts: EvalOptions, state: AppState, *, json_mode: bool) -> None:
+    """Transcribe one or more evaluation datasets and score WER against references."""
+    # Resolve credentials before any dataset download: a signed-out user must
+    # not pull the whole dataset only to fail at the first transcription.
+    api_key = state.resolve_api_key()
+    failed = 0
+    total = 0
+    for dataset in opts.datasets:
+        payload = _evaluate_one(dataset, api_key, opts, state, json_mode=json_mode)
+        output.emit(payload, _render, json_mode=json_mode)
+        failed += jsonshape.as_int(payload.get("failed"))
+        total += jsonshape.as_int(payload.get("items"))
     if failed:
         raise CLIError(
-            f"{failed} of {len(results)} items failed to transcribe.",
+            f"{failed} of {total} items failed to transcribe.",
             error_type="eval_failed",
             suggestion="The summary covers only the items that transcribed.",
         )
diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr
@@ -57,8 +57,8 @@
   │                time range                                                    │
   │ dub            [sandbox] Dub a video or audio file into another language     │
   │ caption        Burn always-visible captions into a video                     │
-  │ eval           Transcribe a dataset and score WER against its reference      │
-  │                texts                                                         │
+  │ eval           Transcribe one or more datasets and score WER against their   │
+  │                reference texts                                               │
   │ webhooks       Receive webhook deliveries on a public dev URL                │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮

diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -480,16 +480,20 @@
 # name: test_command_help_matches_snapshot[eval]
   '''
 
-   Usage: assembly eval [OPTIONS] DATASET
+   Usage: assembly eval [OPTIONS] DATASETS...
 
-   Transcribe a dataset and score WER against its reference texts
+   Transcribe one or more datasets and score WER against their reference texts
 
    Each row's audio is transcribed, then scored against the row's reference
    text; both are normalized first (lowercased, punctuation stripped) so style
    differences don't count as errors, and the summary pools total errors over
    total reference words. Handy for picking a model: run once per
    --speech-model and compare.
 
+   Pass several datasets to score them in one run; each is loaded, scored, and
+   reported separately (under --json, one JSON object per dataset). The
+   --limit/--split/--subset/--column flags apply to every dataset.
+
    Datasets come from the Hugging Face Hub (any public dataset its viewer
    serves with audio + reference columns; gated ones need HF_TOKEN), a local
    .csv/.jsonl manifest with audio + text columns, or a built-in benchmark
@@ -506,9 +510,10 @@
    item's result to summarize patterns across the run.
 
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-  │ *    dataset      TEXT  Hugging Face dataset id, or a local .csv/.jsonl      │
-  │                         manifest with audio + text columns                   │
-  │                         [required]                                           │
+  │ *    datasets      DATASETS...  Hugging Face dataset ids, or local           │
+  │                                 .csv/.jsonl manifests with audio + text      │
+  │                                 columns                                      │
+  │                                 [required]                                   │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
   │ --split                  TEXT                      Hugging Face split to     │
@@ -554,6 +559,8 @@
    Examples
    Score a model on 10 rows of a benchmark
    $ assembly eval tedlium
+   Score several benchmarks in one run
+   $ assembly eval tedlium librispeech earnings22
    Compare models on your own audio
    $ assembly eval calls.csv --speech-model universal-3-pro
    More rows, transcribed four at a time

diff --git a/tests/test_eval_command.py b/tests/test_eval_command.py
@@ -55,6 +55,12 @@ def _payload_of(result):
     )
 
 
+def _payloads_of(result):
+    return [
+        json.loads(line) for line in result.output.splitlines() if line.startswith('{"dataset"')
+    ]
+
+
 def _without_latency(row):
     return {key: value for key, value in row.items() if key != "latency"}
 
@@ -121,6 +127,60 @@ def test_json_payload_shape(tmp_path, mocker):
     assert "failed" not in payload  # only present when a row failed
 
 
+def _write_two_manifests(tmp_path):
+    (tmp_path / "a.wav").write_bytes(b"fake-audio")
+    (tmp_path / "b.wav").write_bytes(b"fake-audio")
+    (tmp_path / "one.csv").write_text("audio,text\na.wav,hello there\n", encoding="utf-8")
+    (tmp_path / "two.csv").write_text("audio,text\nb.wav,goodbye now\n", encoding="utf-8")
+
+
+def test_multiple_datasets_emit_one_payload_each(tmp_path, mocker):
+    _auth()
+    _write_two_manifests(tmp_path)
+    # First dataset transcribes perfectly; second gets one of two words wrong.
+    _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye cow")])
+    result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"])
+    assert result.exit_code == 0
+    payloads = _payloads_of(result)
+    # One self-describing JSON object per dataset, in argument order.
+    assert [payload["dataset"] for payload in payloads] == ["one.csv", "two.csv"]
+    assert payloads[0]["wer"] == 0.0
+    assert payloads[1]["wer"] == 0.5
+
+
+def test_multiple_datasets_render_a_block_per_dataset(tmp_path, mocker):
+    _auth()
+    _write_two_manifests(tmp_path)
+    _mock_transcribe(mocker, [_transcript("hello there"), _transcript("goodbye now")])
+    result = runner.invoke(app, ["eval", "one.csv", "two.csv"])
+    assert result.exit_code == 0
+    # Each dataset gets its own header line in the human output.
+    assert "one.csv" in result.output
+    assert "two.csv" in result.output
+
+
+def test_multiple_datasets_aggregate_failures_in_exit_message(tmp_path, mocker):
+    from aai_cli.core.errors import APIError
+
+    _auth()
+    _write_two_manifests(tmp_path)
+    # A row in each dataset fails: the tally pools (sums) across both datasets.
+    _mock_transcribe(mocker, [APIError("boom one"), APIError("boom two")])
+    result = runner.invoke(app, ["eval", "one.csv", "two.csv", "--json"])
+    assert result.exit_code == 1
+    err = next(
+        json.loads(line) for line in result.output.splitlines() if line.startswith('{"error"')
+    )
+    # Exact message (not a substring) so a summed-vs-subtracted tally is caught:
+    # "-2 of 2 …" would still contain "2 of 2 …".
+    assert err["error"]["message"] == "2 of 2 items failed to transcribe."
+
+
+def test_at_least_one_dataset_is_required():
+    result = runner.invoke(app, ["eval"])
+    assert result.exit_code == 2
+
+
 @pytest.mark.parametrize("model", ["universal-3-pro", "universal-2"])
 def test_speech_model_flag_reaches_config_and_output(tmp_path, mocker, model):
     _auth()

diff --git a/tests/test_eval_llm.py b/tests/test_eval_llm.py
@@ -225,7 +225,7 @@ def fake_status(message, *, json_mode, quiet):
 
 def test_llm_options_maps_fields():
     opts = evaluate_exec.EvalOptions(
-        dataset="d", split=None, subset=None, limit=10, audio_column=None,
+        datasets=["d"], split=None, subset=None, limit=10, audio_column=None,
         text_column=None, speech_model=None, language_code=None, concurrency=1,
         llm_prompt=["a"], llm_reduce=["b", "c"], model="m", max_tokens=5,
     )  # fmt: skip
@@ -238,7 +238,7 @@ def test_llm_options_maps_fields():
 
 def test_llm_options_default_to_empty_chains():
     opts = evaluate_exec.EvalOptions(
-        dataset="d", split=None, subset=None, limit=10, audio_column=None,
+        datasets=["d"], split=None, subset=None, limit=10, audio_column=None,
         text_column=None, speech_model=None, language_code=None, concurrency=1,
         llm_prompt=None, llm_reduce=None, model="m", max_tokens=5,
     )  # fmt: skip