From 2521f53e92104ca76cae6924eca97314b2731ea4 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Jun 2026 22:02:20 +0000 Subject: [PATCH] =?UTF-8?q?Add=20fsspec=20bucket-URL=20sources=20(s3://,?= =?UTF-8?q?=20gs://,=20=E2=80=A6)=20to=20assembly=20transcribe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new aai_cli/remotefs.py (mirroring youtube.py's lazy-import shape) fetches audio from any fsspec-addressable store. fsspec core ships with the CLI; protocol backends (s3fs, gcsfs, adlfs, …) stay user-installed extras surfaced via fsspec's own install hint as a clean CLIError. - Single files: a bucket URL is downloaded to a temp dir and uploaded, like the YouTube path. resolve_audio_source gains an opt-in allow_remote flag, so stream/agent keep rejecting bucket URLs as missing files. - Batch mode: a remote glob (s3://bucket/calls/*.mp3) or trailing-slash folder (filtered by AUDIO_EXTENSIONS, recursive) expands like its local equivalent; sidecar naming now treats any bucket URL like a web URL (slug + hash in cwd). - --show-code rejects bucket URLs up front instead of emitting a script the SDK can't run. Tests drive the real fsspec code paths via its in-process memory:// filesystem (shared memory_fs fixture), so pytest-socket stays armed. https://claude.ai/code/session_01KKPpFcWPk3tyqVRBApdgcj --- .importlinter | 2 + README.md | 3 +- aai_cli/client.py | 11 +- aai_cli/commands/transcribe.py | 9 +- aai_cli/remotefs.py | 134 ++++++++++++++++ aai_cli/transcribe_batch.py | 40 ++++- aai_cli/transcribe_exec.py | 15 +- pyproject.toml | 8 +- .../test_snapshots_help_run.ambr | 10 +- tests/conftest.py | 17 ++ tests/test_remotefs.py | 151 ++++++++++++++++++ tests/test_source_validation.py | 23 +++ tests/test_transcribe.py | 32 ++++ tests/test_transcribe_batch_sources.py | 113 ++++++++++++- tests/test_transcribe_show_code.py | 15 ++ uv.lock | 11 ++ 16 files changed, 573 insertions(+), 21 deletions(-) create mode 100644 aai_cli/remotefs.py create mode 100644 tests/test_remotefs.py diff --git a/.importlinter b/.importlinter index f835c632..07dbce68 100644 --- a/.importlinter +++ b/.importlinter @@ -32,6 +32,7 @@ source_modules = aai_cli.options aai_cli.output aai_cli.procs + aai_cli.remotefs aai_cli.render aai_cli.speak_exec aai_cli.stdio @@ -88,6 +89,7 @@ source_modules = aai_cli.errors aai_cli.eval_data aai_cli.llm + aai_cli.remotefs aai_cli.telemetry aai_cli.wer forbidden_modules = diff --git a/README.md b/README.md index bcbbdbf2..ec74ffff 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ assembly init # scaffold a starter app ## 📋 Key features - **Transcription**: `assembly transcribe` handles files, URLs, and YouTube/podcast pages, with flags for speaker labels, PII redaction, summarization, sentiment, chapters, and more. -- **Batch transcription**: point `assembly transcribe` at a directory or glob (or pipe paths with `--from-stdin`) to transcribe everything concurrently, with sidecar files that make re-runs resumable. Add `--llm "prompt"` to run an LLM prompt over each finished transcript, saved into the sidecars. +- **Batch transcription**: point `assembly transcribe` at a directory or glob — local, or in bucket storage (`"s3://bucket/calls/*.mp3"`, `gs://`, `az://`, …, with the matching fsspec backend such as `s3fs` installed) — or pipe paths with `--from-stdin` to transcribe everything concurrently, with sidecar files that make re-runs resumable. Add `--llm "prompt"` to run an LLM prompt over each finished transcript, saved into the sidecars. - **Real-time streaming**: `assembly stream` transcribes the microphone, a file, or a URL live — on macOS it can capture system audio too. - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal. - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`). @@ -194,6 +194,7 @@ Transcribe in batches — a directory, a glob, or a piped list, resumable on re- ```sh assembly transcribe ./recordings +assembly transcribe "s3://bucket/calls/*.mp3" # needs: pip install s3fs find . -name "*.wav" | assembly transcribe --from-stdin ``` diff --git a/aai_cli/client.py b/aai_cli/client.py index bea44769..0f4946fd 100644 --- a/aai_cli/client.py +++ b/aai_cli/client.py @@ -16,7 +16,7 @@ StreamingParameters, ) -from aai_cli import environments, jsonshape, stdio +from aai_cli import environments, jsonshape, remotefs, stdio from aai_cli.errors import APIError, CLIError, UsageError, auth_failure, is_auth_failure from aai_cli.streaming.diagnostics import classify_error, silence_streaming_logging @@ -45,14 +45,17 @@ def _make_streaming_client(api_key: str) -> _StreamingClientLike: return client -def resolve_audio_source(source: str | None, *, sample: bool, check_local: bool = True) -> str: +def resolve_audio_source( + source: str | None, *, sample: bool, check_local: bool = True, allow_remote: bool = False +) -> str: """The audio reference to use: the hosted --sample clip, else the given path/URL. Shared by `transcribe` and `stream` so both accept a file or URL and `--sample`. A local path that doesn't exist fails here — before any credential resolution or request — so a typo'd filename reads as "file not found", not as an API failure. ``--show-code`` paths pass ``check_local=False``: generating code for a file you - don't have yet is legitimate. + don't have yet is legitimate. Callers that can fetch fsspec bucket URLs (s3://, + gs://, …) pass ``allow_remote=True``; elsewhere those fail as missing files. """ if sample: if source: @@ -68,6 +71,8 @@ def resolve_audio_source(source: str | None, *, sample: bool, check_local: bool suggestion="Or pass --sample to use the hosted demo file.", ) if check_local and not source.startswith(("http://", "https://")): + if allow_remote and remotefs.is_remote_url(source): + return source path = Path(source) if not path.exists(): raise CLIError( diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py index 851e2d1f..3cbf7df7 100644 --- a/aai_cli/commands/transcribe.py +++ b/aai_cli/commands/transcribe.py @@ -19,6 +19,7 @@ ("Transcribe a local file", "assembly transcribe call.mp3"), ("Batch-transcribe a folder", "assembly transcribe ./recordings"), ("Batch-transcribe a glob", 'assembly transcribe "calls/*.mp3"'), + ("Batch-transcribe an S3 prefix", 'assembly transcribe "s3://bucket/calls/*.mp3"'), ("Try it with the hosted sample", "assembly transcribe --sample"), ("Transcribe a YouTube video", "assembly transcribe https://youtu.be/dtp6b76pMak"), ("Transcribe a podcast page", 'assembly transcribe "https://podcasts.apple.com/…"'), @@ -33,7 +34,9 @@ def transcribe( ctx: typer.Context, source: str | None = typer.Argument( - None, help="Audio file, URL, YouTube/podcast URL, or a directory/glob (batch mode)." + None, + help="Audio file, URL, YouTube/podcast URL, bucket URL (s3://, gs://, …), or a " + "directory/glob (batch mode).", ), sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample."), # batch mode @@ -348,6 +351,10 @@ def transcribe( sources already transcribed — with changed --llm prompts it replays just the LLM step, never a second transcription. + Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for + batches (a glob, or a folder ending in /); install the matching fsspec + backend first (e.g. pip install s3fs) and use its usual credentials. + Curated flags cover common features; --config KEY=VALUE and --config-file reach every other field. Analysis (summary, chapters, ...) renders in human mode. """ opts = transcribe_exec.TranscribeOptions( diff --git a/aai_cli/remotefs.py b/aai_cli/remotefs.py new file mode 100644 index 00000000..d234741a --- /dev/null +++ b/aai_cli/remotefs.py @@ -0,0 +1,134 @@ +"""Fetching audio from fsspec-addressable storage (s3://, gs://, az://, sftp://, …). + +The AssemblyAI API fetches plain http(s) URLs itself, and `youtube.py` handles +media-page URLs; this module covers bucket/remote-storage URLs neither can read. +fsspec core ships with the CLI, but each protocol's backend (s3fs, gcsfs, adlfs, +…) is an optional install — a missing one surfaces as a clean install hint, not a +traceback. Credentials are the backend's business (e.g. the standard AWS +environment/config for s3://), never AssemblyAI's API key. +""" + +from __future__ import annotations + +from abc import abstractmethod +from contextlib import contextmanager +from pathlib import Path, PurePosixPath +from typing import TYPE_CHECKING, Protocol + +from aai_cli.errors import CLIError + +if TYPE_CHECKING: + from collections.abc import Generator + +# Schemes that never route through fsspec even though it knows them: the API +# fetches web URLs itself, and file:// paths take the ordinary local checks. +_NON_REMOTE_PROTOCOLS = frozenset({"http", "https", "file", "local"}) + + +class _RemoteFS(Protocol): + """The slice of ``fsspec.AbstractFileSystem`` this module touches.""" + + @abstractmethod + def glob(self, path: str, **kwargs: bool) -> dict[str, dict[str, object]]: + """Expand a glob; with ``detail=True``, a path -> info mapping.""" + + @abstractmethod + def find(self, path: str) -> list[str]: + """Every file under ``path``, recursively.""" + + @abstractmethod + def unstrip_protocol(self, name: str) -> str: + """Re-attach the protocol prefix to a bare path.""" + + @abstractmethod + def get_file(self, rpath: str, lpath: str) -> None: + """Copy one remote file to a local path.""" + + +def is_remote_url(source: str | None) -> bool: + """True if `source` is a URL whose scheme names an fsspec filesystem (s3://, gs://, …). + + http(s) is excluded (the API fetches web URLs itself), as is file:// (local + files go through the ordinary path checks); an unknown scheme is not remote. + """ + if not source or "://" not in source: + return False + protocol = source.partition("://")[0] + if protocol in _NON_REMOTE_PROTOCOLS: + return False + from fsspec.registry import known_implementations + + return protocol in known_implementations + + +def _filesystem(url: str) -> tuple[_RemoteFS, str]: + """The (filesystem, bare path) pair for `url`. + + A protocol whose backend isn't installed becomes a clean CLIError; fsspec's + own ImportError text already names the package to install (e.g. "Install + s3fs to access S3"). + """ + import fsspec + + try: + # fsspec ships no py.typed; the declared annotation pins the pair's type. + pair: tuple[_RemoteFS, str] = fsspec.url_to_fs(url) # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + except ImportError as exc: + protocol = url.partition("://")[0] + raise CLIError( + f"Reading {protocol}:// URLs needs an extra package.", + error_type="remote_backend_missing", + exit_code=2, + suggestion=f"{exc}.", + ) from exc + return pair + + +@contextmanager +def _remote_errors(url: str) -> Generator[None]: + """Normalize backend exceptions for one remote operation into clean CLIErrors.""" + try: + yield + except FileNotFoundError as exc: + raise CLIError( + f"Remote file not found: {url}", + error_type="file_not_found", + exit_code=2, + suggestion="Check the path. A folder needs a trailing '/'; globs (*.mp3) also work.", + ) from exc + except CLIError: + raise + except Exception as exc: # backends raise many types; surface one clean CLI error + reason = " ".join(str(exc).split()) + raise CLIError( + f"Could not access {url}: {reason}", + error_type="remote_error", + exit_code=1, + ) from exc + + +def download(url: str, dest_dir: Path) -> Path: + """Copy the remote file at `url` into `dest_dir` and return its local path.""" + fs, path = _filesystem(url) + local = dest_dir / PurePosixPath(path).name + with _remote_errors(url): + fs.get_file(path, str(local)) + return local + + +def glob_files(url: str) -> list[str]: + """The full URLs of the files matching the remote glob `url`, sorted.""" + fs, path = _filesystem(url) + with _remote_errors(url): + details = fs.glob(path, detail=True) + return sorted( + fs.unstrip_protocol(match) for match, info in details.items() if info.get("type") == "file" + ) + + +def list_files(url: str) -> list[str]: + """The full URLs of every file under the remote folder/prefix `url`, recursively, sorted.""" + fs, path = _filesystem(url) + with _remote_errors(url): + found = fs.find(path) + return sorted(fs.unstrip_protocol(match) for match in found) diff --git a/aai_cli/transcribe_batch.py b/aai_cli/transcribe_batch.py index fef03c22..22f81bd3 100644 --- a/aai_cli/transcribe_batch.py +++ b/aai_cli/transcribe_batch.py @@ -1,7 +1,9 @@ """Batch transcription: directories, globs, and stdin lists with sidecar resume. ``assembly transcribe`` switches to batch mode when the source is a directory or a -glob pattern, or when ``--from-stdin`` supplies one path/URL per line. Sources run +glob pattern — local, or on fsspec-addressable remote storage (an ``s3://…/*.mp3`` +glob, or a trailing-slash folder like ``s3://bucket/calls/``) — or when +``--from-stdin`` supplies one path/URL per line. Sources run concurrently behind a live progress table; each finished source gets a ``.aai.json`` sidecar holding the full transcript. The sidecar doubles as the resume marker — a re-run skips any source whose sidecar records a completed @@ -29,7 +31,7 @@ from rich.live import Live from rich.markup import escape -from aai_cli import client, jsonshape, llm, output, stdio, theme, transcribe_exec +from aai_cli import client, jsonshape, llm, output, remotefs, stdio, theme, transcribe_exec from aai_cli.errors import CLIError, NotAuthenticated, UsageError, mutually_exclusive if TYPE_CHECKING: @@ -73,8 +75,9 @@ def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> lis """The batch source list, or ``None`` when this is a single-source invocation. Batch mode triggers on ``--from-stdin``, a directory (scanned recursively for - audio files), or a glob pattern that names no existing file. A plain file, URL, - ``-`` (audio piped on stdin), or ``--sample`` stays on the single-source path. + audio files), a glob pattern that names no existing file, or a bucket URL + that is a glob or trailing-slash folder. A plain file, URL, ``-`` (audio + piped on stdin), or ``--sample`` stays on the single-source path. """ if from_stdin: return _stdin_sources(source, sample=sample) @@ -84,6 +87,8 @@ def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> lis # whole working directory; instead it stays single-source and fails validation. if not source or sample or source == "-" or source.startswith(_URL_PREFIXES): return None + if remotefs.is_remote_url(source): + return _remote_sources(source) path = Path(source) if path.is_dir(): return _directory_sources(path) @@ -119,6 +124,29 @@ def _directory_sources(path: Path) -> list[str]: return files +def _remote_sources(url: str) -> list[str] | None: + """Batch sources for a bucket/remote URL, or ``None`` when it's a single file. + + Mirrors the local rules: a glob expands to its file matches (sidecars + excluded), a trailing-slash folder to its audio files (recursive, filtered by + ``AUDIO_EXTENSIONS``); anything else is downloaded as one file. + """ + if _GLOB_CHARS.intersection(url): + matches = [u for u in remotefs.glob_files(url) if not u.endswith(SIDECAR_SUFFIX)] + if not matches: + raise UsageError(f"No files match {url}.") + return matches + if url.endswith("/"): + files = [u for u in remotefs.list_files(url) if Path(u).suffix.lower() in AUDIO_EXTENSIONS] + if not files: + raise UsageError( + f"No audio files found under {url}.", + suggestion="Recognized extensions: " + ", ".join(sorted(AUDIO_EXTENSIONS)) + ".", + ) + return files + return None + + def _glob_sources(pattern: str) -> list[str]: # pathlib globs are always relative, so peel an absolute pattern's anchor off # and glob from there ("" anchors at the working directory; Path("") is "."). @@ -159,8 +187,8 @@ def reject_single_source_flags( def sidecar_path(source: str) -> Path: """Where ``source``'s sidecar lives: ``.aai.json`` next to a local file, or - a slug + URL-hash name in the working directory for a URL.""" - if source.startswith(_URL_PREFIXES): + a slug + URL-hash name in the working directory for a URL (web or bucket).""" + if source.startswith(_URL_PREFIXES) or remotefs.is_remote_url(source): digest = hashlib.sha256(source.encode()).hexdigest()[:8] slug = re.sub(r"[^A-Za-z0-9._-]+", "-", source.partition("://")[2]).strip("-.")[:64] return Path(f"{slug}-{digest}{SIDECAR_SUFFIX}") diff --git a/aai_cli/transcribe_exec.py b/aai_cli/transcribe_exec.py index 729c25f2..f0467ff2 100644 --- a/aai_cli/transcribe_exec.py +++ b/aai_cli/transcribe_exec.py @@ -24,6 +24,7 @@ config_builder, llm, output, + remotefs, stdio, transcribe_render, youtube, @@ -155,7 +156,7 @@ def check_source_exists(source: str | None, *, sample: bool) -> None: Stdin (``-``) is exempt: its bytes are buffered at transcription time. """ if source != "-": - client.resolve_audio_source(source, sample=sample) + client.resolve_audio_source(source, sample=sample, allow_remote=True) def run_transcription( @@ -177,7 +178,12 @@ def run_transcription( local.write_bytes(data) return client.transcribe(api_key, str(local), config=transcription_config) - audio = client.resolve_audio_source(source, sample=sample) + audio = client.resolve_audio_source(source, sample=sample, allow_remote=True) + if remotefs.is_remote_url(audio): + # Fetch from bucket/remote storage first; the API can't read s3://-style URLs. + with tempfile.TemporaryDirectory(prefix="aai-remote-") as td: + local = remotefs.download(audio, Path(td)) + return client.transcribe(api_key, str(local), config=transcription_config) if youtube.is_downloadable_url(audio): # Fetch first; AssemblyAI can't read a YouTube/podcast page URL itself. with tempfile.TemporaryDirectory(prefix="aai-yt-") as td: @@ -368,6 +374,11 @@ def _print_show_code(opts: TranscribeOptions, merged: dict[str, object]) -> None Raw stdout, so `--show-code > script.py` runs. No source/--sample needed — fall back to a placeholder path for a pure snippet. """ + if opts.source and remotefs.is_remote_url(opts.source): + raise UsageError( + "--show-code does not support bucket URLs (s3://, gs://, …) yet.", + suggestion="Download the audio first and pass the local file.", + ) audio = ( client.resolve_audio_source(opts.source, sample=opts.sample, check_local=False) if opts.source or opts.sample diff --git a/pyproject.toml b/pyproject.toml index 1a644852..9222e9cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,10 @@ dependencies = [ # `assembly eval` WER scoring uses jiwer (the de-facto standard WER # implementation); wer.py imports it lazily to keep CLI startup fast. "jiwer>=4.0", + # Bucket-URL audio sources (s3://, gs://, …) for `assembly transcribe` (remotefs.py, + # imported lazily). fsspec core only — each protocol's backend (s3fs, gcsfs, adlfs, + # …) stays a user-installed extra surfaced via a clean install hint. + "fsspec>=2026.4.0", ] [project.urls] @@ -271,8 +275,8 @@ paths = ["aai_cli", "tests"] exclude = ["aai_cli/_version.py"] min_confidence = 90 ignore_decorators = ["@app.command", "@app.callback"] -ignore_names = ["app", "capture_output", "download", "healthy", "ist", "memory_keyring", "org", - "refresh"] +ignore_names = ["app", "capture_output", "download", "healthy", "ist", "lpath", "memory_keyring", + "org", "refresh", "rpath"] [tool.deptry] exclude = ["docs", "dist", ".venv", "aai_cli/init/templates"] diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index d4f7c2b9..031a4ba8 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -493,13 +493,17 @@ sources already transcribed — with changed --llm prompts it replays just the LLM step, never a second transcription. + Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for + batches (a glob, or a folder ending in /); install the matching fsspec + backend first (e.g. pip install s3fs) and use its usual credentials. + Curated flags cover common features; --config KEY=VALUE and --config-file reach every other field. Analysis (summary, chapters, ...) renders in human mode. ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ - │ source [SOURCE] Audio file, URL, YouTube/podcast URL, or a │ - │ directory/glob (batch mode). │ + │ source [SOURCE] Audio file, URL, YouTube/podcast URL, bucket URL │ + │ (s3://, gs://, …), or a directory/glob (batch mode). │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --sample Use the hosted │ @@ -647,6 +651,8 @@ $ assembly transcribe ./recordings Batch-transcribe a glob $ assembly transcribe "calls/*.mp3" + Batch-transcribe an S3 prefix + $ assembly transcribe "s3://bucket/calls/*.mp3" Try it with the hosted sample $ assembly transcribe --sample Transcribe a YouTube video diff --git a/tests/conftest.py b/tests/conftest.py index 948de2e7..5b2fb368 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,6 +127,23 @@ def neutralize_shipped_token(monkeypatch): return original +@pytest.fixture +def memory_fs(): + """fsspec's in-process memory filesystem, reset afterwards. + + Lets remote-source tests (memory:// URLs) exercise real fsspec glob/find/ + download code paths while pytest-socket stays armed. The reset matters: + MemoryFileSystem state is process-global (class attributes), so leftover + files would leak across randomly-ordered tests. + """ + import fsspec + from fsspec.implementations.memory import MemoryFileSystem + + yield fsspec.filesystem("memory") + MemoryFileSystem.store.clear() + MemoryFileSystem.pseudo_dirs[:] = [""] + + @pytest.fixture(autouse=True) def tmp_config(monkeypatch, tmp_path): cfg_dir = tmp_path / "config" diff --git a/tests/test_remotefs.py b/tests/test_remotefs.py new file mode 100644 index 00000000..fc8b0021 --- /dev/null +++ b/tests/test_remotefs.py @@ -0,0 +1,151 @@ +"""aai_cli.remotefs: fsspec-backed bucket/remote audio sources. + +Driven against fsspec's in-process memory filesystem (the shared ``memory_fs`` +fixture) so the tests exercise real fsspec glob/find/download code paths while +pytest-socket stays armed. +""" + +from pathlib import Path + +import fsspec +import pytest +from fsspec.implementations.memory import MemoryFileSystem +from fsspec.registry import known_implementations + +from aai_cli import remotefs +from aai_cli.errors import CLIError + + +@pytest.mark.parametrize( + "url", + [ + "s3://bucket/key.mp3", + "gs://bucket/key.wav", + "az://container/key.m4a", + "sftp://host/path/key.mp3", + "memory://calls/a.mp3", + ], +) +def test_is_remote_url_matches_fsspec_schemes(url): + assert remotefs.is_remote_url(url) + + +@pytest.mark.parametrize( + "source", + [ + "https://example.com/a.mp3", # the API fetches web URLs itself + "http://example.com/a.mp3", + "file:///tmp/a.mp3", # local files take the ordinary path checks + "local://tmp/a.mp3", + "./a.mp3", + "/abs/a.mp3", + "C:\\audio\\a.mp3", + "not-a-known-scheme://x/y.mp3", + "-", + "", + None, + ], +) +def test_is_remote_url_rejects_web_local_and_unknown_sources(source): + assert not remotefs.is_remote_url(source) + + +def test_download_copies_the_remote_file_locally(memory_fs, tmp_path): + memory_fs.pipe("/calls/a.mp3", b"remote-bytes") + local = remotefs.download("memory://calls/a.mp3", tmp_path) + assert local == tmp_path / "a.mp3" # keeps the remote file's name + assert local.read_bytes() == b"remote-bytes" + + +def test_download_missing_remote_file_fails_cleanly(memory_fs, tmp_path): + with pytest.raises(CLIError) as exc: + remotefs.download("memory://calls/nope.mp3", tmp_path) + assert exc.value.error_type == "file_not_found" + assert exc.value.exit_code == 2 + assert exc.value.message == "Remote file not found: memory://calls/nope.mp3" + assert "trailing '/'" in (exc.value.suggestion or "") + + +def test_missing_protocol_backend_surfaces_install_hint(tmp_path): + # fsspec knows the protocol but its backend package isn't importable; the + # CLIError must carry fsspec's own install hint, not a traceback. + fsspec.register_implementation( + "aaimissing", "no_such_pkg.NoFS", errtxt="Install no-such-pkg to access aaimissing" + ) + try: + assert remotefs.is_remote_url("aaimissing://bucket/a.mp3") + with pytest.raises(CLIError) as exc: + remotefs.download("aaimissing://bucket/a.mp3", tmp_path) + finally: + known_implementations.pop("aaimissing") + assert exc.value.error_type == "remote_backend_missing" + assert exc.value.exit_code == 2 + assert exc.value.message == "Reading aaimissing:// URLs needs an extra package." + assert "Install no-such-pkg" in (exc.value.suggestion or "") + + +def test_backend_errors_become_one_clean_cli_error(memory_fs, tmp_path, monkeypatch): + # Not-found is special-cased above; anything else (auth, permissions, …) + # collapses to a single one-line CLIError, multi-line reprs flattened. + def _denied(self, rpath, lpath, **kwargs): + raise PermissionError("access\n denied") + + monkeypatch.setattr(MemoryFileSystem, "get_file", _denied) + with pytest.raises(CLIError) as exc: + remotefs.download("memory://calls/a.mp3", tmp_path) + assert exc.value.error_type == "remote_error" + assert exc.value.exit_code == 1 + assert exc.value.message == "Could not access memory://calls/a.mp3: access denied" + + +def test_cli_errors_from_an_operation_pass_through_unwrapped(memory_fs, tmp_path, monkeypatch): + inner = CLIError("already clean", error_type="x", exit_code=3) + + def _raise(self, rpath, lpath, **kwargs): + raise inner + + monkeypatch.setattr(MemoryFileSystem, "get_file", _raise) + with pytest.raises(CLIError) as exc: + remotefs.download("memory://calls/a.mp3", tmp_path) + assert exc.value is inner + + +def test_glob_files_returns_full_urls_for_files_only(memory_fs): + memory_fs.pipe("/calls/b.mp3", b"b") + memory_fs.pipe("/calls/a.mp3", b"a") + memory_fs.pipe("/calls/sub.mp3/inner.mp3", b"i") # a directory matching the glob + memory_fs.pipe("/calls/notes.txt", b"x") + assert remotefs.glob_files("memory://calls/*.mp3") == [ + "memory:///calls/a.mp3", + "memory:///calls/b.mp3", + ] + + +def test_list_files_walks_the_folder_recursively(memory_fs): + memory_fs.pipe("/calls/a.mp3", b"a") + memory_fs.pipe("/calls/sub/b.wav", b"b") + assert remotefs.list_files("memory://calls/") == [ + "memory:///calls/a.mp3", + "memory:///calls/sub/b.wav", + ] + + +def test_results_are_sorted_even_when_the_backend_is_not(memory_fs, monkeypatch): + # Real backends return listing order; the expansion must be deterministic. + monkeypatch.setattr(MemoryFileSystem, "find", lambda self, path: ["/z.mp3", "/a.mp3"]) + assert remotefs.list_files("memory://x/") == ["memory:///a.mp3", "memory:///z.mp3"] + monkeypatch.setattr( + MemoryFileSystem, + "glob", + lambda self, path, **kwargs: {"/z.mp3": {"type": "file"}, "/a.mp3": {"type": "file"}}, + ) + assert remotefs.glob_files("memory://x/*.mp3") == ["memory:///a.mp3", "memory:///z.mp3"] + + +def test_downloaded_urls_round_trip_through_url_to_fs(memory_fs, tmp_path): + # glob_files/list_files emit unstripped URLs (memory:///…); download must + # accept exactly those, since batch mode feeds them straight back in. + memory_fs.pipe("/calls/a.mp3", b"abc") + (url,) = remotefs.glob_files("memory://calls/*.mp3") + local = remotefs.download(url, tmp_path) + assert Path(local).read_bytes() == b"abc" diff --git a/tests/test_source_validation.py b/tests/test_source_validation.py index 94b888e6..2ba88c3a 100644 --- a/tests/test_source_validation.py +++ b/tests/test_source_validation.py @@ -98,6 +98,29 @@ def test_resolve_audio_source_skips_existence_check_for_urls_and_show_code(): ) +def test_resolve_audio_source_remote_url_needs_allow_remote(): + # Only callers that can fetch bucket URLs (transcribe) opt in; everywhere + # else an s3://-style source must keep failing as a missing file. + url = "s3://bucket/key.mp3" + with pytest.raises(CLIError) as exc: + client.resolve_audio_source(url, sample=False) + assert exc.value.error_type == "file_not_found" + assert client.resolve_audio_source(url, sample=False, allow_remote=True) == url + + +def test_resolve_audio_source_allow_remote_still_checks_local_paths(): + with pytest.raises(CLIError) as exc: + client.resolve_audio_source("missing.mp3", sample=False, allow_remote=True) + assert exc.value.error_type == "file_not_found" + + +def test_stream_rejects_bucket_urls_as_missing_files(): + # stream/agent can't fetch bucket URLs; the pre-credential check still fires. + result = runner.invoke(app, ["stream", "s3://bucket/key.mp3"]) + assert result.exit_code == 2 + assert "File not found: s3://bucket/key.mp3" in result.output + + def test_validate_transcript_id_rejects_path_segments(): assert client.validate_transcript_id("t_42-abc") == "t_42-abc" for bad in ("../../etc/passwd", "", "a/b", "id?x=1"): diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 7e6cf694..28991567 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -393,6 +393,38 @@ def test_transcribe_podcast_page_url_downloads_then_transcribes(monkeypatch, moc assert tx.call_args.args[1] == str(fake) # transcribed the downloaded local file +def test_transcribe_remote_bucket_url_downloads_then_transcribes(monkeypatch, mocker, memory_fs): + # A single fsspec bucket URL (memory:// stands in for s3://, gs://, …) is + # fetched to a temp file and that local copy is uploaded. + _auth() + memory_fs.pipe("/bucket/call.mp3", b"remote-bytes") + seen = {} + + def fake(api_key, audio, *, config): + from pathlib import Path + + seen["path"] = audio + seen["bytes"] = Path(audio).read_bytes() + return _fake_transcript(mocker) + + monkeypatch.setattr("aai_cli.transcribe_exec.client.transcribe", fake) + result = runner.invoke(app, ["transcribe", "memory://bucket/call.mp3", "-o", "text"]) + assert result.exit_code == 0 + assert result.output.strip() == "hello world" + assert seen["path"].endswith("/call.mp3") + assert "aai-remote-" in seen["path"] # a temp copy, not the URL itself + assert seen["bytes"] == b"remote-bytes" + + +def test_transcribe_missing_remote_file_fails_cleanly(mocker, memory_fs): + _auth() + tx = mocker.patch("aai_cli.transcribe_exec.client.transcribe", autospec=True) + result = runner.invoke(app, ["transcribe", "memory://bucket/nope.mp3"]) + assert result.exit_code == 2 + assert "Remote file not found" in result.output + tx.assert_not_called() + + def test_transcribe_direct_audio_url_passes_through_without_download(monkeypatch, mocker): # The API fetches direct audio URLs itself; no yt-dlp download must happen. _auth() diff --git a/tests/test_transcribe_batch_sources.py b/tests/test_transcribe_batch_sources.py index 8c835b62..ce750299 100644 --- a/tests/test_transcribe_batch_sources.py +++ b/tests/test_transcribe_batch_sources.py @@ -1,11 +1,15 @@ """Batch-mode source selection for `assembly transcribe`: glob/directory/stdin -expansion and the single-source flags batch mode rejects. +expansion (local and remote/bucket URLs) and the single-source flags batch mode +rejects. The batch *run* (sidecar resume, concurrency, failures, output) lives in -test_transcribe_batch.py. +test_transcribe_batch.py. Remote sources use fsspec's memory:// filesystem (the +shared ``memory_fs`` fixture), so the real download/glob paths run offline. """ +import hashlib import json +from pathlib import Path import pytest from typer.testing import CliRunner @@ -197,9 +201,110 @@ def test_batch_rejects_single_source_flags(tmp_path, extra, flag_name, hint): assert hint in result.output -def test_glob_batch_writes_per_source_sidecars(tmp_path, mocker, monkeypatch): - import hashlib +def _patch_transcribe_recording_uploads(mocker, monkeypatch): + """Patch client.transcribe with a fake that records (name, bytes, path) per upload. + + Remote sources are downloaded to a temp file before upload, so the audio arg + is a local path whose basename and bytes must match the remote object. + """ + seen = [] + + def fake(api_key, audio, *, config): + seen.append((Path(audio).name, Path(audio).read_bytes(), audio)) + t = mocker.MagicMock() + t.id = f"t_{Path(audio).name}" + t.text = "hi" + t.status = "completed" + t.json_response = {"id": t.id, "text": t.text, "status": "completed"} + return t + + monkeypatch.setattr(_TRANSCRIBE, fake) + return seen + + +def test_remote_glob_batch_downloads_and_transcribes(mocker, monkeypatch, memory_fs): + _auth() + memory_fs.pipe("/calls/a.mp3", b"aaa") + memory_fs.pipe("/calls/b.mp3", b"bbb") + memory_fs.pipe("/calls/notes.txt", b"not audio") + seen = _patch_transcribe_recording_uploads(mocker, monkeypatch) + result = runner.invoke(app, ["transcribe", "memory://calls/*.mp3", "--json"]) + assert result.exit_code == 0 + # Each match was fetched to a temp dir (never the cwd) and its bytes uploaded. + assert sorted((name, data) for name, data, _ in seen) == [ + ("a.mp3", b"aaa"), + ("b.mp3", b"bbb"), + ] + assert all("aai-remote-" in path for _, _, path in seen) + records = {r["source"]: r for r in map(json.loads, result.output.splitlines())} + record = records["memory:///calls/a.mp3"] + assert record["status"] == "completed" + digest = hashlib.sha256(b"memory:///calls/a.mp3").hexdigest()[:8] + assert record["sidecar"] == f"calls-a.mp3-{digest}.aai.json" + assert json.loads(Path(record["sidecar"]).read_text())["status"] == "completed" + + +def test_remote_batch_rerun_resumes_from_sidecars(mocker, monkeypatch, memory_fs): + _auth() + memory_fs.pipe("/calls/a.mp3", b"aaa") + seen = _patch_transcribe_recording_uploads(mocker, monkeypatch) + assert runner.invoke(app, ["transcribe", "memory://calls/*.mp3", "--json"]).exit_code == 0 + result = runner.invoke(app, ["transcribe", "memory://calls/*.mp3", "--json"]) + assert result.exit_code == 0 + assert [json.loads(line)["status"] for line in result.output.splitlines()] == ["skipped"] + assert len(seen) == 1 # the re-run never downloaded or transcribed again + +def test_remote_glob_skips_sidecar_files(mocker, monkeypatch, memory_fs): + _auth() + memory_fs.pipe("/calls/a.mp3", b"aaa") + memory_fs.pipe("/calls/stale.aai.json", b"{}") + seen = _patch_transcribe_recording_uploads(mocker, monkeypatch) + result = runner.invoke(app, ["transcribe", "memory://calls/*", "--json"]) + assert result.exit_code == 0 + assert [name for name, _, _ in seen] == ["a.mp3"] + + +def test_remote_folder_scan_is_recursive_and_audio_only(mocker, monkeypatch, memory_fs): + _auth() + memory_fs.pipe("/calls/a.mp3", b"a") + memory_fs.pipe("/calls/sub/b.WAV", b"b") # extension match is case-insensitive + memory_fs.pipe("/calls/notes.txt", b"not audio") + seen = _patch_transcribe_recording_uploads(mocker, monkeypatch) + result = runner.invoke(app, ["transcribe", "memory://calls/", "--json"]) + assert result.exit_code == 0 + assert sorted(name for name, _, _ in seen) == ["a.mp3", "b.WAV"] + + +def test_remote_folder_without_audio_exits_2(memory_fs): + _auth() + memory_fs.pipe("/calls/notes.txt", b"x") + result = runner.invoke(app, ["transcribe", "memory://calls/"]) + assert result.exit_code == 2 + assert "No audio files found under memory://calls/" in result.output + assert ".mp3" in result.output # the suggestion lists recognized extensions + + +def test_remote_glob_without_matches_exits_2(memory_fs): + _auth() + result = runner.invoke(app, ["transcribe", "memory://calls/*.mp3"]) + assert result.exit_code == 2 + assert "No files match" in result.output + + +def test_plain_remote_file_url_stays_single_source(memory_fs): + # No glob and no trailing slash: a bucket URL is one file, like a local path. + for url in ("memory://calls/a.mp3", "memory://calls"): + assert transcribe_batch.expand_sources(url, from_stdin=False, sample=False) is None + + +def test_sidecar_path_for_remote_url_is_slug_plus_hash(): + url = "memory:///calls/a.mp3" + digest = hashlib.sha256(url.encode()).hexdigest()[:8] + assert transcribe_batch.sidecar_path(url) == Path(f"calls-a.mp3-{digest}.aai.json") + + +def test_glob_batch_writes_per_source_sidecars(tmp_path, mocker, monkeypatch): _auth() (tmp_path / "a.mp3").write_bytes(b"aaa") (tmp_path / "b.mp3").write_bytes(b"bbb") diff --git a/tests/test_transcribe_show_code.py b/tests/test_transcribe_show_code.py index d3e87d07..97522455 100644 --- a/tests/test_transcribe_show_code.py +++ b/tests/test_transcribe_show_code.py @@ -126,3 +126,18 @@ def _boom(*a, **k): assert result.exit_code == 0 compile(result.output, "", "exec") assert 'print(f"Speaker {utt.speaker}: {utt.text}")' in result.output + + +def test_transcribe_show_code_rejects_bucket_urls(monkeypatch): + # Generated SDK code can't fetch s3://-style URLs (the API only reads http(s)), + # so a bucket source is rejected up front instead of emitting a broken script. + called = [] + monkeypatch.setattr( + "aai_cli.transcribe_exec.client.transcribe", + lambda *a, **k: called.append(True), + ) + result = runner.invoke(app, ["transcribe", "s3://bucket/call.mp3", "--show-code"]) + assert result.exit_code == 2 + assert "--show-code does not support bucket URLs" in result.output + assert "pass the local file" in result.output + assert called == [] diff --git a/uv.lock b/uv.lock index 39966b36..e817e19f 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ source = { editable = "." } dependencies = [ { name = "assemblyai" }, { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "fsspec" }, { name = "httpx2" }, { name = "jiwer" }, { name = "keyring" }, @@ -73,6 +74,7 @@ dev = [ requires-dist = [ { name = "assemblyai", specifier = ">=0.64.4" }, { name = "audioop-lts", marker = "python_full_version >= '3.13'", specifier = ">=0.2" }, + { name = "fsspec", specifier = ">=2026.4.0" }, { name = "httpx2", specifier = ">=2.0.0" }, { name = "jiwer", specifier = ">=4.0" }, { name = "keyring", specifier = ">=25.7.0" }, @@ -681,6 +683,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" }, ] +[[package]] +name = "fsspec" +version = "2026.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d5/8d/1c51c094345df128ca4a990d633fe1a0ff28726c9e6b3c41ba65087bba1d/fsspec-2026.4.0.tar.gz", hash = "sha256:301d8ac70ae90ef3ad05dcf94d6c3754a097f9b5fe4667d2787aa359ec7df7e4", size = 312760, upload-time = "2026-04-29T20:42:38.635Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/0c/043d5e551459da400957a1395e0febbf771446ff34291afcbe3d8be2a279/fsspec-2026.4.0-py3-none-any.whl", hash = "sha256:11ef7bb35dab8a394fde6e608221d5cf3e8499401c249bebaeaad760a1a8dec2", size = 203402, upload-time = "2026-04-29T20:42:36.842Z" }, +] + [[package]] name = "grimp" version = "3.14"