From 910a294f8dde00fca94219e64b9231155c91d88a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 21:27:46 +0000 Subject: [PATCH 1/3] Transcribe a podcast RSS feed as a resumable batch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `assembly transcribe <feed-url>` now expands a podcast RSS/Atom feed into its episode enclosure URLs and runs them through the existing batch path — one `.aai.json` sidecar per episode, resumable, concurrent, and compatible with `--llm`/`--llm-reduce`. The enclosures are direct media URLs the API fetches itself, so no per-episode yt-dlp download is needed (unlike a podcast *page*). Detection is deliberately narrow to avoid surprise fetches: only an http(s) URL whose path is feed-shaped (extensionless or `.xml`/`.rss`/`.atom`) and that no dedicated yt-dlp extractor already claims is probed, the response body is bounded to 10 MB, binary media content types are skipped, and only content that actually parses as a feed with at least one enclosure is treated as one. `--show-code` skips the probe entirely so it never touches the network. Docs (README, transcribe help/docstring, aai-cli skill reference) updated to list RSS feeds alongside files, URLs, and YouTube/podcast pages. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq --- README.md | 8 +- aai_cli/app/transcribe/feed.py | 125 +++++++ aai_cli/app/transcribe/run.py | 7 +- aai_cli/app/transcribe/sources.py | 27 +- aai_cli/commands/transcribe.py | 17 +- .../aai-cli/references/transcription.md | 9 +- .../test_snapshots_help_run.ambr | 30 +- tests/test_transcribe_feed.py | 311 ++++++++++++++++++ 8 files changed, 504 insertions(+), 30 deletions(-) create mode 100644 aai_cli/app/transcribe/feed.py create mode 100644 tests/test_transcribe_feed.py diff --git a/README.md b/README.md index 242008a8..8d89f8de 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![License](https://img.shields.io/badge/license-MIT-D6402E)](https://github.com/AssemblyAI/cli/blob/main/LICENSE) [![Docs](https://img.shields.io/badge/docs-assemblyai-D6402E)](https://www.assemblyai.com/docs) -The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, and YouTube/podcast pages, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps. +The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, YouTube/podcast pages, and whole podcast RSS feeds, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps.

The assembly CLI welcome screen, listing command groups for transcription, streaming, voice agents, app scaffolding, and account management @@ -44,7 +44,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins | Command | What it does | | :--- | :--- | -| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs | +| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, podcast RSS feeds, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs | | `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too | | `assembly dictate` | Push-to-talk dictation: press Enter to record, Enter again for instant text (Sync STT API, up to 120 s per utterance) | | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal | @@ -285,11 +285,13 @@ assembly transcribe video.mp4 -o srt # captions assembly transcribe call.mp3 --speaker-labels --summarization --json ``` -Transcribe in batches — a directory, a glob, or a piped list, resumable on re-run: +Transcribe in batches — a directory, a glob, a piped list, or a whole podcast +RSS feed (every episode becomes one source), resumable on re-run: ```sh assembly transcribe ./recordings assembly transcribe "s3://bucket/calls/*.mp3" # needs: pip install s3fs +assembly transcribe "https://feeds.simplecast.com/54nAGcIl" # every episode in the feed find . 
-name "*.wav" | assembly transcribe --from-stdin ``` diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py new file mode 100644 index 00000000..3b4df05a --- /dev/null +++ b/aai_cli/app/transcribe/feed.py @@ -0,0 +1,125 @@ +"""Podcast RSS/Atom feed expansion for ``assembly transcribe``. + +A feed URL names a whole show, so transcribing it means transcribing every +episode. ``feed_episode_urls`` fetches the URL and, when it parses as an RSS or +Atom feed carrying audio/video enclosures, returns those enclosure URLs (in feed +order — newest first) for the batch path to transcribe, one resumable sidecar per +episode. The enclosures are direct media URLs the API fetches itself, so — unlike +a YouTube or podcast *page*, which yt-dlp downloads first — no local download step +is needed. + +Detection is deliberately narrow so a direct media URL or ordinary web page still +falls through to the single-source path untouched (and is never fetched twice): +only an http(s) URL whose path is feed-shaped — no extension, or one of +``.xml``/``.rss``/``.atom`` — and that no dedicated yt-dlp extractor already claims +is sniffed, the response body is bounded, and only content that actually parses as +a feed with at least one enclosure is treated as a feed. +""" + +from __future__ import annotations + +import html +import re +from pathlib import PurePosixPath +from urllib.parse import urlsplit + +from aai_cli.core import youtube + +# A feed lives at an extensionless URL (e.g. feeds.simplecast.com/) or a feed +# document (.xml/.rss/.atom). Every other path — .mp3, .txt, .pdf — is never a feed, +# so it is left for the single-source path and never fetched here. +_FEED_URL_SUFFIXES = frozenset({"", ".xml", ".rss", ".atom"}) + +# Bound the download so a hostile or huge URL can't exhaust memory; 10 MB of feed +# already holds thousands of episodes, far past any realistic batch. 
+_MAX_FEED_BYTES = 10 * 1024 * 1024 +_FETCH_TIMEOUT_SECONDS = 15.0 + +# A feed body must announce itself with an or root element +# (namespaced or not) before its s are trusted, so a stray HTML page +# that merely contains the word "enclosure" is never mistaken for a podcast. +_FEED_ROOT_RE = re.compile(r"<\s*(?:[\w.-]+:)?(?:rss|feed)\b", re.IGNORECASE) +# RSS 2.0 episodes: . The url +# attribute can sit anywhere in the tag and use either quote style. +_ENCLOSURE_TAG_RE = re.compile(r"<\s*enclosure\b([^>]*)>", re.IGNORECASE) +# Atom episodes: (rel/href in either order). +_LINK_TAG_RE = re.compile(r"<\s*link\b([^>]*)>", re.IGNORECASE) +_URL_ATTR_RE = re.compile(r"""\burl\s*=\s*["']([^"']+)["']""", re.IGNORECASE) +_HREF_ATTR_RE = re.compile(r"""\bhref\s*=\s*["']([^"']+)["']""", re.IGNORECASE) +_REL_ENCLOSURE_RE = re.compile(r"""\brel\s*=\s*["']enclosure["']""", re.IGNORECASE) + + +def feed_episode_urls(url: str) -> list[str] | None: + """The episode media URLs if `url` is a podcast feed, else ``None``. + + Returns ``None`` (stay single-source) for a direct-media URL, a yt-dlp page, + an unreachable URL, or any content that isn't a feed carrying enclosures. 
+ """ + if not _looks_like_feed_url(url) or youtube.is_downloadable_url(url): + return None + body = _fetch(url) + if body is None: + return None + return _episode_urls(body) + + +def _looks_like_feed_url(url: str) -> bool: + """True when the URL path is feed-shaped: extensionless or a feed document.""" + suffix = PurePosixPath(urlsplit(url).path).suffix.lower() + return suffix in _FEED_URL_SUFFIXES + + +def _episode_urls(body: str) -> list[str] | None: + """The enclosure URLs in a feed body, deduped in document order; ``None`` when it + isn't a feed or carries no enclosures.""" + if not _FEED_ROOT_RE.search(body): + return None + urls = [*_rss_enclosure_urls(body), *_atom_enclosure_urls(body)] + deduped = list(dict.fromkeys(u for u in urls if u)) + return deduped or None + + +def _rss_enclosure_urls(body: str) -> list[str]: + """The ``url`` of every RSS ```` tag, HTML-unescaped.""" + return [ + html.unescape(match.group(1).strip()) + for attrs in _ENCLOSURE_TAG_RE.findall(body) + if (match := _URL_ATTR_RE.search(attrs)) is not None + ] + + +def _atom_enclosure_urls(body: str) -> list[str]: + """The ``href`` of every Atom ````, HTML-unescaped.""" + return [ + html.unescape(match.group(1).strip()) + for attrs in _LINK_TAG_RE.findall(body) + if _REL_ENCLOSURE_RE.search(attrs) is not None + and (match := _HREF_ATTR_RE.search(attrs)) is not None + ] + + +def _fetch(url: str) -> str | None: + """Up to ``_MAX_FEED_BYTES`` of `url` decoded as text, or ``None`` on any failure + or when the response is obviously binary media (audio/video/image).""" + import httpx2 as httpx + + chunks: list[bytes] = [] + try: + with ( + httpx.Client(timeout=_FETCH_TIMEOUT_SECONDS, follow_redirects=True) as client, + client.stream("GET", url) as response, + ): + if not response.is_success: + return None + content_type = response.headers.get("content-type", "").lower() + if content_type.startswith(("audio/", "video/", "image/")): + return None + total = 0 + for chunk in response.iter_bytes(): 
+ chunks.append(chunk) + total += len(chunk) + if total >= _MAX_FEED_BYTES: + break + except (httpx.HTTPError, OSError): + return None + return b"".join(chunks).decode("utf-8", "replace") diff --git a/aai_cli/app/transcribe/run.py b/aai_cli/app/transcribe/run.py index f02d8358..e26458bf 100644 --- a/aai_cli/app/transcribe/run.py +++ b/aai_cli/app/transcribe/run.py @@ -356,7 +356,12 @@ def run_transcribe(opts: TranscribeOptions, state: AppState, *, json_mode: bool) transcribe_validate.validate_speakers_expected(merged) sources = transcribe_sources.expand_sources( - opts.source, from_stdin=opts.from_stdin, sample=opts.sample + opts.source, + from_stdin=opts.from_stdin, + sample=opts.sample, + # --show-code must never touch the network; skip the feed probe and treat a + # URL as a single source for code generation. + detect_feeds=not opts.show_code, ) if sources is not None: transcribe_sources.reject_single_source_flags( diff --git a/aai_cli/app/transcribe/sources.py b/aai_cli/app/transcribe/sources.py index e2dc6e9b..bead0a25 100644 --- a/aai_cli/app/transcribe/sources.py +++ b/aai_cli/app/transcribe/sources.py @@ -49,13 +49,18 @@ _GLOB_CHARS = frozenset("*?[") -def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> list[str] | None: +def expand_sources( + source: str | None, *, from_stdin: bool, sample: bool, detect_feeds: bool = True +) -> list[str] | None: """The batch source list, or ``None`` when this is a single-source invocation. Batch mode triggers on ``--from-stdin``, a directory (scanned recursively for - audio files), a glob pattern that names no existing file, or a bucket URL - that is a glob or trailing-slash folder. A plain file, URL, ``-`` (audio - piped on stdin), or ``--sample`` stays on the single-source path. + audio files), a glob pattern that names no existing file, a bucket URL that is + a glob or trailing-slash folder, or an http(s) URL that turns out to be a + podcast RSS/Atom feed (each episode becomes one batch source). 
A plain file, + direct media URL, ``-`` (audio piped on stdin), or ``--sample`` stays on the + single-source path. ``detect_feeds=False`` skips the feed probe (and its + network fetch) for paths that must not touch the network, e.g. ``--show-code``. """ if from_stdin: return _stdin_sources(source, sample=sample) @@ -63,10 +68,22 @@ def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> lis # unset shell variable in `assembly transcribe "$FILE"`. `Path("")` is `Path(".")`, # so it would otherwise fall into the directory branch and batch-transcribe the # whole working directory; instead it stays single-source and fails validation. - if not source or sample or source == "-" or source.startswith(URL_PREFIXES): + if not source or sample or source == "-": return None + if source.startswith(URL_PREFIXES): + # A podcast feed URL expands into its episode enclosure URLs (batch mode); + # a direct media URL or ordinary page returns None and stays single-source. + from aai_cli.app.transcribe import feed + + return feed.feed_episode_urls(source) if detect_feeds else None if remotefs.is_remote_url(source): return _remote_sources(source) + return _local_sources(source) + + +def _local_sources(source: str) -> list[str] | None: + """Batch sources for a local path: a directory's audio files or a glob's matches, + else ``None`` (a single file, which the single-source path handles).""" path = Path(source) if path.is_dir(): return _directory_sources(path) diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py index 83f8365b..139a4f14 100644 --- a/aai_cli/commands/transcribe.py +++ b/aai_cli/commands/transcribe.py @@ -31,6 +31,10 @@ ("Try it with the hosted sample", "assembly transcribe --sample"), ("Transcribe a YouTube video", "assembly transcribe https://youtu.be/dtp6b76pMak"), ("Transcribe a podcast page", 'assembly transcribe "https://podcasts.apple.com/…"'), + ( + "Transcribe a whole podcast feed", + 'assembly transcribe 
"https://feeds.simplecast.com/…"', + ), ("Label who said what", "assembly transcribe call.mp3 --speaker-labels"), ("Redact PII for compliance", "assembly transcribe call.mp3 --redact-pii"), ("Summarize a recording", "assembly transcribe call.mp3 --summarization"), @@ -43,8 +47,8 @@ def transcribe( ctx: typer.Context, source: str | None = typer.Argument( None, - help="Audio file, URL, YouTube/podcast URL, bucket URL (s3://, gs://, …), or a " - "directory/glob (batch mode)", + help="Audio file, URL, YouTube/podcast URL, podcast RSS feed, bucket URL " + "(s3://, gs://, …), or a directory/glob (batch mode)", ), sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample"), # batch mode @@ -362,10 +366,11 @@ def transcribe( URLs (any page yt-dlp can extract) are downloaded first, then transcribed. Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to - transcribe many sources concurrently. Each source gets a .aai.json sidecar - with the full result (including any --llm responses), and a re-run skips - sources already transcribed — with changed --llm prompts it replays just - the LLM step, never a second transcription. + transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands + to batch mode — every episode enclosure becomes one source. Each source gets a + .aai.json sidecar with the full result (including any --llm responses), and a + re-run skips sources already transcribed — with changed --llm prompts it + replays just the LLM step, never a second transcription. Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for batches (a glob, or a folder ending in /); install the matching fsspec diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md index f0c2ec56..0c1f29c9 100644 --- a/aai_cli/skills/aai-cli/references/transcription.md +++ b/aai_cli/skills/aai-cli/references/transcription.md @@ -5,12 +5,14 @@ Five commands. 
All accept `--json` (auto-enabled when piped); `transcribe`, `transcribe`, `stream`, and `agent` accept `--show-code` to print equivalent Python SDK code without calling the API. -## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page +## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page / RSS feed `SOURCE` is a local file path, public URL, or a media-page URL yt-dlp can extract (YouTube, Apple Podcasts, Spreaker, SoundCloud, …) — those are downloaded first. -Use `--sample` for the hosted `wildfires.mp3`. Analysis results (summary, -chapters, sentiment, …) render automatically in human mode. +A podcast RSS/Atom feed URL expands into a resumable batch run over every episode +enclosure (one `.aai.json` sidecar apiece). Use `--sample` for the hosted +`wildfires.mp3`. Analysis results (summary, chapters, sentiment, …) render +automatically in human mode. High-value flags (run `assembly transcribe --help` for the full set): @@ -37,6 +39,7 @@ assembly transcribe --sample assembly transcribe call.mp3 --speaker-labels --speakers-expected 2 --redact-pii assembly transcribe call.mp3 -o text assembly transcribe call.mp3 --show-code +assembly transcribe "https://feeds.simplecast.com/54nAGcIl" # every episode in the feed ``` ## `assembly stream [SOURCE]` — live real-time transcription diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 5b19f1df..ef6cd589 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -869,10 +869,11 @@ URLs (any page yt-dlp can extract) are downloaded first, then transcribed. Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to - transcribe many sources concurrently. 
Each source gets a .aai.json sidecar - with the full result (including any --llm responses), and a re-run skips - sources already transcribed — with changed --llm prompts it replays just - the LLM step, never a second transcription. + transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands + to batch mode — every episode enclosure becomes one source. Each source gets a + .aai.json sidecar with the full result (including any --llm responses), and a + re-run skips sources already transcribed — with changed --llm prompts it + replays just the LLM step, never a second transcription. Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for batches (a glob, or a folder ending in /); install the matching fsspec @@ -883,8 +884,9 @@ mode. ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ - │ source [SOURCE] Audio file, URL, YouTube/podcast URL, bucket URL │ - │ (s3://, gs://, …), or a directory/glob (batch mode) │ + │ source [SOURCE] Audio file, URL, YouTube/podcast URL, podcast RSS │ + │ feed, bucket URL (s3://, gs://, …), or a │ + │ directory/glob (batch mode) │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --sample Use the hosted │ @@ -1052,10 +1054,11 @@ URLs (any page yt-dlp can extract) are downloaded first, then transcribed. Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to - transcribe many sources concurrently. Each source gets a .aai.json sidecar - with the full result (including any --llm responses), and a re-run skips - sources already transcribed — with changed --llm prompts it replays just - the LLM step, never a second transcription. + transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands + to batch mode — every episode enclosure becomes one source. 
Each source gets a + .aai.json sidecar with the full result (including any --llm responses), and a + re-run skips sources already transcribed — with changed --llm prompts it + replays just the LLM step, never a second transcription. Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for batches (a glob, or a folder ending in /); install the matching fsspec @@ -1066,8 +1069,9 @@ mode. ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ - │ source [SOURCE] Audio file, URL, YouTube/podcast URL, bucket URL │ - │ (s3://, gs://, …), or a directory/glob (batch mode) │ + │ source [SOURCE] Audio file, URL, YouTube/podcast URL, podcast RSS │ + │ feed, bucket URL (s3://, gs://, …), or a │ + │ directory/glob (batch mode) │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --sample Use the hosted │ @@ -1228,6 +1232,8 @@ $ assembly transcribe https://youtu.be/dtp6b76pMak Transcribe a podcast page $ assembly transcribe "https://podcasts.apple.com/…" + Transcribe a whole podcast feed + $ assembly transcribe "https://feeds.simplecast.com/…" Label who said what $ assembly transcribe call.mp3 --speaker-labels Redact PII for compliance diff --git a/tests/test_transcribe_feed.py b/tests/test_transcribe_feed.py new file mode 100644 index 00000000..73127bd5 --- /dev/null +++ b/tests/test_transcribe_feed.py @@ -0,0 +1,311 @@ +"""Podcast RSS/Atom feed expansion for `assembly transcribe`. + +A feed URL becomes a batch over its episode enclosures. These tests cover the +parser and fetcher in `app/transcribe/feed.py` directly, the `expand_sources` +seam that routes a feed URL into batch mode, and the end-to-end CLI run with the +network fetch faked (the suite is socket-blocked). 
+""" + +import json + +import httpx2 as httpx +import pytest +from typer.testing import CliRunner + +from aai_cli.app.transcribe import feed +from aai_cli.app.transcribe import sources as transcribe_sources +from aai_cli.core import config +from aai_cli.main import app + +runner = CliRunner() + +_TRANSCRIBE = "aai_cli.app.transcribe.run.client.transcribe" + +# A minimal but realistic RSS 2.0 podcast feed: two episodes, newest first, with +# an &-escaped query string on the second enclosure URL. +_RSS = """ + + + Example Show + + Episode 2 + + + + Episode 1 + + + + +""" + + +# --- parsing ------------------------------------------------------------------ + + +def test_episode_urls_parses_rss_in_feed_order_and_unescapes(): + assert feed._episode_urls(_RSS) == [ + "https://cdn.example.com/ep2.mp3", + "https://cdn.example.com/ep1.mp3?token=a&b=2", # & decoded, item order kept + ] + + +def test_episode_urls_parses_atom_enclosure_links_ignoring_other_links(): + atom = """ + + + + + """ + # rel/href in either order; the non-enclosure is skipped. + assert feed._episode_urls(atom) == ["https://cdn.example.com/a.mp3"] + + +def test_episode_urls_dedupes_preserving_order(): + rss = """ + + + + """ + assert feed._episode_urls(rss) == ["https://x/a.mp3", "https://x/b.mp3"] + + +def test_episode_urls_returns_none_when_not_a_feed(): + # An ordinary HTML page that merely contains the word enclosure is not a feed. 
+ html_page = '' + assert feed._episode_urls(html_page) is None + + +def test_episode_urls_returns_none_for_feed_without_enclosures(): + assert feed._episode_urls("No media") is None + + +def test_episode_urls_ignores_empty_url_attribute(): + assert feed._episode_urls('') is None + + +# --- media-extension gate ----------------------------------------------------- + + +@pytest.mark.parametrize( + ("url", "feed_shaped"), + [ + ("https://feeds.example.com/show.xml", True), + ("https://feeds.example.com/show.RSS?fmt=1", True), # case-insensitive, query ignored + ("https://feeds.example.com/feed.atom", True), + ("https://feeds.example.com/54nAGcIl", True), # extensionless feed + ("https://cdn.example.com/show/ep.mp3", False), # direct media — never probed + ("https://example.com/notes.txt", False), + ], +) +def test_looks_like_feed_url(url, feed_shaped): + assert feed._looks_like_feed_url(url) is feed_shaped + + +# --- feed_episode_urls (gate + fetch + parse) --------------------------------- + + +def test_feed_episode_urls_skips_direct_media_without_fetching(monkeypatch): + def _boom(url): + raise AssertionError("a direct media URL must not be fetched") + + monkeypatch.setattr(feed, "_fetch", _boom) + assert feed.feed_episode_urls("https://cdn.example.com/ep.mp3") is None + + +def test_feed_episode_urls_skips_ytdlp_page_without_fetching(monkeypatch): + def _boom(url): + raise AssertionError("a yt-dlp page URL must not be fetched") + + monkeypatch.setattr(feed, "_fetch", _boom) + assert feed.feed_episode_urls("https://youtu.be/abc") is None + + +def test_feed_episode_urls_returns_episodes_for_a_feed(monkeypatch): + monkeypatch.setattr(feed, "_fetch", lambda url: _RSS) + assert feed.feed_episode_urls("https://feeds.example.com/show") == [ + "https://cdn.example.com/ep2.mp3", + "https://cdn.example.com/ep1.mp3?token=a&b=2", + ] + + +def test_feed_episode_urls_returns_none_when_fetch_fails(monkeypatch): + monkeypatch.setattr(feed, "_fetch", lambda url: None) + assert 
feed.feed_episode_urls("https://feeds.example.com/show") is None + + +def test_feed_episode_urls_returns_none_for_non_feed_body(monkeypatch): + monkeypatch.setattr(feed, "_fetch", lambda url: "not a feed") + assert feed.feed_episode_urls("https://example.com/page") is None + + +# --- _fetch (httpx, faked offline) -------------------------------------------- + + +class _FakeStream: + def __init__(self, *, status=200, content_type="application/rss+xml", chunks=(b"",)): + self.status_code = status + self.is_success = 200 <= status < 300 # mirror httpx.Response.is_success + self.headers = {"content-type": content_type} + self._chunks = chunks + + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + + def iter_bytes(self): + yield from self._chunks + + +class _FakeClient: + def __init__(self, stream): + self._stream = stream + + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + + def stream(self, method, url): + return self._stream + + +def _patch_client(monkeypatch, stream): + monkeypatch.setattr(httpx, "Client", lambda **kwargs: _FakeClient(stream)) + + +def test_fetch_returns_decoded_body(monkeypatch): + _patch_client(monkeypatch, _FakeStream(chunks=(b"", b""))) + assert feed._fetch("https://feeds.example.com/show") == "" + + +def test_fetch_returns_none_on_http_error_status(monkeypatch): + _patch_client(monkeypatch, _FakeStream(status=404)) + assert feed._fetch("https://feeds.example.com/missing") is None + + +def test_fetch_treats_exactly_400_as_error(monkeypatch): + # The >= 400 boundary: a 400 is an error, not a body to parse. 
+ _patch_client(monkeypatch, _FakeStream(status=400)) + assert feed._fetch("https://feeds.example.com/bad") is None + + +@pytest.mark.parametrize("content_type", ["audio/mpeg", "video/mp4", "image/png"]) +def test_fetch_skips_binary_media_content_types(monkeypatch, content_type): + _patch_client(monkeypatch, _FakeStream(content_type=content_type, chunks=(b"\x00\x01",))) + assert feed._fetch("https://cdn.example.com/file") is None + + +def test_fetch_truncates_at_the_byte_cap(monkeypatch): + monkeypatch.setattr(feed, "_MAX_FEED_BYTES", 4) + _patch_client(monkeypatch, _FakeStream(chunks=(b"aaa", b"bbb", b"ccc"))) + # Reads until the running total reaches the cap, then stops — never the third chunk. + assert feed._fetch("https://feeds.example.com/big") == "aaabbb" + + +def test_fetch_returns_none_on_network_error(monkeypatch): + def _raise(**kwargs): + raise httpx.ConnectError("boom") + + monkeypatch.setattr(httpx, "Client", _raise) + assert feed._fetch("https://feeds.example.com/show") is None + + +# --- expand_sources seam ------------------------------------------------------ + + +def test_expand_sources_routes_feed_url_to_batch(monkeypatch): + monkeypatch.setattr( + feed, "feed_episode_urls", lambda url: ["https://x/a.mp3", "https://x/b.mp3"] + ) + assert transcribe_sources.expand_sources( + "https://feeds.example.com/show", from_stdin=False, sample=False + ) == ["https://x/a.mp3", "https://x/b.mp3"] + + +def test_expand_sources_skips_feed_probe_when_detect_feeds_false(monkeypatch): + def _boom(url): + raise AssertionError("feed detection must be skipped when detect_feeds is False") + + monkeypatch.setattr(feed, "feed_episode_urls", _boom) + assert ( + transcribe_sources.expand_sources( + "https://feeds.example.com/show", from_stdin=False, sample=False, detect_feeds=False + ) + is None + ) + + +# --- end-to-end CLI ----------------------------------------------------------- + + +def _auth(): + config.set_api_key("default", "sk_live") + + 
+@pytest.fixture(autouse=True) +def workdir(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + + +def _patch_transcribe(mocker, monkeypatch): + seen = [] + + def fake(api_key, audio, *, config): + seen.append(audio) + t = mocker.MagicMock() + t.id = f"t_{audio}" + t.text = f"text of {audio}" + t.status = "completed" + t.json_response = {"id": t.id, "text": t.text, "status": "completed"} + return t + + monkeypatch.setattr(_TRANSCRIBE, fake) + return seen + + +def test_transcribe_feed_url_batches_every_episode(mocker, monkeypatch): + _auth() + monkeypatch.setattr(feed, "_fetch", lambda url: _RSS) + seen = _patch_transcribe(mocker, monkeypatch) + result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--json"]) + assert result.exit_code == 0 + # Each episode enclosure was transcribed directly (the API fetches the URL — no + # yt-dlp download), and the feed XML itself was never sent as a source. Batch + # workers finish in any order, so compare as a set. + assert sorted(seen) == sorted( + [ + "https://cdn.example.com/ep2.mp3", + "https://cdn.example.com/ep1.mp3?token=a&b=2", + ] + ) + statuses = [json.loads(line)["status"] for line in result.output.splitlines()] + assert statuses == ["completed", "completed"] + + +def test_transcribe_non_feed_url_stays_single_source(mocker, monkeypatch): + # A direct audio URL is passed straight to the API, not expanded into a batch. 
+ _auth() + monkeypatch.setattr( + feed, "_fetch", lambda url: (_ for _ in ()).throw(AssertionError("no fetch")) + ) + seen = _patch_transcribe(mocker, monkeypatch) + result = runner.invoke(app, ["transcribe", "https://example.com/episode.mp3", "-o", "id"]) + assert result.exit_code == 0 + assert seen == ["https://example.com/episode.mp3"] + assert result.output.strip() == "t_https://example.com/episode.mp3" + + +def test_transcribe_feed_url_show_code_does_not_fetch(monkeypatch): + _auth() + + def _boom(url): + raise AssertionError("--show-code must not touch the network") + + monkeypatch.setattr(feed, "_fetch", _boom) + result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--show-code"]) + assert result.exit_code == 0 + assert "import assemblyai as aai" in result.output # generated SDK code, no network probe From 118d82303dee8475e5a61199c604d6dc9b67c59f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 22:12:38 +0000 Subject: [PATCH 2/3] Parse podcast feeds with feedparser, not a hand-rolled regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swap the regex-based RSS/Atom enclosure extraction in feed.py for `feedparser`, the de-facto standard feed parser — it handles the namespace, encoding, and malformed-markup edge cases a regex never will. The bounded, content-type-guarded httpx fetch stays the only network path: feedparser is handed the already-fetched bytes (never a URL), so it never fetches on its own. feedparser's result is untyped, so it's validated through a small pydantic model (the project pattern for untyped third-party returns — cf. core/wer.py), keeping feed.py strict-clean under mypy and pyright. Adds feedparser as a runtime dependency. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq --- aai_cli/app/transcribe/feed.py | 94 ++++++++++++++++------------------ pyproject.toml | 5 ++ tests/test_transcribe_feed.py | 14 ++++- uv.lock | 20 ++++++++ 4 files changed, 82 insertions(+), 51 deletions(-) diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py index 3b4df05a..96a018d2 100644 --- a/aai_cli/app/transcribe/feed.py +++ b/aai_cli/app/transcribe/feed.py @@ -1,28 +1,29 @@ """Podcast RSS/Atom feed expansion for ``assembly transcribe``. A feed URL names a whole show, so transcribing it means transcribing every -episode. ``feed_episode_urls`` fetches the URL and, when it parses as an RSS or -Atom feed carrying audio/video enclosures, returns those enclosure URLs (in feed -order — newest first) for the batch path to transcribe, one resumable sidecar per -episode. The enclosures are direct media URLs the API fetches itself, so — unlike -a YouTube or podcast *page*, which yt-dlp downloads first — no local download step -is needed. +episode. ``feed_episode_urls`` fetches the URL and, when ``feedparser`` recognizes +it as an RSS or Atom feed, returns its episode enclosure URLs (in feed order — +newest first) for the batch path to transcribe, one resumable sidecar per episode. +The enclosures are direct media URLs the API fetches itself, so — unlike a YouTube +or podcast *page*, which yt-dlp downloads first — no local download step is needed. Detection is deliberately narrow so a direct media URL or ordinary web page still falls through to the single-source path untouched (and is never fetched twice): only an http(s) URL whose path is feed-shaped — no extension, or one of ``.xml``/``.rss``/``.atom`` — and that no dedicated yt-dlp extractor already claims -is sniffed, the response body is bounded, and only content that actually parses as -a feed with at least one enclosure is treated as a feed. 
+is sniffed, the response body is bounded, and only content ``feedparser`` parses as +a real feed with at least one enclosure is treated as a feed. We hand ``feedparser`` +the already-fetched bytes (never the URL) so our bounded, safe fetch below stays the +only network path. """ from __future__ import annotations -import html -import re from pathlib import PurePosixPath from urllib.parse import urlsplit +from pydantic import BaseModel + from aai_cli.core import youtube # A feed lives at an extensionless URL (e.g. feeds.simplecast.com/) or a feed @@ -32,21 +33,28 @@ # Bound the download so a hostile or huge URL can't exhaust memory; 10 MB of feed # already holds thousands of episodes, far past any realistic batch. -_MAX_FEED_BYTES = 10 * 1024 * 1024 -_FETCH_TIMEOUT_SECONDS = 15.0 - -# A feed body must announce itself with an <rss> or <feed> root element -# (namespaced or not) before its <enclosure>s are trusted, so a stray HTML page -# that merely contains the word "enclosure" is never mistaken for a podcast. -_FEED_ROOT_RE = re.compile(r"<\s*(?:[\w.-]+:)?(?:rss|feed)\b", re.IGNORECASE) -# RSS 2.0 episodes: <enclosure url="..." .../>. The url -# attribute can sit anywhere in the tag and use either quote style. -_ENCLOSURE_TAG_RE = re.compile(r"<\s*enclosure\b([^>]*)>", re.IGNORECASE) -# Atom episodes: <link rel="enclosure" href="..."/> (rel/href in either order). 
-_LINK_TAG_RE = re.compile(r"<\s*link\b([^>]*)>", re.IGNORECASE) -_URL_ATTR_RE = re.compile(r"""\burl\s*=\s*["']([^"']+)["']""", re.IGNORECASE) -_HREF_ATTR_RE = re.compile(r"""\bhref\s*=\s*["']([^"']+)["']""", re.IGNORECASE) -_REL_ENCLOSURE_RE = re.compile(r"""\brel\s*=\s*["']enclosure["']""", re.IGNORECASE) +_MAX_FEED_BYTES = 10 * 1024 * 1024 # pragma: no mutate -- tuning knob, not behavior +_FETCH_TIMEOUT_SECONDS = 15.0 # pragma: no mutate -- tuning knob, not behavior + + +class _Enclosure(BaseModel): + """One ``<enclosure>`` / Atom enclosure link; ``href`` is the media URL.""" + + href: str = "" + + +class _Entry(BaseModel): + enclosures: list[_Enclosure] = [] + + +class _ParsedFeed(BaseModel): + """The slice of feedparser's untyped result we use, validated into a real type + (the project pattern for untyped third-party returns — cf. core/wer.py).""" + + # feedparser sets ``version`` to a non-empty id ("rss20", "atom10", …) for a + # recognized feed and to "" for anything it doesn't recognize as one. + version: str = "" + entries: list[_Entry] = [] def feed_episode_urls(url: str) -> list[str] | None: @@ -70,34 +78,22 @@ def _looks_like_feed_url(url: str) -> bool: def _episode_urls(body: str) -> list[str] | None: - """The enclosure URLs in a feed body, deduped in document order; ``None`` when it - isn't a feed or carries no enclosures.""" - if not _FEED_ROOT_RE.search(body): + """The enclosure URLs in a feed body, deduped in document order; ``None`` when + feedparser doesn't recognize it as a feed or it carries no enclosures.""" + import feedparser + + # feedparser ships only partial inline types (its parse signature is Unknown), + # so the result is validated through _ParsedFeed below; mirror remotefs.py's + # fsspec shim in ignoring the unavoidable unknown-member report on the call. 
+ raw = feedparser.parse(body) # pyright: ignore[reportUnknownMemberType] + parsed = _ParsedFeed.model_validate(raw) + if not parsed.version: return None - urls = [*_rss_enclosure_urls(body), *_atom_enclosure_urls(body)] - deduped = list(dict.fromkeys(u for u in urls if u)) + urls = [enc.href for entry in parsed.entries for enc in entry.enclosures if enc.href] + deduped = list(dict.fromkeys(urls)) return deduped or None -def _rss_enclosure_urls(body: str) -> list[str]: - """The ``url`` of every RSS ``<enclosure>`` tag, HTML-unescaped.""" - return [ - html.unescape(match.group(1).strip()) - for attrs in _ENCLOSURE_TAG_RE.findall(body) - if (match := _URL_ATTR_RE.search(attrs)) is not None - ] - - -def _atom_enclosure_urls(body: str) -> list[str]: - """The ``href`` of every Atom ``<link rel="enclosure">``, HTML-unescaped.""" - return [ - html.unescape(match.group(1).strip()) - for attrs in _LINK_TAG_RE.findall(body) - if _REL_ENCLOSURE_RE.search(attrs) is not None - and (match := _HREF_ATTR_RE.search(attrs)) is not None - ] - - def _fetch(url: str) -> str | None: """Up to ``_MAX_FEED_BYTES`` of `url` decoded as text, or ``None`` on any failure or when the response is obviously binary media (audio/video/image).""" diff --git a/pyproject.toml b/pyproject.toml index d614d42d..8740d822 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,11 @@ dependencies = [ # imported lazily). fsspec core only — each protocol's backend (s3fs, gcsfs, adlfs, # …) stays a user-installed extra surfaced via a clean install hint. "fsspec>=2026.4.0", + # Podcast RSS/Atom feed parsing for `assembly transcribe ` (feed.py, + # imported lazily). The de-facto standard feed parser; pure-Python, no compiled + # deps. We hand it already-fetched bytes (never a URL) so our bounded, safe + # httpx fetch stays the only network path. 
+ "feedparser>=6.0.11", ] [project.urls] diff --git a/tests/test_transcribe_feed.py b/tests/test_transcribe_feed.py index 73127bd5..5b3bf879 100644 --- a/tests/test_transcribe_feed.py +++ b/tests/test_transcribe_feed.py @@ -174,12 +174,22 @@ def stream(self, method, url): def _patch_client(monkeypatch, stream): - monkeypatch.setattr(httpx, "Client", lambda **kwargs: _FakeClient(stream)) + captured = {} + + def factory(**kwargs): + captured.update(kwargs) + return _FakeClient(stream) + + monkeypatch.setattr(httpx, "Client", factory) + return captured def test_fetch_returns_decoded_body(monkeypatch): - _patch_client(monkeypatch, _FakeStream(chunks=(b"", b""))) + captured = _patch_client(monkeypatch, _FakeStream(chunks=(b"", b""))) assert feed._fetch("https://feeds.example.com/show") == "" + # Feeds commonly 301/302 to a CDN, so redirects must be followed. + assert captured["follow_redirects"] is True + assert captured["timeout"] == feed._FETCH_TIMEOUT_SECONDS def test_fetch_returns_none_on_http_error_status(monkeypatch): diff --git a/uv.lock b/uv.lock index e817e19f..098244ab 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ source = { editable = "." 
} dependencies = [ { name = "assemblyai" }, { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "feedparser" }, { name = "fsspec" }, { name = "httpx2" }, { name = "jiwer" }, @@ -74,6 +75,7 @@ dev = [ requires-dist = [ { name = "assemblyai", specifier = ">=0.64.4" }, { name = "audioop-lts", marker = "python_full_version >= '3.13'", specifier = ">=0.2" }, + { name = "feedparser", specifier = ">=6.0.11" }, { name = "fsspec", specifier = ">=2026.4.0" }, { name = "httpx2", specifier = ">=2.0.0" }, { name = "jiwer", specifier = ">=4.0" }, @@ -674,6 +676,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/82/45359b62a067409bd929ae8a56b8ed13e5a8c8a61194b3c236920999ab83/fastapi-0.136.3-py3-none-any.whl", hash = "sha256:3d2a69bdf04b7e9f3afa292c3bc7a98816bbfafa10bc9b45f3f3700d2f761620", size = 117481, upload-time = "2026-05-23T18:53:16.924Z" }, ] +[[package]] +name = "feedparser" +version = "6.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sgmllib3k" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" }, +] + [[package]] name = "filelock" version = "3.29.0" @@ -1825,6 +1839,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/46/f5af3402b579fd5e11573ce652019a67074317e18c1935cc0b4ba9b35552/secretstorage-3.5.0-py3-none-any.whl", hash = "sha256:0ce65888c0725fcb2c5bc0fdb8e5438eece02c523557ea40ce0703c266248137", size = 15554, upload-time = 
"2025-11-23T19:02:51.545Z" }, ] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" } + [[package]] name = "shellingham" version = "1.5.4" From 73ae4b0f2afb940ff58a71421c768790268890e3 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 22:18:43 +0000 Subject: [PATCH 3/3] Use a typed default_factory for feed model list fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the `= []` field defaults on the pydantic feed models with `Field(default_factory=list[...])`. pydantic v2 already deep-copies mutable defaults per instance, so `= []` was not actually shared — but the explicit typed factory makes per-instance isolation obvious to readers and static analysis, while keeping the field's element type known under pyright strict (a bare `default_factory=list` infers `list[Unknown]`). 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq --- aai_cli/app/transcribe/feed.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py index 96a018d2..eb5a7dd1 100644 --- a/aai_cli/app/transcribe/feed.py +++ b/aai_cli/app/transcribe/feed.py @@ -22,7 +22,7 @@ from pathlib import PurePosixPath from urllib.parse import urlsplit -from pydantic import BaseModel +from pydantic import BaseModel, Field from aai_cli.core import youtube @@ -44,7 +44,9 @@ class _Enclosure(BaseModel): class _Entry(BaseModel): - enclosures: list[_Enclosure] = [] + # default_factory (not a shared `= []`) so each entry gets its own list, and the + # typed factory keeps the field's element type known under pyright strict. + enclosures: list[_Enclosure] = Field(default_factory=list[_Enclosure]) class _ParsedFeed(BaseModel): @@ -54,7 +56,7 @@ class _ParsedFeed(BaseModel): # feedparser sets ``version`` to a non-empty id ("rss20", "atom10", …) for a # recognized feed and to "" for anything it doesn't recognize as one. version: str = "" - entries: list[_Entry] = [] + entries: list[_Entry] = Field(default_factory=list[_Entry]) def feed_episode_urls(url: str) -> list[str] | None: