diff --git a/README.md b/README.md
index d9fa3e78..68693ada 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
[](https://github.com/AssemblyAI/cli/blob/main/LICENSE)
[](https://www.assemblyai.com/docs)
-The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, and YouTube/podcast pages, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps.
+The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, YouTube/podcast pages, and whole podcast RSS feeds, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps.
@@ -44,7 +44,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
| Command | What it does |
| :--- | :--- |
-| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs |
+| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, podcast RSS feeds, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs |
| `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too |
| `assembly dictate` | Push-to-talk dictation: press Enter to record, Enter again for instant text (Sync STT API, up to 120 s per utterance) |
| `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal |
@@ -285,11 +285,13 @@ assembly transcribe video.mp4 -o srt # captions
assembly transcribe call.mp3 --speaker-labels --summarization --json
```
-Transcribe in batches — a directory, a glob, or a piped list, resumable on re-run:
+Transcribe in batches — a directory, a glob, a piped list, or a whole podcast
+RSS feed (every episode becomes one source), resumable on re-run:
```sh
assembly transcribe ./recordings
assembly transcribe "s3://bucket/calls/*.mp3" # needs: pip install s3fs
+assembly transcribe "https://feeds.simplecast.com/54nAGcIl" # every episode in the feed
find . -name "*.wav" | assembly transcribe --from-stdin
```
diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py
new file mode 100644
index 00000000..eb5a7dd1
--- /dev/null
+++ b/aai_cli/app/transcribe/feed.py
@@ -0,0 +1,123 @@
+"""Podcast RSS/Atom feed expansion for ``assembly transcribe``.
+
+A feed URL names a whole show, so transcribing it means transcribing every
+episode. ``feed_episode_urls`` fetches the URL and, when ``feedparser`` recognizes
+it as an RSS or Atom feed, returns its episode enclosure URLs (in feed order —
+newest first) for the batch path to transcribe, one resumable sidecar per episode.
+The enclosures are direct media URLs the API fetches itself, so — unlike a YouTube
+or podcast *page*, which yt-dlp downloads first — no local download step is needed.
+
+Detection is deliberately narrow so a direct media URL or ordinary web page still
+falls through to the single-source path untouched (and is never fetched twice):
+only an http(s) URL whose path is feed-shaped — no extension, or one of
+``.xml``/``.rss``/``.atom`` — and that no dedicated yt-dlp extractor already claims
+is sniffed, the response body is bounded, and only content ``feedparser`` parses as
+a real feed with at least one enclosure is treated as a feed. We hand ``feedparser``
+the already-fetched bytes (never the URL) so our bounded, safe fetch below stays the
+only network path.
+"""
+
+from __future__ import annotations
+
+from pathlib import PurePosixPath
+from urllib.parse import urlsplit
+
+from pydantic import BaseModel, Field
+
+from aai_cli.core import youtube
+
+# A feed lives at an extensionless URL (e.g. feeds.simplecast.com/54nAGcIl) or a feed
+# document (.xml/.rss/.atom). Every other path — .mp3, .txt, .pdf — is never a feed,
+# so it is left for the single-source path and never fetched here.
+_FEED_URL_SUFFIXES = frozenset({"", ".xml", ".rss", ".atom"})
+
+# Bound the download so a hostile or huge URL can't exhaust memory; 10 MB of feed
+# already holds thousands of episodes, far past any realistic batch.
+_MAX_FEED_BYTES = 10 * 1024 * 1024 # pragma: no mutate -- tuning knob, not behavior
+_FETCH_TIMEOUT_SECONDS = 15.0 # pragma: no mutate -- tuning knob, not behavior
+
+
+class _Enclosure(BaseModel):
+    """One RSS ``<enclosure>`` / Atom enclosure link; ``href`` is the media URL."""
+
+    # Defaults to "" so an enclosure without a URL still validates; _episode_urls
+    # filters out empty hrefs.
+    href: str = ""
+
+
+class _Entry(BaseModel):
+    """One feed entry; only its list of enclosures is kept."""
+
+    # default_factory (not a shared `= []`) so each entry gets its own list, and the
+    # typed factory keeps the field's element type known under pyright strict.
+    enclosures: list[_Enclosure] = Field(default_factory=list[_Enclosure])
+
+
+class _ParsedFeed(BaseModel):
+    """The slice of feedparser's untyped result we use, validated into a real type
+    (the project pattern for untyped third-party returns — cf. core/wer.py)."""
+
+    # feedparser sets ``version`` to a non-empty id ("rss20", "atom10", …) for a
+    # recognized feed and to "" for anything it doesn't recognize as one.
+    version: str = ""
+    # The feed's entries; defaults to an empty list when the result carries none.
+    entries: list[_Entry] = Field(default_factory=list[_Entry])
+
+
+def feed_episode_urls(url: str) -> list[str] | None:
+    """Expand a podcast feed URL into its episode media URLs.
+
+    The result is ``None`` — meaning "treat `url` as a single source" — when the
+    URL path is not feed-shaped, when ``youtube.is_downloadable_url`` claims the
+    URL (a yt-dlp page), when the fetch fails, or when the fetched body is not a
+    feed carrying enclosures.
+    """
+    # Both gates run before any network access, in this order.
+    if not _looks_like_feed_url(url):
+        return None
+    if youtube.is_downloadable_url(url):
+        return None
+    fetched = _fetch(url)
+    return None if fetched is None else _episode_urls(fetched)
+
+
+def _looks_like_feed_url(url: str) -> bool:
+    """Whether the URL's path is feed-shaped: no extension, or a feed document's.
+
+    Only the path component is inspected, so a query string never affects the
+    result, and the extension is compared case-insensitively.
+    """
+    url_path = PurePosixPath(urlsplit(url).path)
+    return url_path.suffix.lower() in _FEED_URL_SUFFIXES
+
+
+def _episode_urls(body: str) -> list[str] | None:
+    """The enclosure URLs in a feed body, deduped in document order.
+
+    Returns ``None`` when feedparser doesn't recognize `body` as a feed (empty
+    ``version``) or when the feed carries no non-empty enclosure URL.
+    """
+    import io
+
+    import feedparser
+
+    # Hand feedparser a stream, never the bare text: ``parse`` treats a plain string
+    # as "URL, filename, or data", so a response body that is itself a URL or an
+    # existing file path would be fetched/opened by feedparser — an unbounded second
+    # network path (or a local file read) chosen by the remote server. An object
+    # with ``read()`` is only ever read.
+    stream = io.BytesIO(body.encode("utf-8", "replace"))
+    # feedparser ships only partial inline types (its parse signature is Unknown),
+    # so the result is validated through _ParsedFeed below; mirror remotefs.py's
+    # fsspec shim in ignoring the unavoidable unknown-member report on the call.
+    raw = feedparser.parse(stream)  # pyright: ignore[reportUnknownMemberType]
+    parsed = _ParsedFeed.model_validate(raw)
+    if not parsed.version:
+        return None
+    urls = [enc.href for entry in parsed.entries for enc in entry.enclosures if enc.href]
+    # dict.fromkeys keeps first-seen order while dropping repeated URLs.
+    deduped = list(dict.fromkeys(urls))
+    return deduped or None
+
+
+def _fetch(url: str) -> str | None:
+    """Fetch `url` as text, reading until ``_MAX_FEED_BYTES`` is reached.
+
+    Returns ``None`` for a non-success status, for an audio/video/image content
+    type, or when the request raises ``httpx.HTTPError`` or ``OSError``. Bytes that
+    are not valid UTF-8 are replaced rather than raising.
+    """
+    import httpx2 as httpx
+
+    received = bytearray()
+    try:
+        with (
+            httpx.Client(timeout=_FETCH_TIMEOUT_SECONDS, follow_redirects=True) as client,
+            client.stream("GET", url) as response,
+        ):
+            if not response.is_success:
+                return None
+            media_type = response.headers.get("content-type", "").lower()
+            if media_type.startswith(("audio/", "video/", "image/")):
+                return None
+            for piece in response.iter_bytes():
+                received += piece
+                # Checked after appending, so the last chunk read may carry the
+                # total past the cap; reading simply stops there.
+                if len(received) >= _MAX_FEED_BYTES:
+                    break
+    except (httpx.HTTPError, OSError):
+        return None
+    return received.decode("utf-8", "replace")
diff --git a/aai_cli/app/transcribe/run.py b/aai_cli/app/transcribe/run.py
index f02d8358..e26458bf 100644
--- a/aai_cli/app/transcribe/run.py
+++ b/aai_cli/app/transcribe/run.py
@@ -356,7 +356,12 @@ def run_transcribe(opts: TranscribeOptions, state: AppState, *, json_mode: bool)
transcribe_validate.validate_speakers_expected(merged)
sources = transcribe_sources.expand_sources(
- opts.source, from_stdin=opts.from_stdin, sample=opts.sample
+ opts.source,
+ from_stdin=opts.from_stdin,
+ sample=opts.sample,
+ # --show-code must never touch the network; skip the feed probe and treat a
+ # URL as a single source for code generation.
+ detect_feeds=not opts.show_code,
)
if sources is not None:
transcribe_sources.reject_single_source_flags(
diff --git a/aai_cli/app/transcribe/sources.py b/aai_cli/app/transcribe/sources.py
index e2dc6e9b..bead0a25 100644
--- a/aai_cli/app/transcribe/sources.py
+++ b/aai_cli/app/transcribe/sources.py
@@ -49,13 +49,18 @@
_GLOB_CHARS = frozenset("*?[")
-def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> list[str] | None:
+def expand_sources(
+ source: str | None, *, from_stdin: bool, sample: bool, detect_feeds: bool = True
+) -> list[str] | None:
"""The batch source list, or ``None`` when this is a single-source invocation.
Batch mode triggers on ``--from-stdin``, a directory (scanned recursively for
- audio files), a glob pattern that names no existing file, or a bucket URL
- that is a glob or trailing-slash folder. A plain file, URL, ``-`` (audio
- piped on stdin), or ``--sample`` stays on the single-source path.
+ audio files), a glob pattern that names no existing file, a bucket URL that is
+ a glob or trailing-slash folder, or an http(s) URL that turns out to be a
+ podcast RSS/Atom feed (each episode becomes one batch source). A plain file,
+ direct media URL, ``-`` (audio piped on stdin), or ``--sample`` stays on the
+ single-source path. ``detect_feeds=False`` skips the feed probe (and its
+ network fetch) for paths that must not touch the network, e.g. ``--show-code``.
"""
if from_stdin:
return _stdin_sources(source, sample=sample)
@@ -63,10 +68,22 @@ def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> lis
# unset shell variable in `assembly transcribe "$FILE"`. `Path("")` is `Path(".")`,
# so it would otherwise fall into the directory branch and batch-transcribe the
# whole working directory; instead it stays single-source and fails validation.
- if not source or sample or source == "-" or source.startswith(URL_PREFIXES):
+ if not source or sample or source == "-":
return None
+ if source.startswith(URL_PREFIXES):
+ # A podcast feed URL expands into its episode enclosure URLs (batch mode);
+ # a direct media URL or ordinary page returns None and stays single-source.
+ from aai_cli.app.transcribe import feed
+
+ return feed.feed_episode_urls(source) if detect_feeds else None
if remotefs.is_remote_url(source):
return _remote_sources(source)
+ return _local_sources(source)
+
+
+def _local_sources(source: str) -> list[str] | None:
+ """Batch sources for a local path: a directory's audio files or a glob's matches,
+ else ``None`` (a single file, which the single-source path handles)."""
path = Path(source)
if path.is_dir():
return _directory_sources(path)
diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py
index 83f8365b..139a4f14 100644
--- a/aai_cli/commands/transcribe.py
+++ b/aai_cli/commands/transcribe.py
@@ -31,6 +31,10 @@
("Try it with the hosted sample", "assembly transcribe --sample"),
("Transcribe a YouTube video", "assembly transcribe https://youtu.be/dtp6b76pMak"),
("Transcribe a podcast page", 'assembly transcribe "https://podcasts.apple.com/…"'),
+ (
+ "Transcribe a whole podcast feed",
+ 'assembly transcribe "https://feeds.simplecast.com/…"',
+ ),
("Label who said what", "assembly transcribe call.mp3 --speaker-labels"),
("Redact PII for compliance", "assembly transcribe call.mp3 --redact-pii"),
("Summarize a recording", "assembly transcribe call.mp3 --summarization"),
@@ -43,8 +47,8 @@ def transcribe(
ctx: typer.Context,
source: str | None = typer.Argument(
None,
- help="Audio file, URL, YouTube/podcast URL, bucket URL (s3://, gs://, …), or a "
- "directory/glob (batch mode)",
+ help="Audio file, URL, YouTube/podcast URL, podcast RSS feed, bucket URL "
+ "(s3://, gs://, …), or a directory/glob (batch mode)",
),
sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample"),
# batch mode
@@ -362,10 +366,11 @@ def transcribe(
URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
- transcribe many sources concurrently. Each source gets a .aai.json sidecar
- with the full result (including any --llm responses), and a re-run skips
- sources already transcribed — with changed --llm prompts it replays just
- the LLM step, never a second transcription.
+ transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+ to batch mode — every episode enclosure becomes one source. Each source gets a
+ .aai.json sidecar with the full result (including any --llm responses), and a
+ re-run skips sources already transcribed — with changed --llm prompts it
+ replays just the LLM step, never a second transcription.
Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
batches (a glob, or a folder ending in /); install the matching fsspec
diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md
index f0c2ec56..0c1f29c9 100644
--- a/aai_cli/skills/aai-cli/references/transcription.md
+++ b/aai_cli/skills/aai-cli/references/transcription.md
@@ -5,12 +5,14 @@ Five commands. All accept `--json` (auto-enabled when piped); `transcribe`,
`transcribe`, `stream`, and `agent` accept `--show-code` to print equivalent
Python SDK code without calling the API.
-## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page
+## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page / RSS feed
`SOURCE` is a local file path, public URL, or a media-page URL yt-dlp can extract
(YouTube, Apple Podcasts, Spreaker, SoundCloud, …) — those are downloaded first.
-Use `--sample` for the hosted `wildfires.mp3`. Analysis results (summary,
-chapters, sentiment, …) render automatically in human mode.
+A podcast RSS/Atom feed URL expands into a resumable batch run over every episode
+enclosure (one `.aai.json` sidecar apiece). Use `--sample` for the hosted
+`wildfires.mp3`. Analysis results (summary, chapters, sentiment, …) render
+automatically in human mode.
High-value flags (run `assembly transcribe --help` for the full set):
@@ -37,6 +39,7 @@ assembly transcribe --sample
assembly transcribe call.mp3 --speaker-labels --speakers-expected 2 --redact-pii
assembly transcribe call.mp3 -o text
assembly transcribe call.mp3 --show-code
+assembly transcribe "https://feeds.simplecast.com/54nAGcIl" # every episode in the feed
```
## `assembly stream [SOURCE]` — live real-time transcription
diff --git a/pyproject.toml b/pyproject.toml
index d614d42d..8740d822 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,11 @@ dependencies = [
# imported lazily). fsspec core only — each protocol's backend (s3fs, gcsfs, adlfs,
# …) stays a user-installed extra surfaced via a clean install hint.
"fsspec>=2026.4.0",
+ # Podcast RSS/Atom feed parsing for `assembly transcribe <feed-url>` (feed.py,
+ # imported lazily). The de-facto standard feed parser; pure-Python, no compiled
+ # deps. We hand it already-fetched bytes (never a URL) so our bounded, safe
+ # httpx fetch stays the only network path.
+ "feedparser>=6.0.11",
]
[project.urls]
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index ffc0ae1a..7719a9bf 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -893,10 +893,11 @@
URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
- transcribe many sources concurrently. Each source gets a .aai.json sidecar
- with the full result (including any --llm responses), and a re-run skips
- sources already transcribed — with changed --llm prompts it replays just
- the LLM step, never a second transcription.
+ transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+ to batch mode — every episode enclosure becomes one source. Each source gets a
+ .aai.json sidecar with the full result (including any --llm responses), and a
+ re-run skips sources already transcribed — with changed --llm prompts it
+ replays just the LLM step, never a second transcription.
Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
batches (a glob, or a folder ending in /); install the matching fsspec
@@ -907,8 +908,9 @@
mode.
╭─ Arguments ──────────────────────────────────────────────────────────────────╮
- │ source [SOURCE] Audio file, URL, YouTube/podcast URL, bucket URL │
- │ (s3://, gs://, …), or a directory/glob (batch mode) │
+ │ source [SOURCE] Audio file, URL, YouTube/podcast URL, podcast RSS │
+ │ feed, bucket URL (s3://, gs://, …), or a │
+ │ directory/glob (batch mode) │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --sample Use the hosted │
@@ -1076,10 +1078,11 @@
URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
- transcribe many sources concurrently. Each source gets a .aai.json sidecar
- with the full result (including any --llm responses), and a re-run skips
- sources already transcribed — with changed --llm prompts it replays just
- the LLM step, never a second transcription.
+ transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+ to batch mode — every episode enclosure becomes one source. Each source gets a
+ .aai.json sidecar with the full result (including any --llm responses), and a
+ re-run skips sources already transcribed — with changed --llm prompts it
+ replays just the LLM step, never a second transcription.
Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
batches (a glob, or a folder ending in /); install the matching fsspec
@@ -1090,8 +1093,9 @@
mode.
╭─ Arguments ──────────────────────────────────────────────────────────────────╮
- │ source [SOURCE] Audio file, URL, YouTube/podcast URL, bucket URL │
- │ (s3://, gs://, …), or a directory/glob (batch mode) │
+ │ source [SOURCE] Audio file, URL, YouTube/podcast URL, podcast RSS │
+ │ feed, bucket URL (s3://, gs://, …), or a │
+ │ directory/glob (batch mode) │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --sample Use the hosted │
@@ -1252,6 +1256,8 @@
$ assembly transcribe https://youtu.be/dtp6b76pMak
Transcribe a podcast page
$ assembly transcribe "https://podcasts.apple.com/…"
+ Transcribe a whole podcast feed
+ $ assembly transcribe "https://feeds.simplecast.com/…"
Label who said what
$ assembly transcribe call.mp3 --speaker-labels
Redact PII for compliance
diff --git a/tests/test_transcribe_feed.py b/tests/test_transcribe_feed.py
new file mode 100644
index 00000000..5b3bf879
--- /dev/null
+++ b/tests/test_transcribe_feed.py
@@ -0,0 +1,321 @@
+"""Podcast RSS/Atom feed expansion for `assembly transcribe`.
+
+A feed URL becomes a batch over its episode enclosures. These tests cover the
+parser and fetcher in `app/transcribe/feed.py` directly, the `expand_sources`
+seam that routes a feed URL into batch mode, and the end-to-end CLI run with the
+network fetch faked (the suite is socket-blocked).
+"""
+
+import json
+
+import httpx2 as httpx
+import pytest
+from typer.testing import CliRunner
+
+from aai_cli.app.transcribe import feed
+from aai_cli.app.transcribe import sources as transcribe_sources
+from aai_cli.core import config
+from aai_cli.main import app
+
+runner = CliRunner()
+
+_TRANSCRIBE = "aai_cli.app.transcribe.run.client.transcribe"
+
+# A minimal but realistic RSS 2.0 podcast feed: two episodes, newest first, with
+# an &-escaped query string on the second enclosure URL.
+_RSS = """
+
+
+ Example Show
+
+ Episode 2
+
+
+
+ Episode 1
+
+
+
+
+"""
+
+
+# --- parsing ------------------------------------------------------------------
+
+
+def test_episode_urls_parses_rss_in_feed_order_and_unescapes():
+ assert feed._episode_urls(_RSS) == [
+ "https://cdn.example.com/ep2.mp3",
+ "https://cdn.example.com/ep1.mp3?token=a&b=2", # & decoded, item order kept
+ ]
+
+
+def test_episode_urls_parses_atom_enclosure_links_ignoring_other_links():
+ atom = """
+
+
+
+
+ """
+ # rel/href in either order; the non-enclosure is skipped.
+ assert feed._episode_urls(atom) == ["https://cdn.example.com/a.mp3"]
+
+
+def test_episode_urls_dedupes_preserving_order():
+ rss = """
+
+
+
+ """
+ assert feed._episode_urls(rss) == ["https://x/a.mp3", "https://x/b.mp3"]
+
+
+def test_episode_urls_returns_none_when_not_a_feed():
+ # An ordinary HTML page that merely contains the word enclosure is not a feed.
+ html_page = ''
+ assert feed._episode_urls(html_page) is None
+
+
+def test_episode_urls_returns_none_for_feed_without_enclosures():
+ assert feed._episode_urls("No media") is None
+
+
+def test_episode_urls_ignores_empty_url_attribute():
+ assert feed._episode_urls('') is None
+
+
+# --- media-extension gate -----------------------------------------------------
+
+
+@pytest.mark.parametrize(
+ ("url", "feed_shaped"),
+ [
+ ("https://feeds.example.com/show.xml", True),
+ ("https://feeds.example.com/show.RSS?fmt=1", True), # case-insensitive, query ignored
+ ("https://feeds.example.com/feed.atom", True),
+ ("https://feeds.example.com/54nAGcIl", True), # extensionless feed
+ ("https://cdn.example.com/show/ep.mp3", False), # direct media — never probed
+ ("https://example.com/notes.txt", False),
+ ],
+)
+def test_looks_like_feed_url(url, feed_shaped):
+ assert feed._looks_like_feed_url(url) is feed_shaped
+
+
+# --- feed_episode_urls (gate + fetch + parse) ---------------------------------
+
+
+def test_feed_episode_urls_skips_direct_media_without_fetching(monkeypatch):
+ def _boom(url):
+ raise AssertionError("a direct media URL must not be fetched")
+
+ monkeypatch.setattr(feed, "_fetch", _boom)
+ assert feed.feed_episode_urls("https://cdn.example.com/ep.mp3") is None
+
+
+def test_feed_episode_urls_skips_ytdlp_page_without_fetching(monkeypatch):
+ def _boom(url):
+ raise AssertionError("a yt-dlp page URL must not be fetched")
+
+ monkeypatch.setattr(feed, "_fetch", _boom)
+ assert feed.feed_episode_urls("https://youtu.be/abc") is None
+
+
+def test_feed_episode_urls_returns_episodes_for_a_feed(monkeypatch):
+ monkeypatch.setattr(feed, "_fetch", lambda url: _RSS)
+ assert feed.feed_episode_urls("https://feeds.example.com/show") == [
+ "https://cdn.example.com/ep2.mp3",
+ "https://cdn.example.com/ep1.mp3?token=a&b=2",
+ ]
+
+
+def test_feed_episode_urls_returns_none_when_fetch_fails(monkeypatch):
+ monkeypatch.setattr(feed, "_fetch", lambda url: None)
+ assert feed.feed_episode_urls("https://feeds.example.com/show") is None
+
+
+def test_feed_episode_urls_returns_none_for_non_feed_body(monkeypatch):
+ monkeypatch.setattr(feed, "_fetch", lambda url: "not a feed")
+ assert feed.feed_episode_urls("https://example.com/page") is None
+
+
+# --- _fetch (httpx, faked offline) --------------------------------------------
+
+
+class _FakeStream:
+ def __init__(self, *, status=200, content_type="application/rss+xml", chunks=(b"",)):
+ self.status_code = status
+ self.is_success = 200 <= status < 300 # mirror httpx.Response.is_success
+ self.headers = {"content-type": content_type}
+ self._chunks = chunks
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *exc):
+ return False
+
+ def iter_bytes(self):
+ yield from self._chunks
+
+
+class _FakeClient:
+ def __init__(self, stream):
+ self._stream = stream
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *exc):
+ return False
+
+ def stream(self, method, url):
+ return self._stream
+
+
+def _patch_client(monkeypatch, stream):
+ captured = {}
+
+ def factory(**kwargs):
+ captured.update(kwargs)
+ return _FakeClient(stream)
+
+ monkeypatch.setattr(httpx, "Client", factory)
+ return captured
+
+
+def test_fetch_returns_decoded_body(monkeypatch):
+ captured = _patch_client(monkeypatch, _FakeStream(chunks=(b"", b"")))
+ assert feed._fetch("https://feeds.example.com/show") == ""
+ # Feeds commonly 301/302 to a CDN, so redirects must be followed.
+ assert captured["follow_redirects"] is True
+ assert captured["timeout"] == feed._FETCH_TIMEOUT_SECONDS
+
+
+def test_fetch_returns_none_on_http_error_status(monkeypatch):
+ _patch_client(monkeypatch, _FakeStream(status=404))
+ assert feed._fetch("https://feeds.example.com/missing") is None
+
+
+def test_fetch_treats_exactly_400_as_error(monkeypatch):
+    # _fetch rejects every non-success status via `is_success`; 400, the lowest
+    # client-error code, must be treated as an error, not a body to parse.
+    _patch_client(monkeypatch, _FakeStream(status=400))
+    assert feed._fetch("https://feeds.example.com/bad") is None
+
+
+@pytest.mark.parametrize("content_type", ["audio/mpeg", "video/mp4", "image/png"])
+def test_fetch_skips_binary_media_content_types(monkeypatch, content_type):
+ _patch_client(monkeypatch, _FakeStream(content_type=content_type, chunks=(b"\x00\x01",)))
+ assert feed._fetch("https://cdn.example.com/file") is None
+
+
+def test_fetch_truncates_at_the_byte_cap(monkeypatch):
+ monkeypatch.setattr(feed, "_MAX_FEED_BYTES", 4)
+ _patch_client(monkeypatch, _FakeStream(chunks=(b"aaa", b"bbb", b"ccc")))
+ # Reads until the running total reaches the cap, then stops — never the third chunk.
+ assert feed._fetch("https://feeds.example.com/big") == "aaabbb"
+
+
+def test_fetch_returns_none_on_network_error(monkeypatch):
+ def _raise(**kwargs):
+ raise httpx.ConnectError("boom")
+
+ monkeypatch.setattr(httpx, "Client", _raise)
+ assert feed._fetch("https://feeds.example.com/show") is None
+
+
+# --- expand_sources seam ------------------------------------------------------
+
+
+def test_expand_sources_routes_feed_url_to_batch(monkeypatch):
+ monkeypatch.setattr(
+ feed, "feed_episode_urls", lambda url: ["https://x/a.mp3", "https://x/b.mp3"]
+ )
+ assert transcribe_sources.expand_sources(
+ "https://feeds.example.com/show", from_stdin=False, sample=False
+ ) == ["https://x/a.mp3", "https://x/b.mp3"]
+
+
+def test_expand_sources_skips_feed_probe_when_detect_feeds_false(monkeypatch):
+ def _boom(url):
+ raise AssertionError("feed detection must be skipped when detect_feeds is False")
+
+ monkeypatch.setattr(feed, "feed_episode_urls", _boom)
+ assert (
+ transcribe_sources.expand_sources(
+ "https://feeds.example.com/show", from_stdin=False, sample=False, detect_feeds=False
+ )
+ is None
+ )
+
+
+# --- end-to-end CLI -----------------------------------------------------------
+
+
+def _auth():
+ config.set_api_key("default", "sk_live")
+
+
+@pytest.fixture(autouse=True)
+def workdir(tmp_path, monkeypatch):
+ monkeypatch.chdir(tmp_path)
+
+
+def _patch_transcribe(mocker, monkeypatch):
+ seen = []
+
+ def fake(api_key, audio, *, config):
+ seen.append(audio)
+ t = mocker.MagicMock()
+ t.id = f"t_{audio}"
+ t.text = f"text of {audio}"
+ t.status = "completed"
+ t.json_response = {"id": t.id, "text": t.text, "status": "completed"}
+ return t
+
+ monkeypatch.setattr(_TRANSCRIBE, fake)
+ return seen
+
+
+def test_transcribe_feed_url_batches_every_episode(mocker, monkeypatch):
+    _auth()
+    monkeypatch.setattr(feed, "_fetch", lambda url: _RSS)
+    seen = _patch_transcribe(mocker, monkeypatch)
+    result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--json"])
+    assert result.exit_code == 0
+    # Each episode enclosure was transcribed directly (the API fetches the URL — no
+    # yt-dlp download), and the feed XML itself was never sent as a source. Batch
+    # workers finish in any order, so both sides are sorted before comparing.
+    assert sorted(seen) == sorted(
+        [
+            "https://cdn.example.com/ep2.mp3",
+            "https://cdn.example.com/ep1.mp3?token=a&b=2",
+        ]
+    )
+    statuses = [json.loads(line)["status"] for line in result.output.splitlines()]
+    assert statuses == ["completed", "completed"]
+
+
+def test_transcribe_non_feed_url_stays_single_source(mocker, monkeypatch):
+ # A direct audio URL is passed straight to the API, not expanded into a batch.
+ _auth()
+ monkeypatch.setattr(
+ feed, "_fetch", lambda url: (_ for _ in ()).throw(AssertionError("no fetch"))
+ )
+ seen = _patch_transcribe(mocker, monkeypatch)
+ result = runner.invoke(app, ["transcribe", "https://example.com/episode.mp3", "-o", "id"])
+ assert result.exit_code == 0
+ assert seen == ["https://example.com/episode.mp3"]
+ assert result.output.strip() == "t_https://example.com/episode.mp3"
+
+
+def test_transcribe_feed_url_show_code_does_not_fetch(monkeypatch):
+ _auth()
+
+ def _boom(url):
+ raise AssertionError("--show-code must not touch the network")
+
+ monkeypatch.setattr(feed, "_fetch", _boom)
+ result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--show-code"])
+ assert result.exit_code == 0
+ assert "import assemblyai as aai" in result.output # generated SDK code, no network probe
diff --git a/uv.lock b/uv.lock
index e817e19f..098244ab 100644
--- a/uv.lock
+++ b/uv.lock
@@ -22,6 +22,7 @@ source = { editable = "." }
dependencies = [
{ name = "assemblyai" },
{ name = "audioop-lts", marker = "python_full_version >= '3.13'" },
+ { name = "feedparser" },
{ name = "fsspec" },
{ name = "httpx2" },
{ name = "jiwer" },
@@ -74,6 +75,7 @@ dev = [
requires-dist = [
{ name = "assemblyai", specifier = ">=0.64.4" },
{ name = "audioop-lts", marker = "python_full_version >= '3.13'", specifier = ">=0.2" },
+ { name = "feedparser", specifier = ">=6.0.11" },
{ name = "fsspec", specifier = ">=2026.4.0" },
{ name = "httpx2", specifier = ">=2.0.0" },
{ name = "jiwer", specifier = ">=4.0" },
@@ -674,6 +676,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e0/82/45359b62a067409bd929ae8a56b8ed13e5a8c8a61194b3c236920999ab83/fastapi-0.136.3-py3-none-any.whl", hash = "sha256:3d2a69bdf04b7e9f3afa292c3bc7a98816bbfafa10bc9b45f3f3700d2f761620", size = 117481, upload-time = "2026-05-23T18:53:16.924Z" },
]
+[[package]]
+name = "feedparser"
+version = "6.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "sgmllib3k" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" },
+]
+
[[package]]
name = "filelock"
version = "3.29.0"
@@ -1825,6 +1839,12 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/46/f5af3402b579fd5e11573ce652019a67074317e18c1935cc0b4ba9b35552/secretstorage-3.5.0-py3-none-any.whl", hash = "sha256:0ce65888c0725fcb2c5bc0fdb8e5438eece02c523557ea40ce0703c266248137", size = 15554, upload-time = "2025-11-23T19:02:51.545Z" },
]
+[[package]]
+name = "sgmllib3k"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
+
[[package]]
name = "shellingham"
version = "1.5.4"