From 910a294f8dde00fca94219e64b9231155c91d88a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 21:27:46 +0000
Subject: [PATCH 1/3] Transcribe a podcast RSS feed as a resumable batch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`assembly transcribe <feed-url>` now expands a podcast RSS/Atom feed into its
episode enclosure URLs and runs them through the existing batch path — one
`.aai.json` sidecar per episode, resumable, concurrent, and compatible with
`--llm`/`--llm-reduce`. The enclosures are direct media URLs the API fetches
itself, so no per-episode yt-dlp download is needed (unlike a podcast *page*).

Detection is deliberately narrow to avoid surprise fetches. A URL is probed
only when it is http(s), has a feed-shaped path (extensionless or
`.xml`/`.rss`/`.atom`), and is not already claimed by a dedicated yt-dlp
extractor. The response body is bounded to roughly 10 MB, binary media content
types are skipped, and only content that actually parses as a feed with at
least one enclosure is treated as one. `--show-code` skips the probe entirely
so it never touches the network.

Docs (README, transcribe help/docstring, aai-cli skill reference) updated to
list RSS feeds alongside files, URLs, and YouTube/podcast pages.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq
---
 README.md                                     |   8 +-
 aai_cli/app/transcribe/feed.py                | 125 +++++++
 aai_cli/app/transcribe/run.py                 |   7 +-
 aai_cli/app/transcribe/sources.py             |  27 +-
 aai_cli/commands/transcribe.py                |  17 +-
 .../aai-cli/references/transcription.md       |   9 +-
 .../test_snapshots_help_run.ambr              |  30 +-
 tests/test_transcribe_feed.py                 | 311 ++++++++++++++++++
 8 files changed, 504 insertions(+), 30 deletions(-)
 create mode 100644 aai_cli/app/transcribe/feed.py
 create mode 100644 tests/test_transcribe_feed.py
diff --git a/README.md b/README.md
index 242008a8..8d89f8de 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![License](https://img.shields.io/badge/license-MIT-D6402E)](https://github.com/AssemblyAI/cli/blob/main/LICENSE)
 [![Docs](https://img.shields.io/badge/docs-assemblyai-D6402E)](https://www.assemblyai.com/docs)
 
-The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, and YouTube/podcast pages, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps.
+The AssemblyAI CLI (`assembly`) brings speech AI directly into your terminal: transcribe files, URLs, YouTube/podcast pages, and whole podcast RSS feeds, stream live audio, talk to a two-way voice agent, prompt the LLM Gateway, benchmark speech models, and scaffold ready-to-deploy starter apps.
 
 <p align="center">
   <img src="assets/welcome.png" alt="The assembly CLI welcome screen, listing command groups for transcription, streaming, voice agents, app scaffolding, and account management" width="820">
@@ -44,7 +44,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 
 | Command | What it does |
 | :--- | :--- |
-| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs |
+| `assembly transcribe` | Transcribe files, URLs, YouTube/podcast pages, podcast RSS feeds, directories, globs, or bucket storage (`s3://`, `gs://`, `az://`) — with speaker labels, PII redaction, summarization, SRT/VTT captions, and resumable batch runs |
 | `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too |
 | `assembly dictate` | Push-to-talk dictation: press Enter to record, Enter again for instant text (Sync STT API, up to 120 s per utterance) |
 | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal |
@@ -285,11 +285,13 @@ assembly transcribe video.mp4 -o srt   # captions
 assembly transcribe call.mp3 --speaker-labels --summarization --json
 ```
 
-Transcribe in batches — a directory, a glob, or a piped list, resumable on re-run:
+Transcribe in batches — a directory, a glob, a piped list, or a whole podcast
+RSS feed (every episode becomes one source), resumable on re-run:
 
 ```sh
 assembly transcribe ./recordings
 assembly transcribe "s3://bucket/calls/*.mp3"   # needs: pip install s3fs
+assembly transcribe "https://feeds.simplecast.com/54nAGcIl"   # every episode in the feed
 find . -name "*.wav" | assembly transcribe --from-stdin
 ```
 
diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py
new file mode 100644
index 00000000..3b4df05a
--- /dev/null
+++ b/aai_cli/app/transcribe/feed.py
@@ -0,0 +1,125 @@
+"""Podcast RSS/Atom feed expansion for ``assembly transcribe``.
+
+A feed URL names a whole show, so transcribing it means transcribing every
+episode. ``feed_episode_urls`` fetches the URL and, when it parses as an RSS or
+Atom feed carrying audio/video enclosures, returns those enclosure URLs (in feed
+order — newest first) for the batch path to transcribe, one resumable sidecar per
+episode. The enclosures are direct media URLs the API fetches itself, so — unlike
+a YouTube or podcast *page*, which yt-dlp downloads first — no local download step
+is needed.
+
+Detection is deliberately narrow so a direct media URL or ordinary web page still
+falls through to the single-source path untouched (and is never fetched twice):
+only an http(s) URL whose path is feed-shaped — no extension, or one of
+``.xml``/``.rss``/``.atom`` — and that no dedicated yt-dlp extractor already claims
+is sniffed, the response body is bounded, and only content that actually parses as
+a feed with at least one enclosure is treated as a feed.
+"""
+
+from __future__ import annotations
+
+import html
+import re
+from pathlib import PurePosixPath
+from urllib.parse import urlsplit
+
+from aai_cli.core import youtube
+
+# A feed lives at an extensionless URL (e.g. feeds.simplecast.com/<id>) or a feed
+# document (.xml/.rss/.atom). Every other path — .mp3, .txt, .pdf — is never a feed,
+# so it is left for the single-source path and never fetched here.
+_FEED_URL_SUFFIXES = frozenset({"", ".xml", ".rss", ".atom"})
+
+# Bound the download so a hostile or huge URL can't exhaust memory; 10 MB of feed
+# already holds thousands of episodes, far past any realistic batch.
+_MAX_FEED_BYTES = 10 * 1024 * 1024
+_FETCH_TIMEOUT_SECONDS = 15.0
+
+# A feed body must announce itself with an <rss …> or <feed …> root element
+# (namespaced or not) before its <enclosure>s are trusted, so a stray HTML page
+# that merely contains the word "enclosure" is never mistaken for a podcast.
+_FEED_ROOT_RE = re.compile(r"<\s*(?:[\w.-]+:)?(?:rss|feed)\b", re.IGNORECASE)
+# RSS 2.0 episodes: <enclosure url="…" type="audio/mpeg" length="…"/>. The url
+# attribute can sit anywhere in the tag and use either quote style.
+_ENCLOSURE_TAG_RE = re.compile(r"<\s*enclosure\b([^>]*)>", re.IGNORECASE)
+# Atom episodes: <link rel="enclosure" href="…"/> (rel/href in either order).
+_LINK_TAG_RE = re.compile(r"<\s*link\b([^>]*)>", re.IGNORECASE)
+_URL_ATTR_RE = re.compile(r"""\burl\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
+_HREF_ATTR_RE = re.compile(r"""\bhref\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
+_REL_ENCLOSURE_RE = re.compile(r"""\brel\s*=\s*["']enclosure["']""", re.IGNORECASE)
+
+
+def feed_episode_urls(url: str) -> list[str] | None:
+    """The episode media URLs if `url` is a podcast feed, else ``None``.
+
+    Returns ``None`` (stay single-source) for a direct-media URL, a yt-dlp page,
+    an unreachable URL, or any content that isn't a feed carrying enclosures.
+    """
+    if not _looks_like_feed_url(url) or youtube.is_downloadable_url(url):
+        return None
+    body = _fetch(url)
+    if body is None:
+        return None
+    return _episode_urls(body)
+
+
+def _looks_like_feed_url(url: str) -> bool:
+    """True when the URL path is feed-shaped: extensionless or a feed document."""
+    suffix = PurePosixPath(urlsplit(url).path).suffix.lower()
+    return suffix in _FEED_URL_SUFFIXES
+
+
+def _episode_urls(body: str) -> list[str] | None:
+    """The enclosure URLs in a feed body, deduped in document order; ``None`` when it
+    isn't a feed or carries no enclosures."""
+    if not _FEED_ROOT_RE.search(body):
+        return None
+    urls = [*_rss_enclosure_urls(body), *_atom_enclosure_urls(body)]
+    deduped = list(dict.fromkeys(u for u in urls if u))
+    return deduped or None
+
+
+def _rss_enclosure_urls(body: str) -> list[str]:
+    """The ``url`` of every RSS ``<enclosure url="…">`` tag, HTML-unescaped."""
+    return [
+        html.unescape(match.group(1).strip())
+        for attrs in _ENCLOSURE_TAG_RE.findall(body)
+        if (match := _URL_ATTR_RE.search(attrs)) is not None
+    ]
+
+
+def _atom_enclosure_urls(body: str) -> list[str]:
+    """The ``href`` of every Atom ``<link rel="enclosure" href="…">``, HTML-unescaped."""
+    return [
+        html.unescape(match.group(1).strip())
+        for attrs in _LINK_TAG_RE.findall(body)
+        if _REL_ENCLOSURE_RE.search(attrs) is not None
+        and (match := _HREF_ATTR_RE.search(attrs)) is not None
+    ]
+
+
+def _fetch(url: str) -> str | None:
+    """Up to ``_MAX_FEED_BYTES`` of `url` decoded as text, or ``None`` on any failure
+    or when the response is obviously binary media (audio/video/image)."""
+    import httpx2 as httpx
+
+    chunks: list[bytes] = []
+    try:
+        with (
+            httpx.Client(timeout=_FETCH_TIMEOUT_SECONDS, follow_redirects=True) as client,
+            client.stream("GET", url) as response,
+        ):
+            if not response.is_success:
+                return None
+            content_type = response.headers.get("content-type", "").lower()
+            if content_type.startswith(("audio/", "video/", "image/")):
+                return None
+            total = 0
+            for chunk in response.iter_bytes():
+                chunks.append(chunk)
+                total += len(chunk)
+                if total >= _MAX_FEED_BYTES:
+                    break
+    except (httpx.HTTPError, OSError):
+        return None
+    return b"".join(chunks).decode("utf-8", "replace")
diff --git a/aai_cli/app/transcribe/run.py b/aai_cli/app/transcribe/run.py
index f02d8358..e26458bf 100644
--- a/aai_cli/app/transcribe/run.py
+++ b/aai_cli/app/transcribe/run.py
@@ -356,7 +356,12 @@ def run_transcribe(opts: TranscribeOptions, state: AppState, *, json_mode: bool)
     transcribe_validate.validate_speakers_expected(merged)
 
     sources = transcribe_sources.expand_sources(
-        opts.source, from_stdin=opts.from_stdin, sample=opts.sample
+        opts.source,
+        from_stdin=opts.from_stdin,
+        sample=opts.sample,
+        # --show-code must never touch the network; skip the feed probe and treat a
+        # URL as a single source for code generation.
+        detect_feeds=not opts.show_code,
     )
     if sources is not None:
         transcribe_sources.reject_single_source_flags(
diff --git a/aai_cli/app/transcribe/sources.py b/aai_cli/app/transcribe/sources.py
index e2dc6e9b..bead0a25 100644
--- a/aai_cli/app/transcribe/sources.py
+++ b/aai_cli/app/transcribe/sources.py
@@ -49,13 +49,18 @@
 _GLOB_CHARS = frozenset("*?[")
 
 
-def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> list[str] | None:
+def expand_sources(
+    source: str | None, *, from_stdin: bool, sample: bool, detect_feeds: bool = True
+) -> list[str] | None:
     """The batch source list, or ``None`` when this is a single-source invocation.
 
     Batch mode triggers on ``--from-stdin``, a directory (scanned recursively for
-    audio files), a glob pattern that names no existing file, or a bucket URL
-    that is a glob or trailing-slash folder. A plain file, URL, ``-`` (audio
-    piped on stdin), or ``--sample`` stays on the single-source path.
+    audio files), a glob pattern that names no existing file, a bucket URL that is
+    a glob or trailing-slash folder, or an http(s) URL that turns out to be a
+    podcast RSS/Atom feed (each episode becomes one batch source). A plain file,
+    direct media URL, ``-`` (audio piped on stdin), or ``--sample`` stays on the
+    single-source path. ``detect_feeds=False`` skips the feed probe (and its
+    network fetch) for paths that must not touch the network, e.g. ``--show-code``.
     """
     if from_stdin:
         return _stdin_sources(source, sample=sample)
@@ -63,10 +68,22 @@ def expand_sources(source: str | None, *, from_stdin: bool, sample: bool) -> lis
     # unset shell variable in `assembly transcribe "$FILE"`. `Path("")` is `Path(".")`,
     # so it would otherwise fall into the directory branch and batch-transcribe the
     # whole working directory; instead it stays single-source and fails validation.
-    if not source or sample or source == "-" or source.startswith(URL_PREFIXES):
+    if not source or sample or source == "-":
         return None
+    if source.startswith(URL_PREFIXES):
+        # A podcast feed URL expands into its episode enclosure URLs (batch mode);
+        # a direct media URL or ordinary page returns None and stays single-source.
+        from aai_cli.app.transcribe import feed
+
+        return feed.feed_episode_urls(source) if detect_feeds else None
     if remotefs.is_remote_url(source):
         return _remote_sources(source)
+    return _local_sources(source)
+
+
+def _local_sources(source: str) -> list[str] | None:
+    """Batch sources for a local path: a directory's audio files or a glob's matches,
+    else ``None`` (a single file, which the single-source path handles)."""
     path = Path(source)
     if path.is_dir():
         return _directory_sources(path)
diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py
index 83f8365b..139a4f14 100644
--- a/aai_cli/commands/transcribe.py
+++ b/aai_cli/commands/transcribe.py
@@ -31,6 +31,10 @@
             ("Try it with the hosted sample", "assembly transcribe --sample"),
             ("Transcribe a YouTube video", "assembly transcribe https://youtu.be/dtp6b76pMak"),
             ("Transcribe a podcast page", 'assembly transcribe "https://podcasts.apple.com/…"'),
+            (
+                "Transcribe a whole podcast feed",
+                'assembly transcribe "https://feeds.simplecast.com/…"',
+            ),
             ("Label who said what", "assembly transcribe call.mp3 --speaker-labels"),
             ("Redact PII for compliance", "assembly transcribe call.mp3 --redact-pii"),
             ("Summarize a recording", "assembly transcribe call.mp3 --summarization"),
@@ -43,8 +47,8 @@ def transcribe(
     ctx: typer.Context,
     source: str | None = typer.Argument(
         None,
-        help="Audio file, URL, YouTube/podcast URL, bucket URL (s3://, gs://, …), or a "
-        "directory/glob (batch mode)",
+        help="Audio file, URL, YouTube/podcast URL, podcast RSS feed, bucket URL "
+        "(s3://, gs://, …), or a directory/glob (batch mode)",
     ),
     sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample"),
     # batch mode
@@ -362,10 +366,11 @@ def transcribe(
     URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
 
     Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
-    transcribe many sources concurrently. Each source gets a .aai.json sidecar
-    with the full result (including any --llm responses), and a re-run skips
-    sources already transcribed — with changed --llm prompts it replays just
-    the LLM step, never a second transcription.
+    transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+    to batch mode — every episode enclosure becomes one source. Each source gets a
+    .aai.json sidecar with the full result (including any --llm responses), and a
+    re-run skips sources already transcribed — with changed --llm prompts it
+    replays just the LLM step, never a second transcription.
 
     Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
     batches (a glob, or a folder ending in /); install the matching fsspec
diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md
index f0c2ec56..0c1f29c9 100644
--- a/aai_cli/skills/aai-cli/references/transcription.md
+++ b/aai_cli/skills/aai-cli/references/transcription.md
@@ -5,12 +5,14 @@ Five commands. All accept `--json` (auto-enabled when piped); `transcribe`,
 `transcribe`, `stream`, and `agent` accept `--show-code` to print equivalent
 Python SDK code without calling the API.
 
-## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page
+## `assembly transcribe [SOURCE]` — file / URL / YouTube / podcast page / RSS feed
 
 `SOURCE` is a local file path, public URL, or a media-page URL yt-dlp can extract
 (YouTube, Apple Podcasts, Spreaker, SoundCloud, …) — those are downloaded first.
-Use `--sample` for the hosted `wildfires.mp3`. Analysis results (summary,
-chapters, sentiment, …) render automatically in human mode.
+A podcast RSS/Atom feed URL expands into a resumable batch run over every episode
+enclosure (one `.aai.json` sidecar apiece). Use `--sample` for the hosted
+`wildfires.mp3`. Analysis results (summary, chapters, sentiment, …) render
+automatically in human mode.
 
 High-value flags (run `assembly transcribe --help` for the full set):
 
@@ -37,6 +39,7 @@ assembly transcribe --sample
 assembly transcribe call.mp3 --speaker-labels --speakers-expected 2 --redact-pii
 assembly transcribe call.mp3 -o text
 assembly transcribe call.mp3 --show-code
+assembly transcribe "https://feeds.simplecast.com/54nAGcIl"   # every episode in the feed
 ```
 
 ## `assembly stream [SOURCE]` — live real-time transcription
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 5b19f1df..ef6cd589 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -869,10 +869,11 @@
    URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
   
    Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
-   transcribe many sources concurrently. Each source gets a .aai.json sidecar
-   with the full result (including any --llm responses), and a re-run skips
-   sources already transcribed — with changed --llm prompts it replays just
-   the LLM step, never a second transcription.
+   transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+   to batch mode — every episode enclosure becomes one source. Each source gets a
+   .aai.json sidecar with the full result (including any --llm responses), and a
+   re-run skips sources already transcribed — with changed --llm prompts it
+   replays just the LLM step, never a second transcription.
   
    Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
    batches (a glob, or a folder ending in /); install the matching fsspec
@@ -883,8 +884,9 @@
    mode.
   
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-  │   source      [SOURCE]  Audio file, URL, YouTube/podcast URL, bucket URL     │
-  │                         (s3://, gs://, …), or a directory/glob (batch mode)  │
+  │   source      [SOURCE]  Audio file, URL, YouTube/podcast URL, podcast RSS    │
+  │                         feed, bucket URL (s3://, gs://, …), or a             │
+  │                         directory/glob (batch mode)                          │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
   │ --sample                                             Use the hosted          │
@@ -1052,10 +1054,11 @@
    URLs (any page yt-dlp can extract) are downloaded first, then transcribed.
   
    Batch mode: pass a directory or glob (or pipe a list with --from-stdin) to
-   transcribe many sources concurrently. Each source gets a .aai.json sidecar
-   with the full result (including any --llm responses), and a re-run skips
-   sources already transcribed — with changed --llm prompts it replays just
-   the LLM step, never a second transcription.
+   transcribe many sources concurrently. A podcast RSS/Atom feed URL also expands
+   to batch mode — every episode enclosure becomes one source. Each source gets a
+   .aai.json sidecar with the full result (including any --llm responses), and a
+   re-run skips sources already transcribed — with changed --llm prompts it
+   replays just the LLM step, never a second transcription.
   
    Bucket URLs (s3://, gs://, az://, sftp://, …) work for single files and for
    batches (a glob, or a folder ending in /); install the matching fsspec
@@ -1066,8 +1069,9 @@
    mode.
   
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-  │   source      [SOURCE]  Audio file, URL, YouTube/podcast URL, bucket URL     │
-  │                         (s3://, gs://, …), or a directory/glob (batch mode)  │
+  │   source      [SOURCE]  Audio file, URL, YouTube/podcast URL, podcast RSS    │
+  │                         feed, bucket URL (s3://, gs://, …), or a             │
+  │                         directory/glob (batch mode)                          │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
   │ --sample                                             Use the hosted          │
@@ -1228,6 +1232,8 @@
    $ assembly transcribe https://youtu.be/dtp6b76pMak
    Transcribe a podcast page
    $ assembly transcribe "https://podcasts.apple.com/…"
+   Transcribe a whole podcast feed
+   $ assembly transcribe "https://feeds.simplecast.com/…"
    Label who said what
    $ assembly transcribe call.mp3 --speaker-labels
    Redact PII for compliance
diff --git a/tests/test_transcribe_feed.py b/tests/test_transcribe_feed.py
new file mode 100644
index 00000000..73127bd5
--- /dev/null
+++ b/tests/test_transcribe_feed.py
@@ -0,0 +1,311 @@
+"""Podcast RSS/Atom feed expansion for `assembly transcribe`.
+
+A feed URL becomes a batch over its episode enclosures. These tests cover the
+parser and fetcher in `app/transcribe/feed.py` directly, the `expand_sources`
+seam that routes a feed URL into batch mode, and the end-to-end CLI run with the
+network fetch faked (the suite is socket-blocked).
+"""
+
+import json
+
+import httpx2 as httpx
+import pytest
+from typer.testing import CliRunner
+
+from aai_cli.app.transcribe import feed
+from aai_cli.app.transcribe import sources as transcribe_sources
+from aai_cli.core import config
+from aai_cli.main import app
+
+runner = CliRunner()
+
+_TRANSCRIBE = "aai_cli.app.transcribe.run.client.transcribe"
+
+# A minimal but realistic RSS 2.0 podcast feed: two episodes, newest first, with
+# an &amp;-escaped query string on the second enclosure URL.
+_RSS = """<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <title>Example Show</title>
+    <item>
+      <title>Episode 2</title>
+      <enclosure url="https://cdn.example.com/ep2.mp3" length="1" type="audio/mpeg"/>
+    </item>
+    <item>
+      <title>Episode 1</title>
+      <enclosure type="audio/mpeg" url="https://cdn.example.com/ep1.mp3?token=a&amp;b=2"/>
+    </item>
+  </channel>
+</rss>
+"""
+
+
+# --- parsing ------------------------------------------------------------------
+
+
+def test_episode_urls_parses_rss_in_feed_order_and_unescapes():
+    assert feed._episode_urls(_RSS) == [
+        "https://cdn.example.com/ep2.mp3",
+        "https://cdn.example.com/ep1.mp3?token=a&b=2",  # &amp; decoded, item order kept
+    ]
+
+
+def test_episode_urls_parses_atom_enclosure_links_ignoring_other_links():
+    atom = """<feed xmlns="http://www.w3.org/2005/Atom">
+      <entry>
+        <link rel="alternate" href="https://example.com/page"/>
+        <link href="https://cdn.example.com/a.mp3" rel="enclosure" type="audio/mpeg"/>
+      </entry>
+    </feed>"""
+    # rel/href in either order; the non-enclosure <link> is skipped.
+    assert feed._episode_urls(atom) == ["https://cdn.example.com/a.mp3"]
+
+
+def test_episode_urls_dedupes_preserving_order():
+    rss = """<rss><channel>
+      <item><enclosure url="https://x/a.mp3"/></item>
+      <item><enclosure url="https://x/b.mp3"/></item>
+      <item><enclosure url="https://x/a.mp3"/></item>
+    </channel></rss>"""
+    assert feed._episode_urls(rss) == ["https://x/a.mp3", "https://x/b.mp3"]
+
+
+def test_episode_urls_returns_none_when_not_a_feed():
+    # An ordinary HTML page that merely contains the word enclosure is not a feed.
+    html_page = '<html><body><enclosure url="https://x/a.mp3"/></body></html>'
+    assert feed._episode_urls(html_page) is None
+
+
+def test_episode_urls_returns_none_for_feed_without_enclosures():
+    assert feed._episode_urls("<rss><channel><title>No media</title></channel></rss>") is None
+
+
+def test_episode_urls_ignores_empty_url_attribute():
+    assert feed._episode_urls('<rss><channel><enclosure url=""/></channel></rss>') is None
+
+
+# --- media-extension gate -----------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    ("url", "feed_shaped"),
+    [
+        ("https://feeds.example.com/show.xml", True),
+        ("https://feeds.example.com/show.RSS?fmt=1", True),  # case-insensitive, query ignored
+        ("https://feeds.example.com/feed.atom", True),
+        ("https://feeds.example.com/54nAGcIl", True),  # extensionless feed
+        ("https://cdn.example.com/show/ep.mp3", False),  # direct media — never probed
+        ("https://example.com/notes.txt", False),
+    ],
+)
+def test_looks_like_feed_url(url, feed_shaped):
+    assert feed._looks_like_feed_url(url) is feed_shaped
+
+
+# --- feed_episode_urls (gate + fetch + parse) ---------------------------------
+
+
+def test_feed_episode_urls_skips_direct_media_without_fetching(monkeypatch):
+    def _boom(url):
+        raise AssertionError("a direct media URL must not be fetched")
+
+    monkeypatch.setattr(feed, "_fetch", _boom)
+    assert feed.feed_episode_urls("https://cdn.example.com/ep.mp3") is None
+
+
+def test_feed_episode_urls_skips_ytdlp_page_without_fetching(monkeypatch):
+    def _boom(url):
+        raise AssertionError("a yt-dlp page URL must not be fetched")
+
+    monkeypatch.setattr(feed, "_fetch", _boom)
+    assert feed.feed_episode_urls("https://youtu.be/abc") is None
+
+
+def test_feed_episode_urls_returns_episodes_for_a_feed(monkeypatch):
+    monkeypatch.setattr(feed, "_fetch", lambda url: _RSS)
+    assert feed.feed_episode_urls("https://feeds.example.com/show") == [
+        "https://cdn.example.com/ep2.mp3",
+        "https://cdn.example.com/ep1.mp3?token=a&b=2",
+    ]
+
+
+def test_feed_episode_urls_returns_none_when_fetch_fails(monkeypatch):
+    monkeypatch.setattr(feed, "_fetch", lambda url: None)
+    assert feed.feed_episode_urls("https://feeds.example.com/show") is None
+
+
+def test_feed_episode_urls_returns_none_for_non_feed_body(monkeypatch):
+    monkeypatch.setattr(feed, "_fetch", lambda url: "<html>not a feed</html>")
+    assert feed.feed_episode_urls("https://example.com/page") is None
+
+
+# --- _fetch (httpx, faked offline) --------------------------------------------
+
+
+class _FakeStream:
+    def __init__(self, *, status=200, content_type="application/rss+xml", chunks=(b"<rss/>",)):
+        self.status_code = status
+        self.is_success = 200 <= status < 300  # mirror httpx.Response.is_success
+        self.headers = {"content-type": content_type}
+        self._chunks = chunks
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        return False
+
+    def iter_bytes(self):
+        yield from self._chunks
+
+
+class _FakeClient:
+    def __init__(self, stream):
+        self._stream = stream
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        return False
+
+    def stream(self, method, url):
+        return self._stream
+
+
+def _patch_client(monkeypatch, stream):
+    monkeypatch.setattr(httpx, "Client", lambda **kwargs: _FakeClient(stream))
+
+
+def test_fetch_returns_decoded_body(monkeypatch):
+    _patch_client(monkeypatch, _FakeStream(chunks=(b"<rss>", b"</rss>")))
+    assert feed._fetch("https://feeds.example.com/show") == "<rss></rss>"
+
+
+def test_fetch_returns_none_on_http_error_status(monkeypatch):
+    _patch_client(monkeypatch, _FakeStream(status=404))
+    assert feed._fetch("https://feeds.example.com/missing") is None
+
+
+def test_fetch_treats_exactly_400_as_error(monkeypatch):
+    # is_success is 2xx-only, so a 400 is an error, not a body to parse.
+    _patch_client(monkeypatch, _FakeStream(status=400))
+    assert feed._fetch("https://feeds.example.com/bad") is None
+
+
+@pytest.mark.parametrize("content_type", ["audio/mpeg", "video/mp4", "image/png"])
+def test_fetch_skips_binary_media_content_types(monkeypatch, content_type):
+    _patch_client(monkeypatch, _FakeStream(content_type=content_type, chunks=(b"\x00\x01",)))
+    assert feed._fetch("https://cdn.example.com/file") is None
+
+
+def test_fetch_truncates_at_the_byte_cap(monkeypatch):
+    monkeypatch.setattr(feed, "_MAX_FEED_BYTES", 4)
+    _patch_client(monkeypatch, _FakeStream(chunks=(b"aaa", b"bbb", b"ccc")))
+    # Reads until the running total reaches the cap, then stops — never the third chunk.
+    assert feed._fetch("https://feeds.example.com/big") == "aaabbb"
+
+
+def test_fetch_returns_none_on_network_error(monkeypatch):
+    def _raise(**kwargs):
+        raise httpx.ConnectError("boom")
+
+    monkeypatch.setattr(httpx, "Client", _raise)
+    assert feed._fetch("https://feeds.example.com/show") is None
+
+
+# --- expand_sources seam ------------------------------------------------------
+
+
+def test_expand_sources_routes_feed_url_to_batch(monkeypatch):
+    monkeypatch.setattr(
+        feed, "feed_episode_urls", lambda url: ["https://x/a.mp3", "https://x/b.mp3"]
+    )
+    assert transcribe_sources.expand_sources(
+        "https://feeds.example.com/show", from_stdin=False, sample=False
+    ) == ["https://x/a.mp3", "https://x/b.mp3"]
+
+
+def test_expand_sources_skips_feed_probe_when_detect_feeds_false(monkeypatch):
+    def _boom(url):
+        raise AssertionError("feed detection must be skipped when detect_feeds is False")
+
+    monkeypatch.setattr(feed, "feed_episode_urls", _boom)
+    assert (
+        transcribe_sources.expand_sources(
+            "https://feeds.example.com/show", from_stdin=False, sample=False, detect_feeds=False
+        )
+        is None
+    )
+
+
+# --- end-to-end CLI -----------------------------------------------------------
+
+
+def _auth():
+    config.set_api_key("default", "sk_live")
+
+
+@pytest.fixture(autouse=True)
+def workdir(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
+
+def _patch_transcribe(mocker, monkeypatch):
+    seen = []
+
+    def fake(api_key, audio, *, config):
+        seen.append(audio)
+        t = mocker.MagicMock()
+        t.id = f"t_{audio}"
+        t.text = f"text of {audio}"
+        t.status = "completed"
+        t.json_response = {"id": t.id, "text": t.text, "status": "completed"}
+        return t
+
+    monkeypatch.setattr(_TRANSCRIBE, fake)
+    return seen
+
+
+def test_transcribe_feed_url_batches_every_episode(mocker, monkeypatch):
+    _auth()
+    monkeypatch.setattr(feed, "_fetch", lambda url: _RSS)
+    seen = _patch_transcribe(mocker, monkeypatch)
+    result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--json"])
+    assert result.exit_code == 0
+    # Each episode enclosure was transcribed directly (the API fetches the URL — no
+    # yt-dlp download), and the feed XML itself was never sent as a source. Batch
+    # workers finish in any order, so compare as a set.
+    assert sorted(seen) == sorted(
+        [
+            "https://cdn.example.com/ep2.mp3",
+            "https://cdn.example.com/ep1.mp3?token=a&b=2",
+        ]
+    )
+    statuses = [json.loads(line)["status"] for line in result.output.splitlines()]
+    assert statuses == ["completed", "completed"]
+
+
+def test_transcribe_non_feed_url_stays_single_source(mocker, monkeypatch):
+    # A direct audio URL is passed straight to the API, not expanded into a batch.
+    _auth()
+    monkeypatch.setattr(
+        feed, "_fetch", lambda url: (_ for _ in ()).throw(AssertionError("no fetch"))
+    )
+    seen = _patch_transcribe(mocker, monkeypatch)
+    result = runner.invoke(app, ["transcribe", "https://example.com/episode.mp3", "-o", "id"])
+    assert result.exit_code == 0
+    assert seen == ["https://example.com/episode.mp3"]
+    assert result.output.strip() == "t_https://example.com/episode.mp3"
+
+
+def test_transcribe_feed_url_show_code_does_not_fetch(monkeypatch):
+    _auth()
+
+    def _boom(url):
+        raise AssertionError("--show-code must not touch the network")
+
+    monkeypatch.setattr(feed, "_fetch", _boom)
+    result = runner.invoke(app, ["transcribe", "https://feeds.example.com/show", "--show-code"])
+    assert result.exit_code == 0
+    assert "import assemblyai as aai" in result.output  # generated SDK code, no network probe

From 118d82303dee8475e5a61199c604d6dc9b67c59f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 22:12:38 +0000
Subject: [PATCH 2/3] Parse podcast feeds with feedparser, not a hand-rolled
 regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Swap the regex-based RSS/Atom enclosure extraction in feed.py for `feedparser`,
the de-facto standard feed parser — it handles the namespace, encoding, and
malformed-markup edge cases a regex never will. The bounded, content-type-guarded
httpx fetch stays the only network path: feedparser is handed the already-fetched
bytes (never a URL), so it never fetches on its own.

feedparser's result is untyped, so it's validated through a small pydantic model
(the project pattern for untyped third-party returns — cf. core/wer.py), keeping
feed.py strict-clean under mypy and pyright.

Adds feedparser as a runtime dependency.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq
---
 aai_cli/app/transcribe/feed.py | 94 ++++++++++++++++------------------
 pyproject.toml                 |  5 ++
 tests/test_transcribe_feed.py  | 14 ++++-
 uv.lock                        | 20 ++++++++
 4 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py
index 3b4df05a..96a018d2 100644
--- a/aai_cli/app/transcribe/feed.py
+++ b/aai_cli/app/transcribe/feed.py
@@ -1,28 +1,29 @@
 """Podcast RSS/Atom feed expansion for ``assembly transcribe``.
 
 A feed URL names a whole show, so transcribing it means transcribing every
-episode. ``feed_episode_urls`` fetches the URL and, when it parses as an RSS or
-Atom feed carrying audio/video enclosures, returns those enclosure URLs (in feed
-order — newest first) for the batch path to transcribe, one resumable sidecar per
-episode. The enclosures are direct media URLs the API fetches itself, so — unlike
-a YouTube or podcast *page*, which yt-dlp downloads first — no local download step
-is needed.
+episode. ``feed_episode_urls`` fetches the URL and, when ``feedparser`` recognizes
+it as an RSS or Atom feed, returns its episode enclosure URLs (in feed order —
+newest first) for the batch path to transcribe, one resumable sidecar per episode.
+The enclosures are direct media URLs the API fetches itself, so — unlike a YouTube
+or podcast *page*, which yt-dlp downloads first — no local download step is needed.
 
 Detection is deliberately narrow so a direct media URL or ordinary web page still
 falls through to the single-source path untouched (and is never fetched twice):
 only an http(s) URL whose path is feed-shaped — no extension, or one of
 ``.xml``/``.rss``/``.atom`` — and that no dedicated yt-dlp extractor already claims
-is sniffed, the response body is bounded, and only content that actually parses as
-a feed with at least one enclosure is treated as a feed.
+is sniffed, the response body is bounded, and only content ``feedparser`` parses as
+a real feed with at least one enclosure is treated as a feed. We hand ``feedparser``
+the already-fetched bytes (never the URL) so our bounded, safe fetch below stays the
+only network path.
 """
 
 from __future__ import annotations
 
-import html
-import re
 from pathlib import PurePosixPath
 from urllib.parse import urlsplit
 
+from pydantic import BaseModel
+
 from aai_cli.core import youtube
 
 # A feed lives at an extensionless URL (e.g. feeds.simplecast.com/<id>) or a feed
@@ -32,21 +33,28 @@
 
 # Bound the download so a hostile or huge URL can't exhaust memory; 10 MB of feed
 # already holds thousands of episodes, far past any realistic batch.
-_MAX_FEED_BYTES = 10 * 1024 * 1024
-_FETCH_TIMEOUT_SECONDS = 15.0
-
-# A feed body must announce itself with an <rss …> or <feed …> root element
-# (namespaced or not) before its <enclosure>s are trusted, so a stray HTML page
-# that merely contains the word "enclosure" is never mistaken for a podcast.
-_FEED_ROOT_RE = re.compile(r"<\s*(?:[\w.-]+:)?(?:rss|feed)\b", re.IGNORECASE)
-# RSS 2.0 episodes: <enclosure url="…" type="audio/mpeg" length="…"/>. The url
-# attribute can sit anywhere in the tag and use either quote style.
-_ENCLOSURE_TAG_RE = re.compile(r"<\s*enclosure\b([^>]*)>", re.IGNORECASE)
-# Atom episodes: <link rel="enclosure" href="…"/> (rel/href in either order).
-_LINK_TAG_RE = re.compile(r"<\s*link\b([^>]*)>", re.IGNORECASE)
-_URL_ATTR_RE = re.compile(r"""\burl\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
-_HREF_ATTR_RE = re.compile(r"""\bhref\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
-_REL_ENCLOSURE_RE = re.compile(r"""\brel\s*=\s*["']enclosure["']""", re.IGNORECASE)
+_MAX_FEED_BYTES = 10 * 1024 * 1024  # pragma: no mutate -- tuning knob, not behavior
+_FETCH_TIMEOUT_SECONDS = 15.0  # pragma: no mutate -- tuning knob, not behavior
+
+
+class _Enclosure(BaseModel):
+    """One ``<enclosure>`` / Atom enclosure link; ``href`` is the media URL."""
+
+    href: str = ""
+
+
+class _Entry(BaseModel):
+    enclosures: list[_Enclosure] = []
+
+
+class _ParsedFeed(BaseModel):
+    """The slice of feedparser's untyped result we use, validated into a real type
+    (the project pattern for untyped third-party returns — cf. core/wer.py)."""
+
+    # feedparser sets ``version`` to a non-empty id ("rss20", "atom10", …) for a
+    # recognized feed and to "" for anything it doesn't recognize as one.
+    version: str = ""
+    entries: list[_Entry] = []
 
 
 def feed_episode_urls(url: str) -> list[str] | None:
@@ -70,34 +78,30 @@ def _looks_like_feed_url(url: str) -> bool:
 
 
 def _episode_urls(body: str) -> list[str] | None:
-    """The enclosure URLs in a feed body, deduped in document order; ``None`` when it
-    isn't a feed or carries no enclosures."""
-    if not _FEED_ROOT_RE.search(body):
+    """The enclosure URLs in a feed body, deduped in document order; ``None`` when
+    feedparser doesn't recognize it as a feed or it carries no enclosures."""
+    import io
+
+    import feedparser
+
+    # feedparser.parse() accepts "a URL, a file path, a stream, or a string": a bare
+    # str body that merely *looks* like a URL or names an existing path would be
+    # fetched/opened by feedparser itself. Wrapping the body in a byte stream forces
+    # parse-only handling, so our bounded fetch stays the only network/file access.
+    #
+    # feedparser ships only partial inline types (its parse signature is Unknown),
+    # so the result is validated through _ParsedFeed below; mirror remotefs.py's
+    # fsspec shim in ignoring the unavoidable unknown-member report on the call.
+    stream = io.BytesIO(body.encode("utf-8"))
+    raw = feedparser.parse(stream)  # pyright: ignore[reportUnknownMemberType]
+    parsed = _ParsedFeed.model_validate(raw)
+    if not parsed.version:
         return None
-    urls = [*_rss_enclosure_urls(body), *_atom_enclosure_urls(body)]
-    deduped = list(dict.fromkeys(u for u in urls if u))
+    urls = [enc.href for entry in parsed.entries for enc in entry.enclosures if enc.href]
+    deduped = list(dict.fromkeys(urls))
     return deduped or None
 
 
-def _rss_enclosure_urls(body: str) -> list[str]:
-    """The ``url`` of every RSS ``<enclosure url="…">`` tag, HTML-unescaped."""
-    return [
-        html.unescape(match.group(1).strip())
-        for attrs in _ENCLOSURE_TAG_RE.findall(body)
-        if (match := _URL_ATTR_RE.search(attrs)) is not None
-    ]
-
-
-def _atom_enclosure_urls(body: str) -> list[str]:
-    """The ``href`` of every Atom ``<link rel="enclosure" href="…">``, HTML-unescaped."""
-    return [
-        html.unescape(match.group(1).strip())
-        for attrs in _LINK_TAG_RE.findall(body)
-        if _REL_ENCLOSURE_RE.search(attrs) is not None
-        and (match := _HREF_ATTR_RE.search(attrs)) is not None
-    ]
-
-
 def _fetch(url: str) -> str | None:
     """Up to ``_MAX_FEED_BYTES`` of `url` decoded as text, or ``None`` on any failure
     or when the response is obviously binary media (audio/video/image)."""
diff --git a/pyproject.toml b/pyproject.toml
index d614d42d..8740d822 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,11 @@ dependencies = [
     # imported lazily). fsspec core only — each protocol's backend (s3fs, gcsfs, adlfs,
     # …) stays a user-installed extra surfaced via a clean install hint.
     "fsspec>=2026.4.0",
+    # Podcast RSS/Atom feed parsing for `assembly transcribe <feed-url>` (feed.py,
+    # imported lazily). The de-facto standard feed parser; pure-Python, no compiled
+    # deps. We hand it already-fetched bytes (never a URL) so our bounded, safe
+    # httpx fetch stays the only network path.
+    "feedparser>=6.0.11",
 ]
 
 [project.urls]
diff --git a/tests/test_transcribe_feed.py b/tests/test_transcribe_feed.py
index 73127bd5..5b3bf879 100644
--- a/tests/test_transcribe_feed.py
+++ b/tests/test_transcribe_feed.py
@@ -174,12 +174,22 @@ def stream(self, method, url):
 
 
 def _patch_client(monkeypatch, stream):
-    monkeypatch.setattr(httpx, "Client", lambda **kwargs: _FakeClient(stream))
+    captured = {}
+
+    def factory(**kwargs):
+        captured.update(kwargs)
+        return _FakeClient(stream)
+
+    monkeypatch.setattr(httpx, "Client", factory)
+    return captured
 
 
 def test_fetch_returns_decoded_body(monkeypatch):
-    _patch_client(monkeypatch, _FakeStream(chunks=(b"<rss>", b"</rss>")))
+    captured = _patch_client(monkeypatch, _FakeStream(chunks=(b"<rss>", b"</rss>")))
     assert feed._fetch("https://feeds.example.com/show") == "<rss></rss>"
+    # Feeds commonly 301/302 to a CDN, so redirects must be followed.
+    assert captured["follow_redirects"] is True
+    assert captured["timeout"] == feed._FETCH_TIMEOUT_SECONDS
 
 
 def test_fetch_returns_none_on_http_error_status(monkeypatch):
diff --git a/uv.lock b/uv.lock
index e817e19f..098244ab 100644
--- a/uv.lock
+++ b/uv.lock
@@ -22,6 +22,7 @@ source = { editable = "." }
 dependencies = [
     { name = "assemblyai" },
     { name = "audioop-lts", marker = "python_full_version >= '3.13'" },
+    { name = "feedparser" },
     { name = "fsspec" },
     { name = "httpx2" },
     { name = "jiwer" },
@@ -74,6 +75,7 @@ dev = [
 requires-dist = [
     { name = "assemblyai", specifier = ">=0.64.4" },
     { name = "audioop-lts", marker = "python_full_version >= '3.13'", specifier = ">=0.2" },
+    { name = "feedparser", specifier = ">=6.0.11" },
     { name = "fsspec", specifier = ">=2026.4.0" },
     { name = "httpx2", specifier = ">=2.0.0" },
     { name = "jiwer", specifier = ">=4.0" },
@@ -674,6 +676,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/82/45359b62a067409bd929ae8a56b8ed13e5a8c8a61194b3c236920999ab83/fastapi-0.136.3-py3-none-any.whl", hash = "sha256:3d2a69bdf04b7e9f3afa292c3bc7a98816bbfafa10bc9b45f3f3700d2f761620", size = 117481, upload-time = "2026-05-23T18:53:16.924Z" },
 ]
 
+[[package]]
+name = "feedparser"
+version = "6.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "sgmllib3k" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" },
+]
+
 [[package]]
 name = "filelock"
 version = "3.29.0"
@@ -1825,6 +1839,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/46/f5af3402b579fd5e11573ce652019a67074317e18c1935cc0b4ba9b35552/secretstorage-3.5.0-py3-none-any.whl", hash = "sha256:0ce65888c0725fcb2c5bc0fdb8e5438eece02c523557ea40ce0703c266248137", size = 15554, upload-time = "2025-11-23T19:02:51.545Z" },
 ]
 
+[[package]]
+name = "sgmllib3k"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
+
 [[package]]
 name = "shellingham"
 version = "1.5.4"

From 73ae4b0f2afb940ff58a71421c768790268890e3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 22:18:43 +0000
Subject: [PATCH 3/3] Use a typed default_factory for feed model list fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the `= []` field defaults on the pydantic feed models with
`Field(default_factory=list[...])`. pydantic v2 already deep-copies mutable
defaults per instance, so `= []` was not actually shared — but the explicit
typed factory makes per-instance isolation obvious to readers and static
analysis, while keeping the field's element type known under pyright strict
(a bare `default_factory=list` infers `list[Unknown]`).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VwZxsDGG57kDQU4J39u3oq
---
 aai_cli/app/transcribe/feed.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/aai_cli/app/transcribe/feed.py b/aai_cli/app/transcribe/feed.py
index 96a018d2..eb5a7dd1 100644
--- a/aai_cli/app/transcribe/feed.py
+++ b/aai_cli/app/transcribe/feed.py
@@ -22,7 +22,7 @@
 from pathlib import PurePosixPath
 from urllib.parse import urlsplit
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from aai_cli.core import youtube
 
@@ -44,7 +44,9 @@ class _Enclosure(BaseModel):
 
 
 class _Entry(BaseModel):
-    enclosures: list[_Enclosure] = []
+    # pydantic already copies a `= []` default per instance; default_factory just makes
+    # that explicit, and the typed factory keeps the element type known under pyright strict.
+    enclosures: list[_Enclosure] = Field(default_factory=list[_Enclosure])
 
 
 class _ParsedFeed(BaseModel):
@@ -54,7 +56,7 @@ class _ParsedFeed(BaseModel):
     # feedparser sets ``version`` to a non-empty id ("rss20", "atom10", …) for a
     # recognized feed and to "" for anything it doesn't recognize as one.
     version: str = ""
-    entries: list[_Entry] = []
+    entries: list[_Entry] = Field(default_factory=list[_Entry])
 
 
 def feed_episode_urls(url: str) -> list[str] | None: