AssemblyAI · alexkroman · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · aikido-pr-checks
diff --git a/aai_cli/commands/speak/__init__.py b/aai_cli/commands/speak/__init__.py
@@ -37,6 +37,10 @@
                 "Override a speaker's voice",
                 "… | assembly --sandbox speak --voice A=vera --voice B=paul",
             ),
+            (
+                "Read a web page aloud (boilerplate stripped)",
+                "assembly --sandbox speak --url https://example.com/post",
+            ),
             (
                 "Save to a WAV instead of playing",
                 'assembly --sandbox speak "Hello" --out /tmp/hello.wav',
@@ -47,6 +51,12 @@
 def speak(
     ctx: typer.Context,
     text: str | None = typer.Argument(None, help="Text to speak. Omit to read from stdin."),
+    url: str | None = typer.Option(
+        None,
+        "--url",
+        help="Read a web page aloud: fetch the URL and narrate its main text "
+        "(boilerplate stripped). Mutually exclusive with the text argument",
+    ),
     voice: list[str] = typer.Option(
         [],
         "--voice",
@@ -72,16 +82,18 @@ def speak(
 ) -> None:
     r"""\[sandbox] Synthesize speech from text with AssemblyAI streaming TTS
 
-    Plays the audio through your speakers by default, or writes a WAV with
-    --out. Speaker-labeled input (from 'assembly transcribe
-    --speaker-labels') is detected automatically: the labels are stripped
-    and each speaker gets a different voice. This feature only exists in
-    the sandbox today — run it as 'assembly --sandbox speak' (--sandbox
-    goes before the subcommand).
+    Reads text from the argument, piped stdin, or a web page with --url
+    (its main content is extracted and the boilerplate stripped). Plays the
+    audio through your speakers by default, or writes a WAV with --out.
+    Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
+    detected automatically: the labels are stripped and each speaker gets a
+    different voice. This feature only exists in the sandbox today — run it
+    as 'assembly --sandbox speak' (--sandbox goes before the subcommand).
     """
 
     opts = speak_exec.SpeakOptions(
         text=text,
+        url=url,
         voice=voice,
         language=language,
         sample_rate=sample_rate,

diff --git a/aai_cli/commands/speak/_exec.py b/aai_cli/commands/speak/_exec.py
@@ -12,8 +12,8 @@
 from pathlib import Path
 
 from aai_cli.app.context import AppState
-from aai_cli.core import signals, stdio
-from aai_cli.core.errors import UsageError
+from aai_cli.core import signals, stdio, webpage
+from aai_cli.core.errors import UsageError, mutually_exclusive
 from aai_cli.tts import audio, dialogue, session, voices
 from aai_cli.ui import output
 
@@ -30,12 +30,26 @@ class SpeakOptions:
     resolves it into the ``json_mode`` argument)."""
 
     text: str | None
+    url: str | None
     voice: list[str]
     language: str
     sample_rate: int | None
     out: Path | None
 
 
+def _resolve_input(opts: SpeakOptions) -> str:
+    """The text to speak: a fetched web page when --url is given, else the
+    argument or piped stdin. --url conflicts only with the text argument; stdin is left unread."""
+    mutually_exclusive(
+        ("the text argument", opts.text),
+        ("--url", opts.url),
+        suggestion="Speak a URL or literal text, not both.",
+    )
+    if opts.url is not None:
+        return webpage.fetch_article(opts.url).text
+    return _read_text(opts.text)
+
+
 def _read_text(text: str | None) -> str:
     """The text to speak: the non-blank argument, or piped stdin when the argument
     is omitted entirely. A *blank* argument (e.g. "") is a usage error, never a
@@ -179,7 +193,7 @@ def _speak_dialogue(
 def run_speak(opts: SpeakOptions, state: AppState, *, json_mode: bool) -> None:
     """Execute one `assembly speak` invocation from already-parsed flags."""
     session.require_available("speak")
-    spoken = _read_text(opts.text)
+    spoken = _resolve_input(opts)
     api_key = state.resolve_api_key()
     bare_voice, overrides = dialogue.parse_voice_overrides(opts.voice)
     # SIGTERM aborts synthesis/playback the same way Ctrl-C does, so an external

diff --git a/aai_cli/core/webpage.py b/aai_cli/core/webpage.py
@@ -0,0 +1,83 @@
+"""Fetch a web page and extract its main article text.
+
+Backs ``assembly speak --url``: httpx2 (the project's pinned client) fetches the
+HTML and trafilatura strips the boilerplate — nav, sidebars, cookie banners,
+footers, comment threads — down to the readable article body, so text-to-speech
+narrates the piece rather than the page chrome. trafilatura (and its lxml
+backend) is the heavy import, so it is deferred to call time to stay off the
+CLI's startup path.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import httpx2 as httpx
+
+from aai_cli.core.errors import APIError, UsageError
+
+# A page fetch shouldn't hang a TTS run; cap it.
+_TIMEOUT = 30.0  # pragma: no mutate -- request timeout; nothing observable to assert
+# Browser-like UA: some sites serve a stub or block page to unknown clients.
+_USER_AGENT = "Mozilla/5.0 (compatible; assembly-cli; +https://www.assemblyai.com)"
+
+
+@dataclass(frozen=True)
+class Article:
+    """The readable content extracted from a web page."""
+
+    text: str
+    title: str | None
+    url: str
+
+
+def fetch_article(url: str) -> Article:
+    """Fetch ``url`` and return its main article text with boilerplate removed.
+
+    Raises a :class:`UsageError` when ``url`` isn't an http(s) address or the
+    page yields no readable text, and an :class:`APIError` when the fetch itself
+    fails (DNS, timeout, non-2xx).
+    """
+    if not url.startswith(("http://", "https://")):
+        raise UsageError(
+            f"Not a web page URL: {url}",
+            suggestion="Pass an http(s) URL, e.g. assembly speak --url https://example.com/post.",
+        )
+    text, title = _extract(_fetch_html(url))
+    if not text:
+        raise UsageError(
+            f"Couldn't find readable text at {url}.",
+            suggestion="The page may be paywalled, JavaScript-rendered, or not an article.",
+        )
+    return Article(text=text, title=title, url=url)
+
+
+def _fetch_html(url: str) -> str:
+    """GET the raw HTML for ``url``, mapping any network/HTTP failure to APIError."""
+    try:
+        with httpx.Client(
+            timeout=_TIMEOUT,
+            follow_redirects=True,
+            headers={"User-Agent": _USER_AGENT},
+        ) as client:
+            response = client.get(url)
+            response.raise_for_status()
+            return response.text
+    except httpx.HTTPError as exc:
+        raise APIError(f"Couldn't fetch {url}: {exc}") from exc
+
+
+def _extract(html: str) -> tuple[str | None, str | None]:
+    """Pull the main text and title out of ``html`` (trafilatura, imported lazily)."""
+    import trafilatura
+
+    text = trafilatura.extract(
+        html,
+        output_format="txt",
+        # Don't narrate the comment thread. trafilatura's comment classifier keys
+        # off real-world markup (Disqus, microformats), which synthetic test
+        # fixtures can't reproduce, so this flag stays unasserted by the suite.
+        include_comments=False,  # pragma: no mutate
+    )
+    title = getattr(trafilatura.extract_metadata(html), "title", None)
+    return text, title
diff --git a/pyproject.toml b/pyproject.toml
@@ -61,6 +61,10 @@ dependencies = [
     # deps. We hand it already-fetched bytes (never a URL) so our bounded, safe
     # httpx fetch stays the only network path.
     "feedparser>=6.0.11",
+    # Web-page article extraction for `assembly speak --url` (webpage.py, imported
+    # lazily). Strips boilerplate down to the readable body; ships prebuilt wheels
+    # (lxml included), so it adds no source-compile step to Homebrew bottling.
+    "trafilatura>=2.1.0",
 ]
 
 [project.urls]

diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -659,17 +659,23 @@
 
    [sandbox] Synthesize speech from text with AssemblyAI streaming TTS
 
-   Plays the audio through your speakers by default, or writes a WAV with
-   --out. Speaker-labeled input (from 'assembly transcribe
-   --speaker-labels') is detected automatically: the labels are stripped
-   and each speaker gets a different voice. This feature only exists in
-   the sandbox today — run it as 'assembly --sandbox speak' (--sandbox
-   goes before the subcommand).
+   Reads text from the argument, piped stdin, or a web page with --url
+   (its main content is extracted and the boilerplate stripped). Plays the
+   audio through your speakers by default, or writes a WAV with --out.
+   Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
+   detected automatically: the labels are stripped and each speaker gets a
+   different voice. This feature only exists in the sandbox today — run it
+   as 'assembly --sandbox speak' (--sandbox goes before the subcommand).
 
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
   │   text      [TEXT]  Text to speak. Omit to read from stdin.                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
+  │ --url                  TEXT                  Read a web page aloud: fetch    │
+  │                                              the URL and narrate its main    │
+  │                                              text (boilerplate stripped).    │
+  │                                              Mutually exclusive with the     │
+  │                                              text argument                   │
   │ --voice                TEXT                  Voice id (e.g. jane, michael,   │
   │                                              mary, paul, eve, george), or    │
   │                                              SPEAKER=VOICE for diarized      │
@@ -699,6 +705,8 @@
    $ assembly transcribe meeting.mp3 --speaker-labels | assembly --sandbox speak
    Override a speaker's voice
    $ … | assembly --sandbox speak --voice A=vera --voice B=paul
+   Read a web page aloud (boilerplate stripped)
+   $ assembly --sandbox speak --url https://example.com/post
    Save to a WAV instead of playing
    $ assembly --sandbox speak "Hello" --out /tmp/hello.wav
 

diff --git a/tests/test_command_options_seam.py b/tests/test_command_options_seam.py
@@ -94,6 +94,7 @@
 
 SPEAK_DEFAULTS = speak_exec.SpeakOptions(
     text=None,
+    url=None,
     voice=[],
     language=speak_exec.DEFAULT_LANGUAGE,
     sample_rate=None,

diff --git a/tests/test_speak.py b/tests/test_speak.py
@@ -82,6 +82,31 @@ def test_out_writes_wav_and_does_not_play(monkeypatch, tmp_path, fake_synthesize
     assert "played" not in result.stderr
 
 
+def test_url_reads_web_page_aloud(monkeypatch, fake_synthesize):
+    # --url fetches a page and narrates its extracted main text.
+    from aai_cli.core import webpage
+
+    monkeypatch.setattr("aai_cli.commands.speak._exec.audio.play_pcm", lambda *a, **k: None)
+    monkeypatch.setattr(
+        webpage,
+        "fetch_article",
+        lambda url: webpage.Article(text="The article body.", title="Headline", url=url),
+    )
+    result = runner.invoke(app, ["--sandbox", "speak", "--url", "https://example.com/post"])
+    assert result.exit_code == 0
+    assert fake_synthesize["cfg"].text == "The article body."
+
+
+def test_url_and_text_argument_are_mutually_exclusive(monkeypatch):
+    result = runner.invoke(
+        app, ["--sandbox", "speak", "Hello", "--url", "https://example.com/post"]
+    )
+    assert result.exit_code == 2
+    # Both conflicting inputs are named so the fix is unambiguous.
+    assert "the text argument" in result.output
+    assert "--url" in result.output
+
+
 def test_installs_sigterm_handler_around_synthesis(monkeypatch):
     captured: dict[str, object] = {}
 

diff --git a/tests/test_webpage.py b/tests/test_webpage.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import dataclasses
+
+import httpx2 as httpx
+import pytest
+
+from aai_cli.core import webpage
+from aai_cli.core.errors import APIError, UsageError
+
+# An article wrapped in the usual page chrome: nav, a comment thread, a footer,
+# and a <title>. The extractor should keep the body and title, dropping the chrome.
+ARTICLE_HTML = """<!DOCTYPE html><html><head><title>The Real Headline</title></head>
+<body>
+<nav>Home | About | SubscribeNavBoilerplate</nav>
+<article>
+<h1>The Real Headline</h1>
+<p>This is the first real paragraph of the article body that we care about.</p>
+<p>Here is a second substantive paragraph with more content to extract.</p>
+</article>
+<section class="comments"><p>UserCommentText that appears in the discussion thread below.</p></section>
+<footer>FooterBoilerplate copyright 2026.</footer>
+</body></html>"""
+
+
+def _client_returning(monkeypatch, handler):
+    """Patch webpage.httpx.Client to route requests through a MockTransport handler
+    (the test_eval_data_hf.py pattern) so no real socket is opened."""
+    real_client = httpx.Client
+
+    def fake_client(*args, **kwargs):
+        kwargs["transport"] = httpx.MockTransport(handler)
+        return real_client(*args, **kwargs)
+
+    monkeypatch.setattr(webpage.httpx, "Client", fake_client)
+
+
+def test_article_is_immutable():
+    # frozen=True: a fetched Article can't be mutated out from under a caller.
+    article = webpage.Article(text="body", title="T", url="https://example.com/p")
+    # A dynamic field name (not a literal) keeps pyright from resolving the
+    # assignment to the read-only attribute — see test_command_options_seam.py.
+    field_name = dataclasses.fields(article)[0].name
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        setattr(article, field_name, "tampered")
+
+
+def test_fetch_html_returns_body_and_sends_browser_user_agent(monkeypatch):
+    seen: dict[str, str] = {}
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        seen["ua"] = request.headers["user-agent"]
+        return httpx.Response(200, text="<html>ok</html>")
+
+    _client_returning(monkeypatch, handler)
+    assert webpage._fetch_html("https://example.com/post") == "<html>ok</html>"
+    # The browser-like UA is sent so sites don't serve a stub/block page.
+    assert "assembly-cli" in seen["ua"]
+
+
+def test_fetch_html_follows_redirects(monkeypatch):
+    # A 301 must be followed to the final 200; without follow_redirects the
+    # client would return the empty 301 body instead of the article.
+    def handler(request: httpx.Request) -> httpx.Response:
+        if request.url.path == "/start":
+            return httpx.Response(301, headers={"Location": "https://example.com/final"})
+        return httpx.Response(200, text="final body")
+
+    _client_returning(monkeypatch, handler)
+    assert webpage._fetch_html("https://example.com/start") == "final body"
+
+
+def test_fetch_html_non_2xx_becomes_api_error(monkeypatch):
+    _client_returning(monkeypatch, lambda request: httpx.Response(404, text="nope"))
+    with pytest.raises(APIError) as exc:
+        webpage._fetch_html("https://example.com/missing")
+    assert "https://example.com/missing" in exc.value.message
+
+
+def test_fetch_html_connect_error_becomes_api_error(monkeypatch):
+    def handler(request: httpx.Request) -> httpx.Response:
+        raise httpx.ConnectError("boom")
+
+    _client_returning(monkeypatch, handler)
+    with pytest.raises(APIError):
+        webpage._fetch_html("https://example.com/post")
+
+
+def test_extract_strips_boilerplate_and_comments_and_reads_title():
+    text, title = webpage._extract(ARTICLE_HTML)
+    assert text is not None
+    # The article body survives...
+    assert "first real paragraph of the article body" in text
+    # ...while the nav and footer chrome are dropped.
+    assert "NavBoilerplate" not in text
+    assert "FooterBoilerplate" not in text
+    # The <title> drives the extracted title.
+    assert title == "The Real Headline"
+
+
+def test_fetch_article_rejects_non_http_url():
+    with pytest.raises(UsageError) as exc:
+        webpage.fetch_article("ftp://example.com/file")
+    assert "Not a web page URL" in exc.value.message
+    assert "http" in (exc.value.suggestion or "")
+
+
+def test_fetch_article_returns_extracted_text_and_title(monkeypatch):
+    monkeypatch.setattr(webpage, "_fetch_html", lambda url: ARTICLE_HTML)
+    article = webpage.fetch_article("https://example.com/post")
+    assert "first real paragraph of the article body" in article.text
+    assert article.title == "The Real Headline"
+    assert article.url == "https://example.com/post"
+
+
+def test_fetch_article_without_readable_text_is_a_usage_error(monkeypatch):
+    # When trafilatura finds no article in a page it yields no text -> usage error.
+    monkeypatch.setattr(webpage, "_fetch_html", lambda url: "<html><body></body></html>")
+    with pytest.raises(UsageError) as exc:
+        webpage.fetch_article("https://example.com/empty")
+    assert "Couldn't find readable text" in exc.value.message