From 3803309ae1dad9b3363ea35cf1a884b83a17119b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 17 Jun 2026 15:04:48 +0000
Subject: [PATCH] speak: narrate PDFs via --url, not just HTML pages

assembly speak --url now handles PDF URLs in addition to HTML. The fetch
reads the full response and dispatches on content type: PDFs (detected by
Content-Type or the %PDF- magic bytes, so a mislabeled octet-stream still
routes correctly) go through pypdf's text-layer extraction; HTML keeps the
trafilatura boilerplate-stripping path. A scanned/image-only PDF (no text
layer) and an unparseable PDF both surface a clear UsageError.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_015Z1o33Ezt9aznmePd4Jc9R
---
 aai_cli/commands/speak/__init__.py            |   4 +-
 aai_cli/core/webpage.py                       |  77 +++++++++---
 pyproject.toml                                |   4 +
 .../test_snapshots_help_run.ambr              |  12 +-
 tests/test_webpage.py                         | 119 ++++++++++++++++--
 uv.lock                                       |  11 ++
 6 files changed, 192 insertions(+), 35 deletions(-)

diff --git a/aai_cli/commands/speak/__init__.py b/aai_cli/commands/speak/__init__.py
index f5679515..9df21b4a 100644
--- a/aai_cli/commands/speak/__init__.py
+++ b/aai_cli/commands/speak/__init__.py
@@ -54,7 +54,7 @@ def speak(
     url: str | None = typer.Option(
         None,
         "--url",
-        help="Read a web page aloud: fetch the URL and narrate its main text "
+        help="Read a web page or PDF aloud: fetch the URL and narrate its main text "
         "(boilerplate stripped). Mutually exclusive with the text argument",
     ),
     voice: list[str] = typer.Option(
@@ -82,7 +82,7 @@ def speak(
 ) -> None:
     r"""\[sandbox] Synthesize speech from text with AssemblyAI streaming TTS
 
-    Reads text from the argument, piped stdin, or a web page with --url
+    Reads text from the argument, piped stdin, or a web page or PDF with --url
     (its main content is extracted and the boilerplate stripped). Plays the
     audio through your speakers by default, or writes a WAV with --out.
     Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
diff --git a/aai_cli/core/webpage.py b/aai_cli/core/webpage.py
index 244df6ee..f711ff70 100644
--- a/aai_cli/core/webpage.py
+++ b/aai_cli/core/webpage.py
@@ -1,10 +1,12 @@
-"""Fetch a web page and extract its main article text.
+"""Fetch a web page (or PDF) and extract its main readable text.
 
 Backs ``assembly speak --url``: httpx2 (the project's pinned client) fetches the
-HTML and trafilatura strips the boilerplate — nav, sidebars, cookie banners,
-footers, comment threads — down to the readable article body, so text-to-speech
-narrates the piece rather than the page chrome. trafilatura (and its lxml
-backend) is the heavy import, so it is deferred to call time to stay off the
+resource, then the body is narrowed to the readable text. For HTML, trafilatura
+strips the boilerplate — nav, sidebars, cookie banners, footers, comment threads
+— down to the article body; for a PDF (detected by Content-Type or the ``%PDF-``
+magic bytes) pypdf pulls the text layer out of every page. Either way text-to-speech
+narrates the piece rather than the page chrome. trafilatura (and its lxml backend)
+and pypdf are the heavy imports, so both are deferred to call time to stay off the
 CLI's startup path.
 """
 
@@ -20,11 +22,14 @@
 _TIMEOUT = 30.0  # pragma: no mutate -- request timeout; nothing observable to assert
 # Browser-like UA: some sites serve a stub or block page to unknown clients.
 _USER_AGENT = "Mozilla/5.0 (compatible; assembly-cli; +https://www.assemblyai.com)"
+# Every PDF begins with this signature; the robust signal when a server mislabels
+# the Content-Type (e.g. application/octet-stream) or the URL has no .pdf suffix.
+_PDF_MAGIC = b"%PDF-"
 
 
 @dataclass(frozen=True)
 class Article:
-    """The readable content extracted from a web page."""
+    """The readable content extracted from a web page or PDF."""
 
     text: str
     title: str | None
@@ -32,10 +37,11 @@ class Article:
 
 
 def fetch_article(url: str) -> Article:
-    """Fetch ``url`` and return its main article text with boilerplate removed.
+    """Fetch ``url`` and return its main readable text with boilerplate removed.
 
-    Raises a :class:`UsageError` when ``url`` isn't an http(s) address or the
-    page yields no readable text, and an :class:`APIError` when the fetch itself
+    HTML pages go through trafilatura; PDFs go through pypdf. Raises a
+    :class:`UsageError` when ``url`` isn't an http(s) address or the resource
+    yields no readable text, and an :class:`APIError` when the fetch itself
     fails (DNS, timeout, non-2xx).
     """
     if not url.startswith(("http://", "https://")):
@@ -43,17 +49,29 @@ def fetch_article(url: str) -> Article:
             f"Not a web page URL: {url}",
             suggestion="Pass an http(s) URL, e.g. assembly speak --url https://example.com/post.",
         )
-    text, title = _extract(_fetch_html(url))
-    if not text:
-        raise UsageError(
-            f"Couldn't find readable text at {url}.",
-            suggestion="The page may be paywalled, JavaScript-rendered, or not an article.",
+    response = _fetch(url)
+    content_type = response.headers.get("content-type", "").lower()
+    data = response.content
+    if _is_pdf(data, content_type):
+        text, title = _extract_pdf(data)
+        empty_hint = (
+            "The PDF may be scanned or image-only — there's no text layer to read "
+            "(that needs OCR, which speak doesn't do)."
         )
+    else:
+        text, title = _extract(response.text)
+        empty_hint = "The page may be paywalled, JavaScript-rendered, or not an article."
+    if not text:
+        raise UsageError(f"Couldn't find readable text at {url}.", suggestion=empty_hint)
     return Article(text=text, title=title, url=url)
 
 
-def _fetch_html(url: str) -> str:
-    """GET the raw HTML for ``url``, mapping any network/HTTP failure to APIError."""
+def _fetch(url: str) -> httpx.Response:
+    """GET ``url``, mapping any network/HTTP failure to APIError.
+
+    Returns the fully-read response so the caller can read it as text (HTML) or
+    bytes (PDF) depending on the content type.
+    """
     try:
         with httpx.Client(
             timeout=_TIMEOUT,
@@ -62,11 +80,16 @@ def _fetch_html(url: str) -> str:
         ) as client:
             response = client.get(url)
             response.raise_for_status()
-            return response.text
+            return response
     except httpx.HTTPError as exc:
         raise APIError(f"Couldn't fetch {url}: {exc}") from exc
 
 
+def _is_pdf(data: bytes, content_type: str) -> bool:
+    """Report whether the response is a PDF: labeled as one, or opening with ``%PDF-``."""
+    return data.startswith(_PDF_MAGIC) or "application/pdf" in content_type
+
+
 def _extract(html: str) -> tuple[str | None, str | None]:
     """Pull the main text and title out of ``html`` (trafilatura, imported lazily)."""
     import trafilatura
@@ -81,3 +104,33 @@ def _extract(html: str) -> tuple[str | None, str | None]:
     )
     title = getattr(trafilatura.extract_metadata(html), "title", None)
     return text, title
+
+
+def _extract_pdf(data: bytes) -> tuple[str | None, str | None]:
+    """Pull the text layer and title out of a PDF (pypdf, imported lazily).
+
+    Returns ``(text, title)``: ``text`` is every non-empty page's text joined by
+    blank lines, or ``None`` when no page has any (e.g. a scanned, image-only
+    PDF); ``title`` is the document-information title, or ``None`` without one.
+
+    Raises a :class:`UsageError` when pypdf can't parse the file, its pages, or
+    its document-information dictionary.
+    """
+    from io import BytesIO
+
+    from pypdf import PdfReader
+    from pypdf.errors import PyPdfError
+
+    try:
+        reader = PdfReader(BytesIO(data))
+        pages = [page.extract_text() for page in reader.pages]
+        # pypdf resolves /Info only when metadata is accessed and raises on a
+        # malformed one, so read the title here to keep that a UsageError too.
+        title = getattr(reader.metadata, "title", None)
+    except PyPdfError as exc:
+        raise UsageError(
+            f"Couldn't read the PDF at the URL: {exc}",
+            suggestion="The file may be encrypted, corrupt, or not a valid PDF.",
+        ) from exc
+    text = "\n\n".join(page for page in pages if page).strip() or None
+    return text, title
diff --git a/pyproject.toml b/pyproject.toml
index 78404223..609b661f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,10 @@ dependencies = [
     # lazily). Strips boilerplate down to the readable body; ships prebuilt wheels
     # (lxml included), so it adds no source-compile step to Homebrew bottling.
     "trafilatura>=2.1.0",
+    # PDF text extraction for `assembly speak --url` when the URL serves a PDF
+    # (webpage.py, imported lazily). Pure-Python, permissively licensed, ships a
+    # universal wheel, so it adds no source-compile step to Homebrew bottling.
+    "pypdf>=5.1.0",
 ]
 
 [project.urls]
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index e9a69d92..ecd8d601 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -706,7 +706,7 @@
   
    [sandbox] Synthesize speech from text with AssemblyAI streaming TTS
   
-   Reads text from the argument, piped stdin, or a web page with --url
+   Reads text from the argument, piped stdin, or a web page or PDF with --url
    (its main content is extracted and the boilerplate stripped). Plays the
    audio through your speakers by default, or writes a WAV with --out.
    Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
@@ -718,11 +718,11 @@
   │   text      [TEXT]  Text to speak. Omit to read from stdin.                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
-  │ --url                  TEXT                  Read a web page aloud: fetch    │
-  │                                              the URL and narrate its main    │
-  │                                              text (boilerplate stripped).    │
-  │                                              Mutually exclusive with the     │
-  │                                              text argument                   │
+  │ --url                  TEXT                  Read a web page or PDF aloud:   │
+  │                                              fetch the URL and narrate its   │
+  │                                              main text (boilerplate          │
+  │                                              stripped). Mutually exclusive   │
+  │                                              with the text argument          │
   │ --voice                TEXT                  Voice id (e.g. jane, michael,   │
   │                                              mary, paul, eve, george), or    │
   │                                              SPEAKER=VOICE for diarized      │
diff --git a/tests/test_webpage.py b/tests/test_webpage.py
index 893c7b81..2c18c46c 100644
--- a/tests/test_webpage.py
+++ b/tests/test_webpage.py
@@ -45,7 +45,7 @@ def test_article_is_immutable():
         setattr(article, field_name, "tampered")
 
 
-def test_fetch_html_returns_body_and_sends_browser_user_agent(monkeypatch):
+def test_fetch_returns_body_and_sends_browser_user_agent(monkeypatch):
     seen: dict[str, str] = {}
 
     def handler(request: httpx.Request) -> httpx.Response:
@@ -53,12 +53,12 @@ def handler(request: httpx.Request) -> httpx.Response:
         return httpx.Response(200, text="<html>ok</html>")
 
     _client_returning(monkeypatch, handler)
-    assert webpage._fetch_html("https://example.com/post") == "<html>ok</html>"
+    assert webpage._fetch("https://example.com/post").text == "<html>ok</html>"
     # The browser-like UA is sent so sites don't serve a stub/block page.
     assert "assembly-cli" in seen["ua"]
 
 
-def test_fetch_html_follows_redirects(monkeypatch):
+def test_fetch_follows_redirects(monkeypatch):
     # A 301 must be followed to the final 200; without follow_redirects the
     # client would return the empty 301 body instead of the article.
     def handler(request: httpx.Request) -> httpx.Response:
@@ -67,23 +67,23 @@ def handler(request: httpx.Request) -> httpx.Response:
         return httpx.Response(200, text="final body")
 
     _client_returning(monkeypatch, handler)
-    assert webpage._fetch_html("https://example.com/start") == "final body"
+    assert webpage._fetch("https://example.com/start").text == "final body"
 
 
-def test_fetch_html_non_2xx_becomes_api_error(monkeypatch):
+def test_fetch_non_2xx_becomes_api_error(monkeypatch):
     _client_returning(monkeypatch, lambda request: httpx.Response(404, text="nope"))
     with pytest.raises(APIError) as exc:
-        webpage._fetch_html("https://example.com/missing")
+        webpage._fetch("https://example.com/missing")
     assert "https://example.com/missing" in exc.value.message
 
 
-def test_fetch_html_connect_error_becomes_api_error(monkeypatch):
+def test_fetch_connect_error_becomes_api_error(monkeypatch):
     def handler(request: httpx.Request) -> httpx.Response:
         raise httpx.ConnectError("boom")
 
     _client_returning(monkeypatch, handler)
     with pytest.raises(APIError):
-        webpage._fetch_html("https://example.com/post")
+        webpage._fetch("https://example.com/post")
 
 
 def test_extract_strips_boilerplate_and_comments_and_reads_title():
@@ -106,7 +106,12 @@ def test_fetch_article_rejects_non_http_url():
 
 
 def test_fetch_article_returns_extracted_text_and_title(monkeypatch):
-    monkeypatch.setattr(webpage, "_fetch_html", lambda url: ARTICLE_HTML)
+    _client_returning(
+        monkeypatch,
+        lambda request: httpx.Response(
+            200, text=ARTICLE_HTML, headers={"content-type": "text/html; charset=utf-8"}
+        ),
+    )
     article = webpage.fetch_article("https://example.com/post")
     assert "first real paragraph of the article body" in article.text
     assert article.title == "The Real Headline"
@@ -115,7 +120,101 @@ def test_fetch_article_returns_extracted_text_and_title(monkeypatch):
 
 def test_fetch_article_without_readable_text_is_a_usage_error(monkeypatch):
     # A page trafilatura can't extract an article from yields no text -> usage error.
-    monkeypatch.setattr(webpage, "_fetch_html", lambda url: "<html><body></body></html>")
+    _client_returning(
+        monkeypatch,
+        lambda request: httpx.Response(
+            200, text="<html><body></body></html>", headers={"content-type": "text/html"}
+        ),
+    )
     with pytest.raises(UsageError) as exc:
         webpage.fetch_article("https://example.com/empty")
     assert "Couldn't find readable text" in exc.value.message
+    # The HTML-specific hint, not the scanned-PDF one.
+    assert "paywalled" in (exc.value.suggestion or "")
+
+
+def _make_pdf(body_text: str, *, title: str | None = None) -> bytes:
+    """Build a minimal one-page PDF whose content stream shows ``body_text``.
+
+    A truthy ``title`` is stored as ``/Title`` in a trailing ``/Info`` object. Byte
+    offsets for the xref table are recorded as the file is assembled so pypdf reads
+    it without falling back to recovery — enough of a real PDF for extraction tests.
+    """
+    content = b"BT /F1 24 Tf 72 120 Td (" + body_text.encode("latin-1") + b") Tj ET"
+    info = b"<< /Title (" + title.encode("latin-1") + b") >>" if title else None
+    page = (
+        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 300 200] /Contents 4 0 R "
+        b"/Resources << /Font << /F1 5 0 R >> >> >>"
+    )
+    objects = [
+        b"<< /Type /Catalog /Pages 2 0 R >>",  # 1: catalog
+        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",  # 2: page tree
+        page,  # 3: the single page
+        b"<< /Length %d >>\nstream\n%s\nendstream" % (len(content), content),  # 4: contents
+        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",  # 5: font
+    ]
+    if info is not None:
+        objects.append(info)
+    out = bytearray(b"%PDF-1.4\n")
+    offsets: list[int] = []
+    for i, obj in enumerate(objects, start=1):
+        offsets.append(len(out))  # byte position where object i begins
+        out += b"%d 0 obj\n%s\nendobj\n" % (i, obj)
+    xref_pos = len(out)  # the startxref value written below
+    out += b"xref\n0 %d\n0000000000 65535 f \n" % (len(objects) + 1)  # object 0: free entry
+    for off in offsets:
+        out += b"%010d 00000 n \n" % off  # each entry is exactly 20 bytes
+    trailer = b"<< /Size %d /Root 1 0 R" % (len(objects) + 1)
+    if info is not None:
+        trailer += b" /Info %d 0 R" % len(objects)  # info was appended last
+    trailer += b" >>"
+    out += b"trailer\n%s\nstartxref\n%d\n%%%%EOF" % (trailer, xref_pos)  # %%%% -> %%
+    return bytes(out)
+
+
+def _pdf_response(data: bytes, content_type: str = "application/pdf") -> httpx.Response:
+    return httpx.Response(200, headers={"content-type": content_type}, content=data)
+
+
+def test_is_pdf_detects_by_content_type_and_magic_bytes():
+    # The %PDF- signature alone qualifies, and so does the Content-Type alone...
+    assert webpage._is_pdf(b"%PDF-1.7\n...", "application/octet-stream")
+    assert webpage._is_pdf(b"not a pdf", "application/pdf; charset=binary")
+    # ...whereas an HTML response carries neither signal.
+    assert not webpage._is_pdf(b"<html></html>", "text/html; charset=utf-8")
+
+
+def test_fetch_article_extracts_pdf_text_and_title(monkeypatch):
+    report_url = "https://example.com/report"
+    payload = _make_pdf("Hello from the PDF body text", title="A PDF Report")
+    _client_returning(monkeypatch, lambda request: _pdf_response(payload))
+    result = webpage.fetch_article(report_url)
+    assert "Hello from the PDF body text" in result.text
+    assert (result.title, result.url) == ("A PDF Report", report_url)
+
+
+def test_fetch_article_dispatches_pdf_by_magic_bytes_despite_generic_type(monkeypatch):
+    # Content-Type says octet-stream, so only the %PDF- signature can route to pypdf.
+    payload = _make_pdf("Magic-byte routed body")
+    _client_returning(
+        monkeypatch, lambda request: _pdf_response(payload, "application/octet-stream")
+    )
+    assert "Magic-byte routed body" in webpage.fetch_article("https://example.com/x").text
+
+
+def test_fetch_article_scanned_pdf_without_text_is_a_usage_error(monkeypatch):
+    # An empty content stream stands in for a scan: no text layer, so expect the OCR hint.
+    blank_pdf = _make_pdf("")
+    _client_returning(monkeypatch, lambda request: _pdf_response(blank_pdf))
+    with pytest.raises(UsageError) as caught:
+        webpage.fetch_article("https://example.com/scanned.pdf")
+    assert "scanned" in (caught.value.suggestion or "")
+    assert "Couldn't find readable text" in caught.value.message
+
+
+def test_fetch_article_corrupt_pdf_is_a_usage_error(monkeypatch):
+    garbage = b"%PDF-1.4\nnot really a pdf"  # carries the signature, but unparseable
+    _client_returning(monkeypatch, lambda request: _pdf_response(garbage))
+    with pytest.raises(UsageError) as caught:
+        webpage.fetch_article("https://example.com/broken.pdf")
+    assert "PDF" in caught.value.message
diff --git a/uv.lock b/uv.lock
index e73c0e56..53e2d924 100644
--- a/uv.lock
+++ b/uv.lock
@@ -31,6 +31,7 @@ dependencies = [
     { name = "packaging" },
     { name = "platformdirs" },
     { name = "pydantic" },
+    { name = "pypdf" },
     { name = "questionary" },
     { name = "rich" },
     { name = "sounddevice" },
@@ -85,6 +86,7 @@ requires-dist = [
     { name = "packaging", specifier = ">=24.0" },
     { name = "platformdirs", specifier = ">=4.10.0" },
     { name = "pydantic", specifier = ">=2.13.4" },
+    { name = "pypdf", specifier = ">=5.1.0" },
     { name = "questionary", specifier = ">=2.0.1" },
     { name = "rich", specifier = ">=15.0.0" },
     { name = "sounddevice", specifier = ">=0.5.5" },
@@ -1648,6 +1650,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
 ]
 
+[[package]]
+name = "pypdf"
+version = "6.13.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/99/0a/48fe05c6bb3aa4bb4d2a4079a383d33c0dfec1edf613a642f07d8b8b5c2e/pypdf-6.13.2.tar.gz", hash = "sha256:5a96a17dbdfbf9c2ab24c0a13fa0aba182be22ba6f283098712c16fc242f509f", size = 6479250, upload-time = "2026-06-10T16:42:34.5Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/17/378943705992f74e451a06de3401ce68e3213763c81e44d0614559c45599/pypdf-6.13.2-py3-none-any.whl", hash = "sha256:6eeb9e57693f29d41bd01255d02660cbbb41fd7fc818a982677389a35e4f2083", size = 346555, upload-time = "2026-06-10T16:42:32.37Z" },
+]
+
 [[package]]
 name = "pyright"
 version = "1.1.410"