From 3803309ae1dad9b3363ea35cf1a884b83a17119b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 15:04:48 +0000 Subject: [PATCH] speak: narrate PDFs via --url, not just HTML pages assembly speak --url now handles PDF URLs in addition to HTML. The fetch reads the full response and dispatches on content type: PDFs (detected by Content-Type or the %PDF- magic bytes, so a mislabeled octet-stream still routes correctly) go through pypdf's text-layer extraction; HTML keeps the trafilatura boilerplate-stripping path. A scanned/image-only PDF (no text layer) and an unparseable PDF both surface a clear UsageError. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_015Z1o33Ezt9aznmePd4Jc9R --- aai_cli/commands/speak/__init__.py | 4 +- aai_cli/core/webpage.py | 77 +++++++++--- pyproject.toml | 4 + .../test_snapshots_help_run.ambr | 12 +- tests/test_webpage.py | 119 ++++++++++++++++-- uv.lock | 11 ++ 6 files changed, 192 insertions(+), 35 deletions(-) diff --git a/aai_cli/commands/speak/__init__.py b/aai_cli/commands/speak/__init__.py index f5679515..9df21b4a 100644 --- a/aai_cli/commands/speak/__init__.py +++ b/aai_cli/commands/speak/__init__.py @@ -54,7 +54,7 @@ def speak( url: str | None = typer.Option( None, "--url", - help="Read a web page aloud: fetch the URL and narrate its main text " + help="Read a web page or PDF aloud: fetch the URL and narrate its main text " "(boilerplate stripped). Mutually exclusive with the text argument", ), voice: list[str] = typer.Option( @@ -82,7 +82,7 @@ def speak( ) -> None: r"""\[sandbox] Synthesize speech from text with AssemblyAI streaming TTS - Reads text from the argument, piped stdin, or a web page with --url + Reads text from the argument, piped stdin, or a web page or PDF with --url (its main content is extracted and the boilerplate stripped). Plays the audio through your speakers by default, or writes a WAV with --out. Speaker-labeled input (from 'assembly transcribe --speaker-labels') is diff --git a/aai_cli/core/webpage.py b/aai_cli/core/webpage.py index 244df6ee..f711ff70 100644 --- a/aai_cli/core/webpage.py +++ b/aai_cli/core/webpage.py @@ -1,10 +1,12 @@ -"""Fetch a web page and extract its main article text. +"""Fetch a web page (or PDF) and extract its main readable text. Backs ``assembly speak --url``: httpx2 (the project's pinned client) fetches the -HTML and trafilatura strips the boilerplate — nav, sidebars, cookie banners, -footers, comment threads — down to the readable article body, so text-to-speech -narrates the piece rather than the page chrome. trafilatura (and its lxml -backend) is the heavy import, so it is deferred to call time to stay off the +resource, then the body is narrowed to the readable text. For HTML, trafilatura +strips the boilerplate — nav, sidebars, cookie banners, footers, comment threads +— down to the article body; for a PDF (detected by Content-Type or the ``%PDF-`` +magic bytes) pypdf pulls the text layer out of every page. Either way text-to-speech +narrates the piece rather than the page chrome. trafilatura (and its lxml backend) +and pypdf are the heavy imports, so both are deferred to call time to stay off the CLI's startup path. """ @@ -20,11 +22,14 @@ _TIMEOUT = 30.0 # pragma: no mutate -- request timeout; nothing observable to assert # Browser-like UA: some sites serve a stub or block page to unknown clients. _USER_AGENT = "Mozilla/5.0 (compatible; assembly-cli; +https://www.assemblyai.com)" +# Every PDF begins with this signature; the robust signal when a server mislabels +# the Content-Type (e.g. application/octet-stream) or the URL has no .pdf suffix. +_PDF_MAGIC = b"%PDF-" @dataclass(frozen=True) class Article: - """The readable content extracted from a web page.""" + """The readable content extracted from a web page or PDF.""" text: str title: str | None @@ -32,10 +37,11 @@ class Article: def fetch_article(url: str) -> Article: - """Fetch ``url`` and return its main article text with boilerplate removed. + """Fetch ``url`` and return its main readable text with boilerplate removed. - Raises a :class:`UsageError` when ``url`` isn't an http(s) address or the - page yields no readable text, and an :class:`APIError` when the fetch itself + HTML pages go through trafilatura; PDFs go through pypdf. Raises a + :class:`UsageError` when ``url`` isn't an http(s) address or the resource + yields no readable text, and an :class:`APIError` when the fetch itself fails (DNS, timeout, non-2xx). """ if not url.startswith(("http://", "https://")): @@ -43,17 +49,29 @@ def fetch_article(url: str) -> Article: f"Not a web page URL: {url}", suggestion="Pass an http(s) URL, e.g. assembly speak --url https://example.com/post.", ) - text, title = _extract(_fetch_html(url)) - if not text: - raise UsageError( - f"Couldn't find readable text at {url}.", - suggestion="The page may be paywalled, JavaScript-rendered, or not an article.", + response = _fetch(url) + content_type = response.headers.get("content-type", "").lower() + data = response.content + if _is_pdf(data, content_type): + text, title = _extract_pdf(data) + empty_hint = ( + "The PDF may be scanned or image-only — there's no text layer to read " + "(that needs OCR, which speak doesn't do)." ) + else: + text, title = _extract(response.text) + empty_hint = "The page may be paywalled, JavaScript-rendered, or not an article." + if not text: + raise UsageError(f"Couldn't find readable text at {url}.", suggestion=empty_hint) return Article(text=text, title=title, url=url) -def _fetch_html(url: str) -> str: - """GET the raw HTML for ``url``, mapping any network/HTTP failure to APIError.""" +def _fetch(url: str) -> httpx.Response: + """GET ``url``, mapping any network/HTTP failure to APIError. + + Returns the fully-read response so the caller can read it as text (HTML) or + bytes (PDF) depending on the content type. + """ try: with httpx.Client( timeout=_TIMEOUT, @@ -62,11 +80,16 @@ def _fetch_html(url: str) -> str: ) as client: response = client.get(url) response.raise_for_status() - return response.text + return response except httpx.HTTPError as exc: raise APIError(f"Couldn't fetch {url}: {exc}") from exc +def _is_pdf(data: bytes, content_type: str) -> bool: + """True when ``data`` is a PDF, by Content-Type or the ``%PDF-`` magic bytes.""" + return "application/pdf" in content_type or data.startswith(_PDF_MAGIC) + + def _extract(html: str) -> tuple[str | None, str | None]: """Pull the main text and title out of ``html`` (trafilatura, imported lazily).""" import trafilatura @@ -81,3 +104,23 @@ def _extract(html: str) -> tuple[str | None, str | None]: ) title = getattr(trafilatura.extract_metadata(html), "title", None) return text, title + + +def _extract_pdf(data: bytes) -> tuple[str | None, str | None]: + """Pull the text layer and title out of a PDF (pypdf, imported lazily).""" + from io import BytesIO + + from pypdf import PdfReader + from pypdf.errors import PyPdfError + + try: + reader = PdfReader(BytesIO(data)) + pages = [page.extract_text() for page in reader.pages] + except PyPdfError as exc: + raise UsageError( + f"Couldn't read the PDF at the URL: {exc}", + suggestion="The file may be encrypted, corrupt, or not a valid PDF.", + ) from exc + text = "\n\n".join(page for page in pages if page).strip() or None + title = getattr(reader.metadata, "title", None) + return text, title diff --git a/pyproject.toml b/pyproject.toml index 78404223..609b661f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,10 @@ dependencies = [ # lazily). Strips boilerplate down to the readable body; ships prebuilt wheels # (lxml included), so it adds no source-compile step to Homebrew bottling. "trafilatura>=2.1.0", + # PDF text extraction for `assembly speak --url` when the URL serves a PDF + # (webpage.py, imported lazily). Pure-Python, permissively licensed, ships a + # universal wheel, so it adds no source-compile step to Homebrew bottling. + "pypdf>=5.1.0", ] [project.urls] diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index e9a69d92..ecd8d601 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -706,7 +706,7 @@ [sandbox] Synthesize speech from text with AssemblyAI streaming TTS - Reads text from the argument, piped stdin, or a web page with --url + Reads text from the argument, piped stdin, or a web page or PDF with --url (its main content is extracted and the boilerplate stripped). Plays the audio through your speakers by default, or writes a WAV with --out. Speaker-labeled input (from 'assembly transcribe --speaker-labels') is @@ -718,11 +718,11 @@ │ text [TEXT] Text to speak. Omit to read from stdin. │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ - │ --url TEXT Read a web page aloud: fetch │ - │ the URL and narrate its main │ - │ text (boilerplate stripped). │ - │ Mutually exclusive with the │ - │ text argument │ + │ --url TEXT Read a web page or PDF aloud: │ + │ fetch the URL and narrate its │ + │ main text (boilerplate │ + │ stripped). Mutually exclusive │ + │ with the text argument │ │ --voice TEXT Voice id (e.g. jane, michael, │ │ mary, paul, eve, george), or │ │ SPEAKER=VOICE for diarized │ diff --git a/tests/test_webpage.py b/tests/test_webpage.py index 893c7b81..2c18c46c 100644 --- a/tests/test_webpage.py +++ b/tests/test_webpage.py @@ -45,7 +45,7 @@ def test_article_is_immutable(): setattr(article, field_name, "tampered") -def test_fetch_html_returns_body_and_sends_browser_user_agent(monkeypatch): +def test_fetch_returns_body_and_sends_browser_user_agent(monkeypatch): seen: dict[str, str] = {} def handler(request: httpx.Request) -> httpx.Response: @@ -53,12 +53,12 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, text="ok") _client_returning(monkeypatch, handler) - assert webpage._fetch_html("https://example.com/post") == "ok" + assert webpage._fetch("https://example.com/post").text == "ok" # The browser-like UA is sent so sites don't serve a stub/block page. assert "assembly-cli" in seen["ua"] -def test_fetch_html_follows_redirects(monkeypatch): +def test_fetch_follows_redirects(monkeypatch): # A 301 must be followed to the final 200; without follow_redirects the # client would return the empty 301 body instead of the article. def handler(request: httpx.Request) -> httpx.Response: @@ -67,23 +67,23 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, text="final body") _client_returning(monkeypatch, handler) - assert webpage._fetch_html("https://example.com/start") == "final body" + assert webpage._fetch("https://example.com/start").text == "final body" -def test_fetch_html_non_2xx_becomes_api_error(monkeypatch): +def test_fetch_non_2xx_becomes_api_error(monkeypatch): _client_returning(monkeypatch, lambda request: httpx.Response(404, text="nope")) with pytest.raises(APIError) as exc: - webpage._fetch_html("https://example.com/missing") + webpage._fetch("https://example.com/missing") assert "https://example.com/missing" in exc.value.message -def test_fetch_html_connect_error_becomes_api_error(monkeypatch): +def test_fetch_connect_error_becomes_api_error(monkeypatch): def handler(request: httpx.Request) -> httpx.Response: raise httpx.ConnectError("boom") _client_returning(monkeypatch, handler) with pytest.raises(APIError): - webpage._fetch_html("https://example.com/post") + webpage._fetch("https://example.com/post") def test_extract_strips_boilerplate_and_comments_and_reads_title(): @@ -106,7 +106,12 @@ def test_fetch_article_rejects_non_http_url(): def test_fetch_article_returns_extracted_text_and_title(monkeypatch): - monkeypatch.setattr(webpage, "_fetch_html", lambda url: ARTICLE_HTML) + _client_returning( + monkeypatch, + lambda request: httpx.Response( + 200, text=ARTICLE_HTML, headers={"content-type": "text/html; charset=utf-8"} + ), + ) article = webpage.fetch_article("https://example.com/post") assert "first real paragraph of the article body" in article.text assert article.title == "The Real Headline" @@ -115,7 +120,101 @@ def test_fetch_article_returns_extracted_text_and_title(monkeypatch): def test_fetch_article_without_readable_text_is_a_usage_error(monkeypatch): # A page trafilatura can't extract an article from yields no text -> usage error. - monkeypatch.setattr(webpage, "_fetch_html", lambda url: "") + _client_returning( + monkeypatch, + lambda request: httpx.Response( + 200, text="", headers={"content-type": "text/html"} + ), + ) with pytest.raises(UsageError) as exc: webpage.fetch_article("https://example.com/empty") assert "Couldn't find readable text" in exc.value.message + # The HTML-specific hint, not the scanned-PDF one. + assert "paywalled" in (exc.value.suggestion or "") + + +def _make_pdf(body_text: str, *, title: str | None = None) -> bytes: + """Build a minimal one-page PDF whose content stream shows ``body_text``. + + Offsets in the xref table are computed as the file is assembled so pypdf reads + it without falling back to recovery — enough of a real PDF to exercise the + text-layer extraction path end to end. + """ + content = b"BT /F1 24 Tf 72 120 Td (" + body_text.encode("latin-1") + b") Tj ET" + info = b"<< /Title (" + title.encode("latin-1") + b") >>" if title else None + page = ( + b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 300 200] /Contents 4 0 R " + b"/Resources << /Font << /F1 5 0 R >> >> >>" + ) + objects = [ + b"<< /Type /Catalog /Pages 2 0 R >>", + b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>", + page, + b"<< /Length %d >>\nstream\n%s\nendstream" % (len(content), content), + b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>", + ] + if info is not None: + objects.append(info) + out = bytearray(b"%PDF-1.4\n") + offsets: list[int] = [] + for i, obj in enumerate(objects, start=1): + offsets.append(len(out)) + out += b"%d 0 obj\n%s\nendobj\n" % (i, obj) + xref_pos = len(out) + out += b"xref\n0 %d\n0000000000 65535 f \n" % (len(objects) + 1) + for off in offsets: + out += b"%010d 00000 n \n" % off + trailer = b"<< /Size %d /Root 1 0 R" % (len(objects) + 1) + if info is not None: + trailer += b" /Info %d 0 R" % len(objects) + trailer += b" >>" + out += b"trailer\n%s\nstartxref\n%d\n%%%%EOF" % (trailer, xref_pos) + return bytes(out) + + +def _pdf_response(data: bytes, content_type: str = "application/pdf") -> httpx.Response: + return httpx.Response(200, content=data, headers={"content-type": content_type}) + + +def test_is_pdf_detects_by_content_type_and_magic_bytes(): + # Either signal alone is sufficient... + assert webpage._is_pdf(b"not a pdf", "application/pdf; charset=binary") + assert webpage._is_pdf(b"%PDF-1.7\n...", "application/octet-stream") + # ...and an HTML response is not a PDF. + assert not webpage._is_pdf(b"", "text/html; charset=utf-8") + + +def test_fetch_article_extracts_pdf_text_and_title(monkeypatch): + pdf = _make_pdf("Hello from the PDF body text", title="A PDF Report") + _client_returning(monkeypatch, lambda request: _pdf_response(pdf)) + article = webpage.fetch_article("https://example.com/report") + assert "Hello from the PDF body text" in article.text + assert article.title == "A PDF Report" + assert article.url == "https://example.com/report" + + +def test_fetch_article_dispatches_pdf_by_magic_bytes_despite_generic_type(monkeypatch): + # A server mislabeling the PDF as octet-stream still takes the PDF path. + pdf = _make_pdf("Magic-byte routed body") + _client_returning( + monkeypatch, lambda request: _pdf_response(pdf, content_type="application/octet-stream") + ) + assert "Magic-byte routed body" in webpage.fetch_article("https://example.com/x").text + + +def test_fetch_article_scanned_pdf_without_text_is_a_usage_error(monkeypatch): + # An image-only PDF has no text layer -> usage error with the OCR-shaped hint. + pdf = _make_pdf("") + _client_returning(monkeypatch, lambda request: _pdf_response(pdf)) + with pytest.raises(UsageError) as exc: + webpage.fetch_article("https://example.com/scanned.pdf") + assert "Couldn't find readable text" in exc.value.message + assert "scanned" in (exc.value.suggestion or "") + + +def test_fetch_article_corrupt_pdf_is_a_usage_error(monkeypatch): + # Passes the %PDF- magic check but isn't a parseable PDF -> usage error. + _client_returning(monkeypatch, lambda request: _pdf_response(b"%PDF-1.4\nnot really a pdf")) + with pytest.raises(UsageError) as exc: + webpage.fetch_article("https://example.com/broken.pdf") + assert "PDF" in exc.value.message diff --git a/uv.lock b/uv.lock index e73c0e56..53e2d924 100644 --- a/uv.lock +++ b/uv.lock @@ -31,6 +31,7 @@ dependencies = [ { name = "packaging" }, { name = "platformdirs" }, { name = "pydantic" }, + { name = "pypdf" }, { name = "questionary" }, { name = "rich" }, { name = "sounddevice" }, @@ -85,6 +86,7 @@ requires-dist = [ { name = "packaging", specifier = ">=24.0" }, { name = "platformdirs", specifier = ">=4.10.0" }, { name = "pydantic", specifier = ">=2.13.4" }, + { name = "pypdf", specifier = ">=5.1.0" }, { name = "questionary", specifier = ">=2.0.1" }, { name = "rich", specifier = ">=15.0.0" }, { name = "sounddevice", specifier = ">=0.5.5" }, @@ -1648,6 +1650,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] +[[package]] +name = "pypdf" +version = "6.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/99/0a/48fe05c6bb3aa4bb4d2a4079a383d33c0dfec1edf613a642f07d8b8b5c2e/pypdf-6.13.2.tar.gz", hash = "sha256:5a96a17dbdfbf9c2ab24c0a13fa0aba182be22ba6f283098712c16fc242f509f", size = 6479250, upload-time = "2026-06-10T16:42:34.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/17/378943705992f74e451a06de3401ce68e3213763c81e44d0614559c45599/pypdf-6.13.2-py3-none-any.whl", hash = "sha256:6eeb9e57693f29d41bd01255d02660cbbb41fd7fc818a982677389a35e4f2083", size = 346555, upload-time = "2026-06-10T16:42:32.37Z" }, +] + [[package]] name = "pyright" version = "1.1.410"