Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aai_cli/commands/speak/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def speak(
url: str | None = typer.Option(
None,
"--url",
help="Read a web page aloud: fetch the URL and narrate its main text "
help="Read a web page or PDF aloud: fetch the URL and narrate its main text "
"(boilerplate stripped). Mutually exclusive with the text argument",
),
voice: list[str] = typer.Option(
Expand Down Expand Up @@ -82,7 +82,7 @@ def speak(
) -> None:
r"""\[sandbox] Synthesize speech from text with AssemblyAI streaming TTS

Reads text from the argument, piped stdin, or a web page with --url
Reads text from the argument, piped stdin, or a web page or PDF with --url
(its main content is extracted and the boilerplate stripped). Plays the
audio through your speakers by default, or writes a WAV with --out.
Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
Expand Down
77 changes: 60 additions & 17 deletions aai_cli/core/webpage.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Fetch a web page and extract its main article text.
"""Fetch a web page (or PDF) and extract its main readable text.

Backs ``assembly speak --url``: httpx2 (the project's pinned client) fetches the
HTML and trafilatura strips the boilerplate — nav, sidebars, cookie banners,
footers, comment threads — down to the readable article body, so text-to-speech
narrates the piece rather than the page chrome. trafilatura (and its lxml
backend) is the heavy import, so it is deferred to call time to stay off the
resource, then the body is narrowed to the readable text. For HTML, trafilatura
strips the boilerplate — nav, sidebars, cookie banners, footers, comment threads
— down to the article body; for a PDF (detected by Content-Type or the ``%PDF-``
magic bytes) pypdf pulls the text layer out of every page. Either way text-to-speech
narrates the piece rather than the page chrome. trafilatura (and its lxml backend)
and pypdf are the heavy imports, so both are deferred to call time to stay off the
CLI's startup path.
"""

Expand All @@ -20,40 +22,56 @@
_TIMEOUT = 30.0 # pragma: no mutate -- request timeout; nothing observable to assert
# Browser-like UA: some sites serve a stub or block page to unknown clients.
_USER_AGENT = "Mozilla/5.0 (compatible; assembly-cli; +https://www.assemblyai.com)"
# Every PDF begins with this signature; the robust signal when a server mislabels
# the Content-Type (e.g. application/octet-stream) or the URL has no .pdf suffix.
_PDF_MAGIC = b"%PDF-"


@dataclass(frozen=True)
class Article:
"""The readable content extracted from a web page."""
"""The readable content extracted from a web page or PDF."""

text: str
title: str | None
url: str


def fetch_article(url: str) -> Article:
"""Fetch ``url`` and return its main article text with boilerplate removed.
"""Fetch ``url`` and return its main readable text with boilerplate removed.

Raises a :class:`UsageError` when ``url`` isn't an http(s) address or the
page yields no readable text, and an :class:`APIError` when the fetch itself
HTML pages go through trafilatura; PDFs go through pypdf. Raises a
:class:`UsageError` when ``url`` isn't an http(s) address or the resource
yields no readable text, and an :class:`APIError` when the fetch itself
fails (DNS, timeout, non-2xx).
"""
if not url.startswith(("http://", "https://")):
raise UsageError(
f"Not a web page URL: {url}",
suggestion="Pass an http(s) URL, e.g. assembly speak --url https://example.com/post.",
)
text, title = _extract(_fetch_html(url))
if not text:
raise UsageError(
f"Couldn't find readable text at {url}.",
suggestion="The page may be paywalled, JavaScript-rendered, or not an article.",
response = _fetch(url)
content_type = response.headers.get("content-type", "").lower()
data = response.content
if _is_pdf(data, content_type):
text, title = _extract_pdf(data)
empty_hint = (
"The PDF may be scanned or image-only — there's no text layer to read "
"(that needs OCR, which speak doesn't do)."
)
else:
text, title = _extract(response.text)
empty_hint = "The page may be paywalled, JavaScript-rendered, or not an article."
if not text:
raise UsageError(f"Couldn't find readable text at {url}.", suggestion=empty_hint)
return Article(text=text, title=title, url=url)


def _fetch_html(url: str) -> str:
"""GET the raw HTML for ``url``, mapping any network/HTTP failure to APIError."""
def _fetch(url: str) -> httpx.Response:
"""GET ``url``, mapping any network/HTTP failure to APIError.

Returns the fully-read response so the caller can read it as text (HTML) or
bytes (PDF) depending on the content type.
"""
try:
with httpx.Client(
timeout=_TIMEOUT,
Expand All @@ -62,11 +80,16 @@ def _fetch_html(url: str) -> str:
) as client:
response = client.get(url)
response.raise_for_status()
return response.text
return response
except httpx.HTTPError as exc:
raise APIError(f"Couldn't fetch {url}: {exc}") from exc


def _is_pdf(data: bytes, content_type: str) -> bool:
    """Report whether a fetched body should be handled as a PDF.

    Either signal is sufficient on its own: a Content-Type that mentions
    ``application/pdf``, or a body that begins with the ``%PDF-`` signature.
    """
    if "application/pdf" in content_type:
        return True
    return data.startswith(_PDF_MAGIC)


def _extract(html: str) -> tuple[str | None, str | None]:
"""Pull the main text and title out of ``html`` (trafilatura, imported lazily)."""
import trafilatura
Expand All @@ -81,3 +104,23 @@ def _extract(html: str) -> tuple[str | None, str | None]:
)
title = getattr(trafilatura.extract_metadata(html), "title", None)
return text, title


def _extract_pdf(data: bytes) -> tuple[str | None, str | None]:
    """Pull the text layer and title out of a PDF (pypdf, imported lazily).

    ``data`` is the raw PDF body. Returns ``(text, title)``: the non-empty page
    texts joined by blank lines (``None`` when no page yields any text) and the
    title from the document metadata (``None`` when there is none). Raises a
    :class:`UsageError` when pypdf reports an error while opening the file,
    extracting its pages, or reading its metadata.
    """
    from io import BytesIO

    from pypdf import PdfReader
    from pypdf.errors import PyPdfError

    try:
        reader = PdfReader(BytesIO(data))
        pages = [page.extract_text() for page in reader.pages]
        # Metadata is resolved from the file on access, so a damaged document
        # can fail here too; keep it under the same handler so the caller sees
        # a UsageError rather than a raw pypdf exception.
        title = getattr(reader.metadata, "title", None)
    except PyPdfError as exc:
        raise UsageError(
            f"Couldn't read the PDF at the URL: {exc}",
            suggestion="The file may be encrypted, corrupt, or not a valid PDF.",
        ) from exc
    # Skip pages with no text so they don't add stray blank paragraphs.
    text = "\n\n".join(page for page in pages if page).strip() or None
    return text, title
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ dependencies = [
# lazily). Strips boilerplate down to the readable body; ships prebuilt wheels
# (lxml included), so it adds no source-compile step to Homebrew bottling.
"trafilatura>=2.1.0",
# PDF text extraction for `assembly speak --url` when the URL serves a PDF
# (webpage.py, imported lazily). Pure-Python, permissively licensed, ships a
# universal wheel, so it adds no source-compile step to Homebrew bottling.
"pypdf>=5.1.0",
]

[project.urls]
Expand Down
12 changes: 6 additions & 6 deletions tests/__snapshots__/test_snapshots_help_run.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@

[sandbox] Synthesize speech from text with AssemblyAI streaming TTS

Reads text from the argument, piped stdin, or a web page with --url
Reads text from the argument, piped stdin, or a web page or PDF with --url
(its main content is extracted and the boilerplate stripped). Plays the
audio through your speakers by default, or writes a WAV with --out.
Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
Expand All @@ -718,11 +718,11 @@
│ text [TEXT] Text to speak. Omit to read from stdin. │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --url TEXT Read a web page aloud: fetch
│ the URL and narrate its main
│ text (boilerplate stripped).
│ Mutually exclusive with the
text argument
│ --url TEXT Read a web page or PDF aloud:
fetch the URL and narrate its │
main text (boilerplate
stripped). Mutually exclusive │
with the text argument
│ --voice TEXT Voice id (e.g. jane, michael, │
│ mary, paul, eve, george), or │
│ SPEAKER=VOICE for diarized │
Expand Down
119 changes: 109 additions & 10 deletions tests/test_webpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,20 @@ def test_article_is_immutable():
setattr(article, field_name, "tampered")


def test_fetch_html_returns_body_and_sends_browser_user_agent(monkeypatch):
def test_fetch_returns_body_and_sends_browser_user_agent(monkeypatch):
seen: dict[str, str] = {}

def handler(request: httpx.Request) -> httpx.Response:
seen["ua"] = request.headers["user-agent"]
return httpx.Response(200, text="<html>ok</html>")

_client_returning(monkeypatch, handler)
assert webpage._fetch_html("https://example.com/post") == "<html>ok</html>"
assert webpage._fetch("https://example.com/post").text == "<html>ok</html>"
# The browser-like UA is sent so sites don't serve a stub/block page.
assert "assembly-cli" in seen["ua"]


def test_fetch_html_follows_redirects(monkeypatch):
def test_fetch_follows_redirects(monkeypatch):
# A 301 must be followed to the final 200; without follow_redirects the
# client would return the empty 301 body instead of the article.
def handler(request: httpx.Request) -> httpx.Response:
Expand All @@ -67,23 +67,23 @@ def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(200, text="final body")

_client_returning(monkeypatch, handler)
assert webpage._fetch_html("https://example.com/start") == "final body"
assert webpage._fetch("https://example.com/start").text == "final body"


def test_fetch_html_non_2xx_becomes_api_error(monkeypatch):
def test_fetch_non_2xx_becomes_api_error(monkeypatch):
_client_returning(monkeypatch, lambda request: httpx.Response(404, text="nope"))
with pytest.raises(APIError) as exc:
webpage._fetch_html("https://example.com/missing")
webpage._fetch("https://example.com/missing")
assert "https://example.com/missing" in exc.value.message


def test_fetch_html_connect_error_becomes_api_error(monkeypatch):
def test_fetch_connect_error_becomes_api_error(monkeypatch):
def handler(request: httpx.Request) -> httpx.Response:
raise httpx.ConnectError("boom")

_client_returning(monkeypatch, handler)
with pytest.raises(APIError):
webpage._fetch_html("https://example.com/post")
webpage._fetch("https://example.com/post")


def test_extract_strips_boilerplate_and_comments_and_reads_title():
Expand All @@ -106,7 +106,12 @@ def test_fetch_article_rejects_non_http_url():


def test_fetch_article_returns_extracted_text_and_title(monkeypatch):
monkeypatch.setattr(webpage, "_fetch_html", lambda url: ARTICLE_HTML)
_client_returning(
monkeypatch,
lambda request: httpx.Response(
200, text=ARTICLE_HTML, headers={"content-type": "text/html; charset=utf-8"}
),
)
article = webpage.fetch_article("https://example.com/post")
assert "first real paragraph of the article body" in article.text
assert article.title == "The Real Headline"
Expand All @@ -115,7 +120,101 @@ def test_fetch_article_returns_extracted_text_and_title(monkeypatch):

def test_fetch_article_without_readable_text_is_a_usage_error(monkeypatch):
# A page trafilatura can't extract an article from yields no text -> usage error.
monkeypatch.setattr(webpage, "_fetch_html", lambda url: "<html><body></body></html>")
_client_returning(
monkeypatch,
lambda request: httpx.Response(
200, text="<html><body></body></html>", headers={"content-type": "text/html"}
),
)
with pytest.raises(UsageError) as exc:
webpage.fetch_article("https://example.com/empty")
assert "Couldn't find readable text" in exc.value.message
# The HTML-specific hint, not the scanned-PDF one.
assert "paywalled" in (exc.value.suggestion or "")


def _make_pdf(body_text: str, *, title: str | None = None) -> bytes:
    """Build a minimal one-page PDF whose content stream shows ``body_text``.

    Offsets in the xref table are computed as the file is assembled so pypdf reads
    it without falling back to recovery — enough of a real PDF to exercise the
    text-layer extraction path end to end. ``title``, when given, is stored in an
    Info dictionary referenced from the trailer. Both strings must be Latin-1
    encodable; parentheses and backslashes in them are escaped so the PDF literal
    strings stay well-formed.
    """

    def _literal(value: str) -> bytes:
        """Encode ``value`` as the inside of a PDF literal string."""
        raw = value.encode("latin-1")
        # Backslash first, so the escapes added for the parentheses aren't doubled.
        return raw.replace(b"\\", b"\\\\").replace(b"(", b"\\(").replace(b")", b"\\)")

    content = b"BT /F1 24 Tf 72 120 Td (" + _literal(body_text) + b") Tj ET"
    info = b"<< /Title (" + _literal(title) + b") >>" if title else None
    page = (
        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 300 200] /Contents 4 0 R "
        b"/Resources << /Font << /F1 5 0 R >> >> >>"
    )
    # Object numbers are positional (1-based): catalog, page tree, page,
    # content stream, font, then the optional Info dictionary.
    objects = [
        b"<< /Type /Catalog /Pages 2 0 R >>",
        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        page,
        b"<< /Length %d >>\nstream\n%s\nendstream" % (len(content), content),
        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    ]
    if info is not None:
        objects.append(info)

    out = bytearray(b"%PDF-1.4\n")
    offsets: list[int] = []
    for number, obj in enumerate(objects, start=1):
        # Record where each object starts before writing it, for the xref table.
        offsets.append(len(out))
        out += b"%d 0 obj\n%s\nendobj\n" % (number, obj)

    xref_pos = len(out)
    # Entry 0 is the mandatory free-list head, hence the +1 on the count.
    out += b"xref\n0 %d\n0000000000 65535 f \n" % (len(objects) + 1)
    for off in offsets:
        out += b"%010d 00000 n \n" % off

    trailer = b"<< /Size %d /Root 1 0 R" % (len(objects) + 1)
    if info is not None:
        # The Info dictionary was appended last, so its number is len(objects).
        trailer += b" /Info %d 0 R" % len(objects)
    trailer += b" >>"
    out += b"trailer\n%s\nstartxref\n%d\n%%%%EOF" % (trailer, xref_pos)
    return bytes(out)


def _pdf_response(data: bytes, content_type: str = "application/pdf") -> httpx.Response:
    """Build a 200 response carrying ``data`` under the given Content-Type."""
    headers = {"content-type": content_type}
    return httpx.Response(200, content=data, headers=headers)


def test_is_pdf_detects_by_content_type_and_magic_bytes():
    """``_is_pdf`` accepts either signal on its own and rejects plain HTML."""
    # The Content-Type alone is enough, even when the body doesn't look like a PDF.
    by_header = webpage._is_pdf(b"not a pdf", "application/pdf; charset=binary")
    assert by_header
    # The magic bytes alone are enough, even under a generic Content-Type.
    by_signature = webpage._is_pdf(b"%PDF-1.7\n...", "application/octet-stream")
    assert by_signature
    # An HTML response carries neither signal.
    html_flagged = webpage._is_pdf(b"<html></html>", "text/html; charset=utf-8")
    assert not html_flagged


def test_fetch_article_extracts_pdf_text_and_title(monkeypatch):
    """A PDF response yields its text layer, its metadata title, and the URL."""
    target = "https://example.com/report"
    document = _make_pdf("Hello from the PDF body text", title="A PDF Report")
    _client_returning(monkeypatch, lambda request: _pdf_response(document))

    result = webpage.fetch_article(target)

    assert "Hello from the PDF body text" in result.text
    assert result.title == "A PDF Report"
    assert result.url == target


def test_fetch_article_dispatches_pdf_by_magic_bytes_despite_generic_type(monkeypatch):
    """A PDF mislabeled as octet-stream is still routed down the PDF path."""
    document = _make_pdf("Magic-byte routed body")

    def handler(request: httpx.Request) -> httpx.Response:
        # The server's Content-Type gives no hint that this is a PDF.
        return _pdf_response(document, content_type="application/octet-stream")

    _client_returning(monkeypatch, handler)
    article = webpage.fetch_article("https://example.com/x")
    assert "Magic-byte routed body" in article.text


def test_fetch_article_scanned_pdf_without_text_is_a_usage_error(monkeypatch):
    """A PDF with no text layer is a usage error carrying the OCR-shaped hint."""
    # An empty content string stands in for an image-only (scanned) PDF.
    textless = _make_pdf("")
    _client_returning(monkeypatch, lambda request: _pdf_response(textless))

    with pytest.raises(UsageError) as caught:
        webpage.fetch_article("https://example.com/scanned.pdf")

    error = caught.value
    assert "Couldn't find readable text" in error.message
    assert "scanned" in (error.suggestion or "")


def test_fetch_article_corrupt_pdf_is_a_usage_error(monkeypatch):
    """A body that passes the ``%PDF-`` check but won't parse is a usage error."""
    broken = b"%PDF-1.4\nnot really a pdf"
    _client_returning(monkeypatch, lambda request: _pdf_response(broken))

    with pytest.raises(UsageError) as caught:
        webpage.fetch_article("https://example.com/broken.pdf")

    assert "PDF" in caught.value.message
11 changes: 11 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading