Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions aai_cli/commands/speak/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
"Override a speaker's voice",
"… | assembly --sandbox speak --voice A=vera --voice B=paul",
),
(
"Read a web page aloud (boilerplate stripped)",
"assembly --sandbox speak --url https://example.com/post",
),
(
"Save to a WAV instead of playing",
'assembly --sandbox speak "Hello" --out /tmp/hello.wav',
Expand All @@ -47,6 +51,12 @@
def speak(
ctx: typer.Context,
text: str | None = typer.Argument(None, help="Text to speak. Omit to read from stdin."),
url: str | None = typer.Option(
None,
"--url",
help="Read a web page aloud: fetch the URL and narrate its main text "
"(boilerplate stripped). Mutually exclusive with the text argument",
),
voice: list[str] = typer.Option(
[],
"--voice",
Expand All @@ -72,16 +82,18 @@ def speak(
) -> None:
r"""\[sandbox] Synthesize speech from text with AssemblyAI streaming TTS

Plays the audio through your speakers by default, or writes a WAV with
--out. Speaker-labeled input (from 'assembly transcribe
--speaker-labels') is detected automatically: the labels are stripped
and each speaker gets a different voice. This feature only exists in
the sandbox today — run it as 'assembly --sandbox speak' (--sandbox
goes before the subcommand).
Reads text from the argument, piped stdin, or a web page with --url
(its main content is extracted and the boilerplate stripped). Plays the
audio through your speakers by default, or writes a WAV with --out.
Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
detected automatically: the labels are stripped and each speaker gets a
different voice. This feature only exists in the sandbox today — run it
as 'assembly --sandbox speak' (--sandbox goes before the subcommand).
"""

opts = speak_exec.SpeakOptions(
text=text,
url=url,
voice=voice,
language=language,
sample_rate=sample_rate,
Expand Down
20 changes: 17 additions & 3 deletions aai_cli/commands/speak/_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from pathlib import Path

from aai_cli.app.context import AppState
from aai_cli.core import signals, stdio
from aai_cli.core.errors import UsageError
from aai_cli.core import signals, stdio, webpage
from aai_cli.core.errors import UsageError, mutually_exclusive
from aai_cli.tts import audio, dialogue, session, voices
from aai_cli.ui import output

Expand All @@ -30,12 +30,26 @@ class SpeakOptions:
resolves it into the ``json_mode`` argument)."""

text: str | None
url: str | None
voice: list[str]
language: str
sample_rate: int | None
out: Path | None


def _resolve_input(opts: SpeakOptions) -> str:
    """Pick the text to synthesize for this invocation.

    The text argument and --url are first handed to ``mutually_exclusive`` so
    that supplying both is rejected. When --url is set, the page is fetched and
    its extracted article text is returned; ``_read_text`` is not consulted in
    that case, so piped stdin is never read. Otherwise the text comes from
    ``_read_text`` (the argument, or piped stdin when the argument is omitted).
    """
    mutually_exclusive(
        ("the text argument", opts.text),
        ("--url", opts.url),
        suggestion="Speak a URL or literal text, not both.",
    )
    if opts.url is None:
        return _read_text(opts.text)
    article = webpage.fetch_article(opts.url)
    return article.text


def _read_text(text: str | None) -> str:
"""The text to speak: the non-blank argument, or piped stdin when the argument
is omitted entirely. A *blank* argument (e.g. "") is a usage error, never a
Expand Down Expand Up @@ -179,7 +193,7 @@ def _speak_dialogue(
def run_speak(opts: SpeakOptions, state: AppState, *, json_mode: bool) -> None:
"""Execute one `assembly speak` invocation from already-parsed flags."""
session.require_available("speak")
spoken = _read_text(opts.text)
spoken = _resolve_input(opts)
api_key = state.resolve_api_key()
bare_voice, overrides = dialogue.parse_voice_overrides(opts.voice)
# SIGTERM aborts synthesis/playback the same way Ctrl-C does, so an external
Expand Down
83 changes: 83 additions & 0 deletions aai_cli/core/webpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Fetch a web page and extract its main article text.

Backs ``assembly speak --url``: httpx2 (the project's pinned client) fetches the
HTML and trafilatura strips the boilerplate — nav, sidebars, cookie banners,
footers, comment threads — down to the readable article body, so text-to-speech
narrates the piece rather than the page chrome. trafilatura (and its lxml
backend) is the heavy import, so it is deferred to call time to stay off the
CLI's startup path.
"""

from __future__ import annotations

from dataclasses import dataclass

import httpx2 as httpx

from aai_cli.core.errors import APIError, UsageError

# A page fetch shouldn't hang a TTS run; cap it.
_TIMEOUT = 30.0 # pragma: no mutate -- request timeout; nothing observable to assert
# Browser-like UA: some sites serve a stub or block page to unknown clients.
_USER_AGENT = "Mozilla/5.0 (compatible; assembly-cli; +https://www.assemblyai.com)"


@dataclass(frozen=True)
class Article:
"""The readable content extracted from a web page."""

text: str
title: str | None
url: str


def fetch_article(url: str) -> Article:
    """Fetch ``url`` and return its main article text with boilerplate removed.

    The scheme check is case-insensitive, because URL schemes are
    case-insensitive (RFC 3986, section 3.1): ``HTTPS://example.com/post`` is
    as valid as ``https://example.com/post``. The URL is passed on unchanged.

    Returns an :class:`Article` carrying the extracted text, the extracted
    title (``None`` when there is none) and ``url`` exactly as given.

    Raises a :class:`UsageError` when ``url`` isn't an http(s) address or the
    page yields no readable text, and an :class:`APIError` when the fetch itself
    fails (DNS, timeout, non-2xx).
    """
    # Compare against a lowercased copy only; the original spelling is what gets
    # fetched and echoed back in messages.
    if not url.lower().startswith(("http://", "https://")):
        raise UsageError(
            f"Not a web page URL: {url}",
            suggestion="Pass an http(s) URL, e.g. assembly speak --url https://example.com/post.",
        )
    text, title = _extract(_fetch_html(url))
    # The extractor yields None (or an empty string) when it finds no article body.
    if not text:
        raise UsageError(
            f"Couldn't find readable text at {url}.",
            suggestion="The page may be paywalled, JavaScript-rendered, or not an article.",
        )
    return Article(text=text, title=title, url=url)


def _fetch_html(url: str) -> str:
    """GET the raw HTML for ``url``, mapping any network/HTTP failure to APIError.

    Redirects are followed and the request carries the module's browser-like
    User-Agent and timeout. A non-2xx final response is turned into an error by
    ``raise_for_status``.

    ``InvalidURL`` is caught alongside ``HTTPError``: in httpx it is a separate
    exception that does not derive from ``HTTPError``, so a malformed address
    that still starts with ``http(s)://`` (bad port, broken IPv6 literal, ...)
    would otherwise escape as an unhandled traceback rather than an APIError.
    NOTE(review): assumes httpx2 exposes the same exception names as httpx --
    confirm.
    """
    try:
        with httpx.Client(
            timeout=_TIMEOUT,
            follow_redirects=True,
            headers={"User-Agent": _USER_AGENT},
        ) as client:
            response = client.get(url)
            response.raise_for_status()
            return response.text
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        raise APIError(f"Couldn't fetch {url}: {exc}") from exc

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

APIError includes the raw user-provided URL and exception in its message (f"Couldn't fetch {url}: {exc}"). Avoid embedding unsanitized URLs in error text; sanitize or redact before including in messages.

Details

✨ AI Reasoning
​The exception handler constructs an APIError embedding the requested URL and the HTTP exception (f"Couldn't fetch {url}: {exc}"). If these errors are logged or displayed, the raw URL (and possibly sensitive query strings) will be exposed and may allow log injection via crafted input.

🔧 How do I fix it?
Keep sensitive data such as emails, passwords, and tokens out of logs. When logging values tied to a user, prefer a safe identifier like a user ID over the raw input, and strip line breaks from any user-provided text you do log.

Reply @AikidoSec feedback: [FEEDBACK] to get better review comments in the future.
Reply @AikidoSec ignore: [REASON] to ignore this issue.
More info



def _extract(html: str) -> tuple[str | None, str | None]:
    """Return ``(main text, title)`` extracted from ``html``.

    trafilatura is imported here, at call time, rather than at module import.
    Either element of the returned pair may be ``None``.
    """
    import trafilatura

    body = trafilatura.extract(
        html,
        output_format="txt",
        # Don't narrate the comment thread. trafilatura's comment classifier keys
        # off real-world markup (Disqus, microformats), which synthetic test
        # fixtures can't reproduce, so this flag stays unasserted by the suite.
        include_comments=False,  # pragma: no mutate
    )
    metadata = trafilatura.extract_metadata(html)
    # getattr with a default: a metadata result without a ``title`` attribute
    # (None included) simply produces a None title.
    page_title = getattr(metadata, "title", None)
    return body, page_title
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ dependencies = [
# deps. We hand it already-fetched bytes (never a URL) so our bounded, safe
# httpx fetch stays the only network path.
"feedparser>=6.0.11",
# Web-page article extraction for `assembly speak --url` (webpage.py, imported
# lazily). Strips boilerplate down to the readable body; ships prebuilt wheels
# (lxml included), so it adds no source-compile step to Homebrew bottling.
"trafilatura>=2.1.0",
]

[project.urls]
Expand Down
20 changes: 14 additions & 6 deletions tests/__snapshots__/test_snapshots_help_run.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -659,17 +659,23 @@

[sandbox] Synthesize speech from text with AssemblyAI streaming TTS

Plays the audio through your speakers by default, or writes a WAV with
--out. Speaker-labeled input (from 'assembly transcribe
--speaker-labels') is detected automatically: the labels are stripped
and each speaker gets a different voice. This feature only exists in
the sandbox today — run it as 'assembly --sandbox speak' (--sandbox
goes before the subcommand).
Reads text from the argument, piped stdin, or a web page with --url
(its main content is extracted and the boilerplate stripped). Plays the
audio through your speakers by default, or writes a WAV with --out.
Speaker-labeled input (from 'assembly transcribe --speaker-labels') is
detected automatically: the labels are stripped and each speaker gets a
different voice. This feature only exists in the sandbox today — run it
as 'assembly --sandbox speak' (--sandbox goes before the subcommand).

╭─ Arguments ──────────────────────────────────────────────────────────────────╮
│ text [TEXT] Text to speak. Omit to read from stdin. │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --url TEXT Read a web page aloud: fetch │
│ the URL and narrate its main │
│ text (boilerplate stripped). │
│ Mutually exclusive with the │
│ text argument │
│ --voice TEXT Voice id (e.g. jane, michael, │
│ mary, paul, eve, george), or │
│ SPEAKER=VOICE for diarized │
Expand Down Expand Up @@ -699,6 +705,8 @@
$ assembly transcribe meeting.mp3 --speaker-labels | assembly --sandbox speak
Override a speaker's voice
$ … | assembly --sandbox speak --voice A=vera --voice B=paul
Read a web page aloud (boilerplate stripped)
$ assembly --sandbox speak --url https://example.com/post
Save to a WAV instead of playing
$ assembly --sandbox speak "Hello" --out /tmp/hello.wav

Expand Down
1 change: 1 addition & 0 deletions tests/test_command_options_seam.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@

SPEAK_DEFAULTS = speak_exec.SpeakOptions(
text=None,
url=None,
voice=[],
language=speak_exec.DEFAULT_LANGUAGE,
sample_rate=None,
Expand Down
25 changes: 25 additions & 0 deletions tests/test_speak.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,31 @@ def test_out_writes_wav_and_does_not_play(monkeypatch, tmp_path, fake_synthesize
assert "played" not in result.stderr


def test_url_reads_web_page_aloud(monkeypatch, fake_synthesize):
    """--url narrates the text of the fetched article, not the URL itself."""
    from aai_cli.core import webpage

    def silent_playback(*args, **kwargs):
        # Playback is stubbed out so the test makes no sound-device calls.
        return None

    def canned_article(url):
        # Stand-in for the real fetch: no network, fixed article text.
        return webpage.Article(text="The article body.", title="Headline", url=url)

    monkeypatch.setattr("aai_cli.commands.speak._exec.audio.play_pcm", silent_playback)
    monkeypatch.setattr(webpage, "fetch_article", canned_article)

    argv = ["--sandbox", "speak", "--url", "https://example.com/post"]
    result = runner.invoke(app, argv)

    assert result.exit_code == 0
    assert fake_synthesize["cfg"].text == "The article body."


def test_url_and_text_argument_are_mutually_exclusive(monkeypatch):
    """Passing both a text argument and --url exits with status 2."""
    argv = ["--sandbox", "speak", "Hello", "--url", "https://example.com/post"]
    result = runner.invoke(app, argv)

    assert result.exit_code == 2
    # Both conflicting inputs are named so the fix is unambiguous.
    shown = result.output
    assert "the text argument" in shown
    assert "--url" in shown


def test_installs_sigterm_handler_around_synthesis(monkeypatch):
captured: dict[str, object] = {}

Expand Down
121 changes: 121 additions & 0 deletions tests/test_webpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from __future__ import annotations

import dataclasses

import httpx2 as httpx
import pytest

from aai_cli.core import webpage
from aai_cli.core.errors import APIError, UsageError

# An article wrapped in the usual page chrome: nav, a comment thread, and a
# <title>. The extractor should keep the body and drop the rest.
ARTICLE_HTML = """<!DOCTYPE html><html><head><title>The Real Headline</title></head>
<body>
<nav>Home | About | SubscribeNavBoilerplate</nav>
<article>
<h1>The Real Headline</h1>
<p>This is the first real paragraph of the article body that we care about.</p>
<p>Here is a second substantive paragraph with more content to extract.</p>
</article>
<section class="comments"><p>UserCommentText that appears in the discussion thread below.</p></section>
<footer>FooterBoilerplate copyright 2026.</footer>
</body></html>"""


def _client_returning(monkeypatch, handler):
    """Swap ``webpage.httpx.Client`` for a factory whose clients answer every
    request through ``handler`` via a MockTransport (the test_eval_data_hf.py
    pattern), so no real socket is opened."""
    # Capture the genuine class before it is patched away.
    original_client_cls = httpx.Client

    def client_factory(*args, **kwargs):
        # Any transport the caller passed is overridden by the mock one.
        patched_kwargs = {**kwargs, "transport": httpx.MockTransport(handler)}
        return original_client_cls(*args, **patched_kwargs)

    monkeypatch.setattr(webpage.httpx, "Client", client_factory)


def test_article_is_immutable():
    """frozen=True: assigning to a field of an Article raises FrozenInstanceError."""
    article = webpage.Article(text="body", title="T", url="https://example.com/p")
    # The field name is looked up dynamically (not written as a literal) to keep
    # pyright from resolving the assignment to the read-only attribute -- see
    # test_command_options_seam.py.
    first_field, *_ = dataclasses.fields(article)
    with pytest.raises(dataclasses.FrozenInstanceError):
        setattr(article, first_field.name, "tampered")


def test_fetch_html_returns_body_and_sends_browser_user_agent(monkeypatch):
    """The response body comes back verbatim and the request carries our UA."""
    recorded: dict[str, str] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        # Remember the User-Agent the client actually sent.
        recorded["ua"] = request.headers["user-agent"]
        return httpx.Response(200, text="<html>ok</html>")

    _client_returning(monkeypatch, handler)

    html = webpage._fetch_html("https://example.com/post")

    assert html == "<html>ok</html>"
    # The browser-like UA is sent so sites don't serve a stub/block page.
    assert "assembly-cli" in recorded["ua"]


def test_fetch_html_follows_redirects(monkeypatch):
    """A 301 is chased to the final 200 response.

    Without follow_redirects the client would hand back the empty 301 body
    instead of the article.
    """

    def handler(request: httpx.Request) -> httpx.Response:
        if request.url.path != "/start":
            return httpx.Response(200, text="final body")
        return httpx.Response(301, headers={"Location": "https://example.com/final"})

    _client_returning(monkeypatch, handler)

    body = webpage._fetch_html("https://example.com/start")
    assert body == "final body"


def test_fetch_html_non_2xx_becomes_api_error(monkeypatch):
    """A 404 response surfaces as an APIError whose message names the URL."""

    def not_found(request):
        return httpx.Response(404, text="nope")

    _client_returning(monkeypatch, not_found)

    with pytest.raises(APIError) as caught:
        webpage._fetch_html("https://example.com/missing")
    # Checked after the ``with`` block so the assertion actually runs.
    assert "https://example.com/missing" in caught.value.message


def test_fetch_html_connect_error_becomes_api_error(monkeypatch):
    """A transport-level ConnectError is translated into an APIError."""

    def unreachable(request: httpx.Request) -> httpx.Response:
        raise httpx.ConnectError("boom")

    _client_returning(monkeypatch, unreachable)

    with pytest.raises(APIError):
        webpage._fetch_html("https://example.com/post")


def test_extract_strips_boilerplate_and_comments_and_reads_title():
    """The article body is kept, nav/footer chrome is dropped, <title> is read."""
    body, headline = webpage._extract(ARTICLE_HTML)

    assert body is not None
    # The article body survives...
    assert "first real paragraph of the article body" in body
    # ...while the nav and footer chrome are dropped.
    assert "NavBoilerplate" not in body
    assert "FooterBoilerplate" not in body
    # The <title> drives the extracted title.
    assert headline == "The Real Headline"


def test_fetch_article_rejects_non_http_url():
    """An ftp:// address is refused with a UsageError that points at http(s)."""
    with pytest.raises(UsageError) as caught:
        webpage.fetch_article("ftp://example.com/file")

    error = caught.value
    assert "Not a web page URL" in error.message
    # A missing suggestion is treated as an empty string for the check.
    hint = error.suggestion or ""
    assert "http" in hint


def test_fetch_article_returns_extracted_text_and_title(monkeypatch):
    """fetch_article packages the extracted text, title and URL into an Article."""

    def canned_html(url):
        # Replaces the network fetch with the fixture page.
        return ARTICLE_HTML

    monkeypatch.setattr(webpage, "_fetch_html", canned_html)

    page_url = "https://example.com/post"
    article = webpage.fetch_article(page_url)

    assert "first real paragraph of the article body" in article.text
    assert article.title == "The Real Headline"
    assert article.url == page_url


def test_fetch_article_without_readable_text_is_a_usage_error(monkeypatch):
    """A page with no extractable article text is reported as a UsageError."""

    def empty_page(url):
        # A document with an empty body: there is nothing to extract.
        return "<html><body></body></html>"

    monkeypatch.setattr(webpage, "_fetch_html", empty_page)

    with pytest.raises(UsageError) as caught:
        webpage.fetch_article("https://example.com/empty")
    assert "Couldn't find readable text" in caught.value.message
Loading
Loading