Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 24 additions & 52 deletions aai_cli/tts/text.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,45 @@
"""Pure text helpers for streaming TTS: sentence splitting and chunking.
"""Text chunking for streaming TTS, built on LangChain's text splitters.

PocketTTS (the streaming-TTS model behind ``assembly speak``) is fed incrementally —
a whole document in a single ``Generate`` frame stalls the server. ``chunk_text`` breaks
the input into sentence-aligned chunks small enough to synthesize one connection at a
time (see ``tts.session.synthesize_chunked``). Kept Rich-free and dependency-light so it
is trivially unit-testable.
time (see ``tts.session.synthesize_chunked``). The splitting is delegated to LangChain's
``RecursiveCharacterTextSplitter`` (https://docs.langchain.com/oss/python/integrations/splitters)
rather than a hand-rolled sentence scanner: it recurses through the separator list below,
preferring sentence boundaries and only falling back to words/characters when a single
sentence overflows the budget.
"""

from __future__ import annotations

# A sentence ends at one of these terminators (mirrors the agent-cascade splitter).
# ``split_sentences`` only treats one as a boundary when it is the last character of the
# text or is followed by whitespace.
_TERMINATORS = ".!?"
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Conservative upper bound on the characters in a single Generate frame. PocketTTS is a
# streaming model with a bounded context; everywhere else in the codebase it is fed one
# sentence at a time. Sentences are packed up to this budget to keep the connection count
# down on a long page while keeping each frame comfortably small. This is the default
# ``max_chars`` of ``chunk_text``.
_MAX_CHUNK_CHARS = 500 # pragma: no mutate -- a +-1 char budget is immaterial


def split_sentences(text: str) -> list[str]:
    """Break ``text`` into sentences that each end in ``.``/``!``/``?``.

    A terminator only closes a sentence when it is the final character of ``text`` or the
    next character is whitespace. That keeps a ``.`` inside a number ("$3.50") and runs of
    terminators ("..."/"?!") from fragmenting one spoken sentence. Text after the last
    boundary is kept even without closing punctuation, so nothing is dropped; pieces that
    are empty after stripping are discarded.
    """
    final_index = len(text) - 1
    # Offset just past every confirmed boundary. The `+ 1` is equivalent under mutation:
    # a confirmed boundary is followed by whitespace or end-of-text, so a wider slice only
    # ever picks up whitespace that .strip() removes.
    cuts = [
        position + 1  # pragma: no mutate
        for position, symbol in enumerate(text)
        if symbol in _TERMINATORS
        and (position == final_index or text[position + 1].isspace())
    ]
    # Closing the edge list with len(text) turns the unterminated tail into a final slice.
    edges = [0, *cuts, len(text)]
    stripped = (text[lo:hi].strip() for lo, hi in zip(edges, edges[1:]))
    return [piece for piece in stripped if piece]


def _bounded(sentence: str, max_chars: int) -> list[str]:
    """Slice ``sentence`` into consecutive pieces of at most ``max_chars`` characters.

    A sentence within the budget comes back as a single piece (the one slice covers it);
    an over-long one (e.g. a PDF blob with no sentence terminators) is split so no single
    Generate frame can stall the server. An empty ``sentence`` yields no pieces.

    Raises ``ValueError`` when ``max_chars`` is below 1. ``range`` already rejects a zero
    step, but a negative step produces an empty range, which would silently drop the whole
    sentence instead of reporting the bad budget.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars}")
    return [
        sentence[offset : offset + max_chars]
        for offset in range(0, len(sentence), max_chars)
    ]
# Separators in descending priority. The splitter tries them in list order, so ". " is
# tried before "! " and "? ": a sentence terminator followed by a space is the preferred
# break, then paragraph/line breaks, then a word boundary, and finally a bare character
# split for an over-long blob with none of the above (e.g. a PDF with no terminators).
# Keeping the terminator with the preceding chunk (``keep_separator="end"``) preserves
# the punctuation the model needs for prosody.
_SEPARATORS = [". ", "! ", "? ", "\n\n", "\n", " ", ""]


def chunk_text(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> list[str]:
"""Split ``text`` into sentence-aligned chunks, each ``<= max_chars``.

Short sentences are packed together so they share a chunk (and thus one connection);
a break never lands mid-sentence unless a single sentence exceeds the budget, in which
case that sentence alone is sliced. Whitespace-only input yields no chunks, and no
text is dropped — rejoining the chunks recovers every word.

``max_chars`` defaults to ``_MAX_CHUNK_CHARS``. Chunks are returned in input order.
"""
# NOTE(review): this span comes from a rendered diff, so two bodies appear back to back.
# The loop down to `return chunks` looks like the superseded hand-rolled packer — confirm
# against the merged file before relying on it.
chunks: list[str] = []
current = ""  # chunk being accumulated; empty means nothing is pending yet
for sentence in split_sentences(text):
# _bounded slices an over-budget sentence so every piece fits the budget on its own.
for piece in _bounded(sentence, max_chars):
if not current:
current = piece
# The `+ 1` counts the single space that joins the pending chunk to the next piece.
elif len(current) + 1 + len(piece) <= max_chars:
current = f"{current} {piece}"
else:
# The piece does not fit: flush the pending chunk and start a new one with it.
chunks.append(current)
current = piece
# Flush whatever is still pending after the last piece.
if current:
chunks.append(current)
return chunks
# NOTE(review): presumably the replacement body. As rendered here it follows
# `return chunks`, so verify which of the two bodies the merged file actually keeps.
# chunk_overlap=0 means no text is repeated across chunks; keep_separator="end" leaves
# each separator attached to the chunk that precedes it.
splitter = RecursiveCharacterTextSplitter(
chunk_size=max_chars,
chunk_overlap=0,
separators=_SEPARATORS,
keep_separator="end",
)
return splitter.split_text(text)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ dependencies = [
"langchain-tavily>=0.2.18",
"langgraph-checkpoint-sqlite>=3.1.0",
"pyperclip>=1.11.0",
"langchain-text-splitters>=1.0.0",
]

[project.urls]
Expand Down
38 changes: 15 additions & 23 deletions tests/test_tts_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,36 @@
from aai_cli.tts import text


def test_split_sentences_keeps_terminators_and_drops_blanks():
    """Two terminated sentences come back as two items with their punctuation intact."""
    expected = ["Hello there.", "How are you?"]
    assert text.split_sentences("Hello there. How are you?") == expected


def test_split_sentences_does_not_break_on_mid_number_period():
    """The "." inside "$3.50" has no whitespace after it, so it is not a boundary."""
    sentence = "It costs $3.50 today."
    assert text.split_sentences(sentence) == [sentence]


def test_split_sentences_keeps_unterminated_tail():
    """PDF/article text often lacks closing punctuation; the tail is kept, not dropped."""
    blob = "an extracted blob with no terminator"
    assert text.split_sentences(blob) == [blob]


def test_chunk_text_packs_short_sentences_into_one_chunk():
    """Short sentences well under the budget share one chunk (one connection) instead of
    taking one connection per sentence."""
    source = "One. Two. Three."
    assert text.chunk_text(source, max_chars=100) == [source]


def test_chunk_text_does_not_break_on_mid_number_period():
    """The "." inside "$3.50" is not followed by whitespace, so it is not a sentence
    boundary and the whole sentence stays in one chunk."""
    sentence = "It costs $3.50 today."
    assert text.chunk_text(sentence, max_chars=100) == [sentence]


def test_chunk_text_packs_two_sentences_exactly_at_the_budget():
    """"ab." + " " + "cd." is 7 characters, so a budget of exactly 7 must still pack both
    sentences into one chunk rather than splitting at the sentence boundary."""
    source = "ab. cd."
    chunks = text.chunk_text(source, max_chars=7)
    assert chunks == [source]


def test_chunk_text_splits_two_sentences_one_over_the_budget():
    """The same two sentences need 7 characters when joined; a budget of 6 cannot hold
    both, so the second rolls to its own chunk and the break lands on the sentence
    terminator, never mid-word."""
    chunks = text.chunk_text("ab. cd.", max_chars=6)
    expected = ["ab.", "cd."]
    assert chunks == expected


def test_chunk_text_slices_a_single_oversized_sentence():
    """A lone "sentence" longer than the budget, with no terminators or spaces (the PDF
    blob case), is sliced into budget-sized pieces so no single Generate frame can blow
    past the server's input ceiling."""
    budget = 4
    pieces = text.chunk_text("abcdefghij", max_chars=budget)
    assert pieces == ["abcd", "efgh", "ij"]
    # No piece may exceed the budget.
    assert not [piece for piece in pieces if len(piece) > budget]
Expand All @@ -56,5 +47,6 @@ def test_chunk_text_every_chunk_within_budget_for_a_long_paragraph():
out = text.chunk_text(para, max_chars=120)
assert len(out) > 1 # a long paragraph really is chunked
assert all(len(piece) <= 120 for piece in out)
# No text is lost: rejoining the chunks recovers every word in order.
# No text is lost (and none is duplicated by overlap): rejoining the chunks recovers
# every word in order.
assert out and " ".join(out).split() == para.split()
14 changes: 14 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading