AssemblyAI · alexkroman · Jun 18, 2026 · Jun 18, 2026
diff --git a/aai_cli/tts/text.py b/aai_cli/tts/text.py
@@ -1,73 +1,45 @@
-"""Pure text helpers for streaming TTS: sentence splitting and chunking.
+"""Text chunking for streaming TTS, built on LangChain's text splitters.
 
 PocketTTS (the streaming-TTS model behind ``assembly speak``) is fed incrementally —
 a whole document in a single ``Generate`` frame stalls the server. ``chunk_text`` breaks
 the input into sentence-aligned chunks small enough to synthesize one connection at a
-time (see ``tts.session.synthesize_chunked``). Kept Rich-free and dependency-light so it
-is trivially unit-testable.
+time (see ``tts.session.synthesize_chunked``). The splitting is delegated to LangChain's
+``RecursiveCharacterTextSplitter`` (https://docs.langchain.com/oss/python/integrations/splitters)
+rather than a hand-rolled sentence scanner: it recurses through the separator list below,
+preferring sentence boundaries and only falling back to words/characters when a single
+sentence overflows the budget.
 """
 
 from __future__ import annotations
 
-# A sentence ends at one of these terminators (mirrors the agent-cascade splitter).
-_TERMINATORS = ".!?"
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 # Conservative upper bound on the characters in a single Generate frame. PocketTTS is a
 # streaming model with a bounded context; everywhere else in the codebase it is fed one
 # sentence at a time. Sentences are packed up to this budget to keep the connection count
 # down on a long page while keeping each frame comfortably small.
 _MAX_CHUNK_CHARS = 500  # pragma: no mutate -- a +-1 char budget is immaterial
 
-
-def split_sentences(text: str) -> list[str]:
-    """Split ``text`` into sentences, each ending in ``.``/``!``/``?``.
-
-    A terminator ends a sentence only when it is the last character or is followed by
-    whitespace — so a ``.`` inside a number ("$3.50") or stacked terminators ("..."/"?!")
-    don't fragment one spoken sentence. A trailing fragment with no terminal punctuation
-    is kept, so no text is ever dropped; empty/whitespace-only pieces are discarded.
-    """
-    sentences: list[str] = []
-    start = 0
-    for index, char in enumerate(text):
-        if char in _TERMINATORS and (index + 1 == len(text) or text[index + 1].isspace()):
-            # The two `+ 1`s below are equivalent under mutation: a confirmed boundary means
-            # index+1 is whitespace or end-of-text, so widening the slice / advancing start by
-            # one extra char only ever spans whitespace that .strip() removes.
-            sentences.append(text[start : index + 1].strip())  # pragma: no mutate
-            start = index + 1  # pragma: no mutate
-    tail = text[start:].strip()
-    if tail:
-        sentences.append(tail)
-    return sentences
-
-
-def _bounded(sentence: str, max_chars: int) -> list[str]:
-    """Slice ``sentence`` into ``<= max_chars`` pieces. A sentence within the budget comes
-    back as a single piece (the one slice covers it); an over-long one (e.g. a PDF blob
-    with no sentence terminators) is split so no single Generate frame can stall the
-    server."""
-    return [sentence[i : i + max_chars] for i in range(0, len(sentence), max_chars)]
+# Separators in descending priority: a sentence terminator followed by a space is the
+# preferred break, then paragraph/line breaks, then a word boundary, and finally a bare
+# character split ("") for an over-long blob with none of the above (e.g. a PDF with no
+# terminators). ``keep_separator="end"`` keeps each terminator with the chunk it closes,
+# preserving the punctuation the model needs for prosody.
+_SEPARATORS = [". ", "! ", "? ", "\n\n", "\n", " ", ""]
 
 
 def chunk_text(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> list[str]:
     """Split ``text`` into sentence-aligned chunks, each ``<= max_chars``.
 
-    Sentences are packed greedily so short ones share a chunk (and thus one connection);
-    packing never breaks mid-sentence unless a single sentence exceeds the budget, in
-    which case that sentence alone is sliced. Whitespace-only input yields no chunks.
+    Short sentences are packed together so they share a chunk (and thus one connection);
+    a break never lands mid-sentence unless a single sentence exceeds the budget, in which
+    case that sentence alone is sliced. Whitespace-only input yields no chunks, and no
+    text is dropped — rejoining the chunks recovers every word.
     """
-    chunks: list[str] = []
-    current = ""
-    for sentence in split_sentences(text):
-        for piece in _bounded(sentence, max_chars):
-            if not current:
-                current = piece
-            elif len(current) + 1 + len(piece) <= max_chars:
-                current = f"{current} {piece}"
-            else:
-                chunks.append(current)
-                current = piece
-    if current:
-        chunks.append(current)
-    return chunks
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=max_chars,
+        chunk_overlap=0,
+        separators=_SEPARATORS,
+        keep_separator="end",
+    )
+    return splitter.split_text(text)
diff --git a/pyproject.toml b/pyproject.toml
@@ -83,6 +83,7 @@ dependencies = [
     "langchain-tavily>=0.2.18",
     "langgraph-checkpoint-sqlite>=3.1.0",
     "pyperclip>=1.11.0",
+    "langchain-text-splitters>=1.0.0",
 ]
 
 [project.urls]

diff --git a/tests/test_tts_text.py b/tests/test_tts_text.py
@@ -3,45 +3,36 @@
 from aai_cli.tts import text
 
 
-def test_split_sentences_keeps_terminators_and_drops_blanks():
-    assert text.split_sentences("Hello there. How are you?") == ["Hello there.", "How are you?"]
-
-
-def test_split_sentences_does_not_break_on_mid_number_period():
-    # A "." inside "$3.50" is not a sentence boundary (no following whitespace).
-    assert text.split_sentences("It costs $3.50 today.") == ["It costs $3.50 today."]
-
-
-def test_split_sentences_keeps_unterminated_tail():
-    # PDF/article text often has no closing punctuation — the tail is kept, not dropped.
-    assert text.split_sentences("an extracted blob with no terminator") == [
-        "an extracted blob with no terminator"
-    ]
-
-
 def test_chunk_text_packs_short_sentences_into_one_chunk():
     # Several short sentences well under the budget ride in a single chunk (one
     # connection) rather than one connection per sentence.
     out = text.chunk_text("One. Two. Three.", max_chars=100)
     assert out == ["One. Two. Three."]
 
 
+def test_chunk_text_does_not_break_on_mid_number_period():
+    # A "." inside "$3.50" is not a sentence boundary (no following whitespace), so the
+    # whole sentence stays in one chunk.
+    assert text.chunk_text("It costs $3.50 today.", max_chars=100) == ["It costs $3.50 today."]
+
+
 def test_chunk_text_packs_two_sentences_exactly_at_the_budget():
-    # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins the
-    # space-joiner (+1) and the inclusive `<=` boundary — a budget of 7 must still pack.
+    # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins the
+    # inclusive budget: two sentences that exactly fill it must not be split at ". ".
     assert text.chunk_text("ab. cd.", max_chars=7) == ["ab. cd."]
 
 
 def test_chunk_text_splits_two_sentences_one_over_the_budget():
     # The same two sentences need 7 chars joined; a budget of 6 can't hold both, so the
-    # second rolls to its own chunk (packing never breaks mid-sentence). Pins that the
-    # joiner counts the separating space — without it 3+3 would wrongly fit in 6.
+    # second rolls to its own chunk (the break lands on the sentence terminator, never
+    # mid-word).
     assert text.chunk_text("ab. cd.", max_chars=6) == ["ab.", "cd."]
 
 
 def test_chunk_text_slices_a_single_oversized_sentence():
-    # A lone "sentence" longer than the budget (no terminators — the PDF case) is sliced
-    # so no single Generate frame can blow past the server's input ceiling.
+    # A lone "sentence" longer than the budget (no terminators or spaces — the PDF blob
+    # case) falls all the way to the bare-character separator so no single Generate frame
+    # can blow past the server's input ceiling.
     out = text.chunk_text("abcdefghij", max_chars=4)
     assert out == ["abcd", "efgh", "ij"]
     assert all(len(piece) <= 4 for piece in out)
@@ -56,5 +47,6 @@ def test_chunk_text_every_chunk_within_budget_for_a_long_paragraph():
     out = text.chunk_text(para, max_chars=120)
     assert len(out) > 1  # a long paragraph really is chunked
     assert all(len(piece) <= 120 for piece in out)
-    # No text is lost: rejoining the chunks recovers every word in order.
+    # No text is lost (and none is duplicated by overlap): rejoining the chunks recovers
+    # every word in order.
     assert out and " ".join(out).split() == para.split()
diff --git a/uv.lock b/uv.lock