From 7ba1839e8210f0f4823c12ce2a77b1049ef846cf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 17:46:50 +0000
Subject: [PATCH] Use LangChain RecursiveCharacterTextSplitter for TTS chunking

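Replace the hand-rolled sentence scanner + greedy packer in tts/text.py
with LangChain's RecursiveCharacterTextSplitter, configured with
sentence-terminator separators so chunks stay sentence-aligned and within
the per-frame char budget. Behavior is preserved for the cases the tests
cover (sentence packing, mid-number periods, oversized-blob slicing,
lossless rejoin). One visible difference: a sentence boundary is now a
terminator followed by a literal space, where the old scanner accepted any
whitespace; a terminator followed by a newline is handled by the
lower-priority newline separators instead. The split_sentences helper and
its tests are dropped since chunk_text was its only caller. Adds the
langchain-text-splitters dependency.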

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01He2iJSWkEB5U3ZhxyAFnRc
---
 aai_cli/tts/text.py    | 76 +++++++++++++-----------------------------
 pyproject.toml         |  1 +
 tests/test_tts_text.py | 38 +++++++++------------
 uv.lock                | 14 ++++++++
 4 files changed, 54 insertions(+), 75 deletions(-)

diff --git a/aai_cli/tts/text.py b/aai_cli/tts/text.py
index c637027..af99781 100644
--- a/aai_cli/tts/text.py
+++ b/aai_cli/tts/text.py
@@ -1,16 +1,18 @@
-"""Pure text helpers for streaming TTS: sentence splitting and chunking.
+"""Text chunking for streaming TTS, built on LangChain's text splitters.
 
 PocketTTS (the streaming-TTS model behind ``assembly speak``) is fed incrementally —
 a whole document in a single ``Generate`` frame stalls the server. ``chunk_text`` breaks
 the input into sentence-aligned chunks small enough to synthesize one connection at a
-time (see ``tts.session.synthesize_chunked``). Kept Rich-free and dependency-light so it
-is trivially unit-testable.
+time (see ``tts.session.synthesize_chunked``). The splitting is delegated to LangChain's
+``RecursiveCharacterTextSplitter`` (https://docs.langchain.com/oss/python/integrations/splitters)
+rather than a hand-rolled sentence scanner: it recurses through the separator list below,
+preferring sentence boundaries and only falling back to words/characters when a single
+sentence overflows the budget.
 """
 
 from __future__ import annotations
 
-# A sentence ends at one of these terminators (mirrors the agent-cascade splitter).
-_TERMINATORS = ".!?"
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
 # Conservative upper bound on the characters in a single Generate frame. PocketTTS is a
 # streaming model with a bounded context; everywhere else in the codebase it is fed one
@@ -18,56 +20,26 @@
 # down on a long page while keeping each frame comfortably small.
 _MAX_CHUNK_CHARS = 500  # pragma: no mutate -- a +-1 char budget is immaterial
 
-
-def split_sentences(text: str) -> list[str]:
-    """Split ``text`` into sentences, each ending in ``.``/``!``/``?``.
-
-    A terminator ends a sentence only when it is the last character or is followed by
-    whitespace — so a ``.`` inside a number ("$3.50") or stacked terminators ("..."/"?!")
-    don't fragment one spoken sentence. A trailing fragment with no terminal punctuation
-    is kept, so no text is ever dropped; empty/whitespace-only pieces are discarded.
-    """
-    sentences: list[str] = []
-    start = 0
-    for index, char in enumerate(text):
-        if char in _TERMINATORS and (index + 1 == len(text) or text[index + 1].isspace()):
-            # The two `+ 1`s below are equivalent under mutation: a confirmed boundary means
-            # index+1 is whitespace or end-of-text, so widening the slice / advancing start by
-            # one extra char only ever spans whitespace that .strip() removes.
-            sentences.append(text[start : index + 1].strip())  # pragma: no mutate
-            start = index + 1  # pragma: no mutate
-    tail = text[start:].strip()
-    if tail:
-        sentences.append(tail)
-    return sentences
-
-
-def _bounded(sentence: str, max_chars: int) -> list[str]:
-    """Slice ``sentence`` into ``<= max_chars`` pieces. A sentence within the budget comes
-    back as a single piece (the one slice covers it); an over-long one (e.g. a PDF blob
-    with no sentence terminators) is split so no single Generate frame can stall the
-    server."""
-    return [sentence[i : i + max_chars] for i in range(0, len(sentence), max_chars)]
+# Separators in descending priority: a sentence terminator followed by a space (". ", then
+# "! ", then "? "), then paragraph/line breaks, then a word boundary, and finally "" — a bare
+# character split for an over-long blob with none of the above (e.g. a PDF with no
+# terminators). Requiring the space keeps a mid-number "." ("$3.50") from splitting, and
+# keep_separator="end" keeps each terminator on its chunk, which the model needs for prosody.
+_SEPARATORS = [". ", "! ", "? ", "\n\n", "\n", " ", ""]
 
 
 def chunk_text(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> list[str]:
     """Split ``text`` into sentence-aligned chunks, each ``<= max_chars``.
 
-    Sentences are packed greedily so short ones share a chunk (and thus one connection);
-    packing never breaks mid-sentence unless a single sentence exceeds the budget, in
-    which case that sentence alone is sliced. Whitespace-only input yields no chunks.
+    Short sentences are packed together so they share a chunk (and thus one connection).
+    Breaks prefer sentence boundaries; only a piece that still overflows the budget is cut
+    further, at line breaks, then words, then characters. Whitespace-only input yields no
+    chunks, and no text is dropped — rejoining the chunks recovers every word.
     """
-    chunks: list[str] = []
-    current = ""
-    for sentence in split_sentences(text):
-        for piece in _bounded(sentence, max_chars):
-            if not current:
-                current = piece
-            elif len(current) + 1 + len(piece) <= max_chars:
-                current = f"{current} {piece}"
-            else:
-                chunks.append(current)
-                current = piece
-    if current:
-        chunks.append(current)
-    return chunks
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=max_chars,
+        chunk_overlap=0,
+        separators=_SEPARATORS,
+        keep_separator="end",
+    )
+    return splitter.split_text(text)
diff --git a/pyproject.toml b/pyproject.toml
index e2da9d8..f330876 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,6 +83,7 @@ dependencies = [
     "langchain-tavily>=0.2.18",
     "langgraph-checkpoint-sqlite>=3.1.0",
     "pyperclip>=1.11.0",
+    "langchain-text-splitters>=1.0.0",
 ]
 
 [project.urls]
diff --git a/tests/test_tts_text.py b/tests/test_tts_text.py
index d6f3414..fc64c7f 100644
--- a/tests/test_tts_text.py
+++ b/tests/test_tts_text.py
@@ -3,22 +3,6 @@
 from aai_cli.tts import text
 
 
-def test_split_sentences_keeps_terminators_and_drops_blanks():
-    assert text.split_sentences("Hello there. How are you?") == ["Hello there.", "How are you?"]
-
-
-def test_split_sentences_does_not_break_on_mid_number_period():
-    # A "." inside "$3.50" is not a sentence boundary (no following whitespace).
-    assert text.split_sentences("It costs $3.50 today.") == ["It costs $3.50 today."]
-
-
-def test_split_sentences_keeps_unterminated_tail():
-    # PDF/article text often has no closing punctuation — the tail is kept, not dropped.
-    assert text.split_sentences("an extracted blob with no terminator") == [
-        "an extracted blob with no terminator"
-    ]
-
-
 def test_chunk_text_packs_short_sentences_into_one_chunk():
     # Several short sentences well under the budget ride in a single chunk (one
     # connection) rather than one connection per sentence.
@@ -26,22 +10,29 @@ def test_chunk_text_packs_short_sentences_into_one_chunk():
     assert out == ["One. Two. Three."]
 
 
+def test_chunk_text_does_not_break_on_mid_number_period():
+    # A "." inside "$3.50" is not a sentence boundary (no following whitespace), so the
+    # whole sentence stays in one chunk.
+    assert text.chunk_text("It costs $3.50 today.", max_chars=100) == ["It costs $3.50 today."]
+
+
 def test_chunk_text_packs_two_sentences_exactly_at_the_budget():
-    # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins the
-    # space-joiner (+1) and the inclusive `<=` boundary — a budget of 7 must still pack.
+    # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins that
+    # the budget is inclusive — pieces split at ". " are merged back when they exactly fit.
     assert text.chunk_text("ab. cd.", max_chars=7) == ["ab. cd."]
 
 
 def test_chunk_text_splits_two_sentences_one_over_the_budget():
     # The same two sentences need 7 chars joined; a budget of 6 can't hold both, so the
-    # second rolls to its own chunk (packing never breaks mid-sentence). Pins that the
-    # joiner counts the separating space — without it 3+3 would wrongly fit in 6.
+    # second rolls to its own chunk (the break lands on the sentence terminator, never
+    # mid-word).
     assert text.chunk_text("ab. cd.", max_chars=6) == ["ab.", "cd."]
 
 
 def test_chunk_text_slices_a_single_oversized_sentence():
-    # A lone "sentence" longer than the budget (no terminators — the PDF case) is sliced
-    # so no single Generate frame can blow past the server's input ceiling.
+    # A lone "sentence" longer than the budget (no terminators or spaces — the PDF blob
+    # case) falls all the way to the bare-character separator so no single Generate frame
+    # can blow past the server's input ceiling.
     out = text.chunk_text("abcdefghij", max_chars=4)
     assert out == ["abcd", "efgh", "ij"]
     assert all(len(piece) <= 4 for piece in out)
@@ -56,5 +47,6 @@ def test_chunk_text_every_chunk_within_budget_for_a_long_paragraph():
     out = text.chunk_text(para, max_chars=120)
     assert len(out) > 1  # a long paragraph really is chunked
     assert all(len(piece) <= 120 for piece in out)
-    # No text is lost: rejoining the chunks recovers every word in order.
+    # No text is lost (and none is duplicated by overlap): rejoining the chunks recovers
+    # every word in order.
     assert out and " ".join(out).split() == para.split()
diff --git a/uv.lock b/uv.lock
index cf2218a..6673121 100644
--- a/uv.lock
+++ b/uv.lock
@@ -32,6 +32,7 @@ dependencies = [
     { name = "langchain-mcp-adapters" },
     { name = "langchain-openai" },
     { name = "langchain-tavily" },
+    { name = "langchain-text-splitters" },
     { name = "langgraph" },
     { name = "langgraph-checkpoint-sqlite" },
     { name = "openai" },
@@ -96,6 +97,7 @@ requires-dist = [
     { name = "langchain-mcp-adapters", specifier = ">=0.3.0" },
     { name = "langchain-openai", specifier = ">=1.3.2" },
     { name = "langchain-tavily", specifier = ">=0.2.18" },
+    { name = "langchain-text-splitters", specifier = ">=1.0.0" },
     { name = "langgraph", specifier = ">=1.2.2" },
     { name = "langgraph-checkpoint-sqlite", specifier = ">=3.1.0" },
     { name = "openai", specifier = ">=2.41.0" },
@@ -1654,6 +1656,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/71/9c/0c043e4434b1823f0ac194f66036cbb0569275a99dcb890e0891ecd34fb2/langchain_tavily-0.2.18-py3-none-any.whl", hash = "sha256:dccf3ad1c50e2cb2a89bec11727555805c9df8abd42c1f3ad42ccad86e28aa44", size = 30814, upload-time = "2026-04-16T15:23:23.424Z" },
 ]
 
+[[package]]
+name = "langchain-text-splitters"
+version = "1.1.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/9f/6c545900fefb7b00ddfa3f16b80d61338a0ec68c31c5451eeeab99082760/langchain_text_splitters-1.1.2.tar.gz", hash = "sha256:782a723db0a4746ac91e251c7c1d57fd23636e4f38ed733074e28d7a86f41627", size = 293580, upload-time = "2026-04-16T14:20:39.162Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/26/1ef06f56198d631296d646a6223de35bcc6cf9795ceb2442816bc963b84c/langchain_text_splitters-1.1.2-py3-none-any.whl", hash = "sha256:a2de0d799ff31886429fd6e2e0032df275b60ec817c19059a7b46181cc1c2f10", size = 35903, upload-time = "2026-04-16T14:20:38.243Z" },
+]
+
 [[package]]
 name = "langgraph"
 version = "1.2.5"