From 7ba1839e8210f0f4823c12ce2a77b1049ef846cf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 17:46:50 +0000 Subject: [PATCH] Use LangChain RecursiveCharacterTextSplitter for TTS chunking Replace the hand-rolled sentence scanner + greedy packer in tts/text.py with LangChain's RecursiveCharacterTextSplitter, configured with sentence-terminator separators so chunks stay sentence-aligned and within the per-frame char budget. Behavior is preserved across the existing cases (sentence packing, mid-number periods, oversized-blob slicing, lossless rejoin); the split_sentences helper is dropped since chunk_text was its only caller. Adds the langchain-text-splitters dependency. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01He2iJSWkEB5U3ZhxyAFnRc --- aai_cli/tts/text.py | 76 +++++++++++++----------------------------- pyproject.toml | 1 + tests/test_tts_text.py | 38 +++++++++------------ uv.lock | 14 ++++++++ 4 files changed, 54 insertions(+), 75 deletions(-) diff --git a/aai_cli/tts/text.py b/aai_cli/tts/text.py index c637027..af99781 100644 --- a/aai_cli/tts/text.py +++ b/aai_cli/tts/text.py @@ -1,16 +1,18 @@ -"""Pure text helpers for streaming TTS: sentence splitting and chunking. +"""Text chunking for streaming TTS, built on LangChain's text splitters. PocketTTS (the streaming-TTS model behind ``assembly speak``) is fed incrementally — a whole document in a single ``Generate`` frame stalls the server. ``chunk_text`` breaks the input into sentence-aligned chunks small enough to synthesize one connection at a -time (see ``tts.session.synthesize_chunked``). Kept Rich-free and dependency-light so it -is trivially unit-testable. +time (see ``tts.session.synthesize_chunked``). The splitting is delegated to LangChain's +``RecursiveCharacterTextSplitter`` (https://docs.langchain.com/oss/python/integrations/splitters) +rather than a hand-rolled sentence scanner: it recurses through the separator list below, +preferring sentence boundaries and only falling back to words/characters when a single +sentence overflows the budget. """ from __future__ import annotations -# A sentence ends at one of these terminators (mirrors the agent-cascade splitter). -_TERMINATORS = ".!?" +from langchain_text_splitters import RecursiveCharacterTextSplitter # Conservative upper bound on the characters in a single Generate frame. PocketTTS is a # streaming model with a bounded context; everywhere else in the codebase it is fed one @@ -18,56 +20,26 @@ # down on a long page while keeping each frame comfortably small. _MAX_CHUNK_CHARS = 500 # pragma: no mutate -- a +-1 char budget is immaterial - -def split_sentences(text: str) -> list[str]: - """Split ``text`` into sentences, each ending in ``.``/``!``/``?``. - - A terminator ends a sentence only when it is the last character or is followed by - whitespace — so a ``.`` inside a number ("$3.50") or stacked terminators ("..."/"?!") - don't fragment one spoken sentence. A trailing fragment with no terminal punctuation - is kept, so no text is ever dropped; empty/whitespace-only pieces are discarded. - """ - sentences: list[str] = [] - start = 0 - for index, char in enumerate(text): - if char in _TERMINATORS and (index + 1 == len(text) or text[index + 1].isspace()): - # The two `+ 1`s below are equivalent under mutation: a confirmed boundary means - # index+1 is whitespace or end-of-text, so widening the slice / advancing start by - # one extra char only ever spans whitespace that .strip() removes. - sentences.append(text[start : index + 1].strip()) # pragma: no mutate - start = index + 1 # pragma: no mutate - tail = text[start:].strip() - if tail: - sentences.append(tail) - return sentences - - -def _bounded(sentence: str, max_chars: int) -> list[str]: - """Slice ``sentence`` into ``<= max_chars`` pieces. A sentence within the budget comes - back as a single piece (the one slice covers it); an over-long one (e.g. a PDF blob - with no sentence terminators) is split so no single Generate frame can stall the - server.""" - return [sentence[i : i + max_chars] for i in range(0, len(sentence), max_chars)] +# Separators in descending priority: a sentence terminator followed by a space is the +# preferred break, then paragraph/line breaks, then a word boundary, and finally a bare +# character split for an over-long blob with none of the above (e.g. a PDF with no +# terminators). Keeping the terminator with the preceding chunk ("end") preserves the +# punctuation the model needs for prosody. +_SEPARATORS = [". ", "! ", "? ", "\n\n", "\n", " ", ""] def chunk_text(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> list[str]: """Split ``text`` into sentence-aligned chunks, each ``<= max_chars``. - Sentences are packed greedily so short ones share a chunk (and thus one connection); - packing never breaks mid-sentence unless a single sentence exceeds the budget, in - which case that sentence alone is sliced. Whitespace-only input yields no chunks. + Short sentences are packed together so they share a chunk (and thus one connection); + a break never lands mid-sentence unless a single sentence exceeds the budget, in which + case that sentence alone is sliced. Whitespace-only input yields no chunks, and no + text is dropped — rejoining the chunks recovers every word. """ - chunks: list[str] = [] - current = "" - for sentence in split_sentences(text): - for piece in _bounded(sentence, max_chars): - if not current: - current = piece - elif len(current) + 1 + len(piece) <= max_chars: - current = f"{current} {piece}" - else: - chunks.append(current) - current = piece - if current: - chunks.append(current) - return chunks + splitter = RecursiveCharacterTextSplitter( + chunk_size=max_chars, + chunk_overlap=0, + separators=_SEPARATORS, + keep_separator="end", + ) + return splitter.split_text(text) diff --git a/pyproject.toml b/pyproject.toml index e2da9d8..f330876 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ dependencies = [ "langchain-tavily>=0.2.18", "langgraph-checkpoint-sqlite>=3.1.0", "pyperclip>=1.11.0", + "langchain-text-splitters>=1.0.0", ] [project.urls] diff --git a/tests/test_tts_text.py b/tests/test_tts_text.py index d6f3414..fc64c7f 100644 --- a/tests/test_tts_text.py +++ b/tests/test_tts_text.py @@ -3,22 +3,6 @@ from aai_cli.tts import text -def test_split_sentences_keeps_terminators_and_drops_blanks(): - assert text.split_sentences("Hello there. How are you?") == ["Hello there.", "How are you?"] - - -def test_split_sentences_does_not_break_on_mid_number_period(): - # A "." inside "$3.50" is not a sentence boundary (no following whitespace). - assert text.split_sentences("It costs $3.50 today.") == ["It costs $3.50 today."] - - -def test_split_sentences_keeps_unterminated_tail(): - # PDF/article text often has no closing punctuation — the tail is kept, not dropped. - assert text.split_sentences("an extracted blob with no terminator") == [ - "an extracted blob with no terminator" - ] - - def test_chunk_text_packs_short_sentences_into_one_chunk(): # Several short sentences well under the budget ride in a single chunk (one # connection) rather than one connection per sentence. @@ -26,22 +10,29 @@ def test_chunk_text_packs_short_sentences_into_one_chunk(): assert out == ["One. Two. Three."] +def test_chunk_text_does_not_break_on_mid_number_period(): + # A "." inside "$3.50" is not a sentence boundary (no following whitespace), so the + # whole sentence stays in one chunk. + assert text.chunk_text("It costs $3.50 today.", max_chars=100) == ["It costs $3.50 today."] + + def test_chunk_text_packs_two_sentences_exactly_at_the_budget(): - # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins the - # space-joiner (+1) and the inclusive `<=` boundary — a budget of 7 must still pack. + # "ab." + " " + "cd." == 7 chars: at a budget of 7 they pack into one chunk. Pins that + # the sentence-terminator separator packs rather than splitting at the boundary. assert text.chunk_text("ab. cd.", max_chars=7) == ["ab. cd."] def test_chunk_text_splits_two_sentences_one_over_the_budget(): # The same two sentences need 7 chars joined; a budget of 6 can't hold both, so the - # second rolls to its own chunk (packing never breaks mid-sentence). Pins that the - # joiner counts the separating space — without it 3+3 would wrongly fit in 6. + # second rolls to its own chunk (the break lands on the sentence terminator, never + # mid-word). assert text.chunk_text("ab. cd.", max_chars=6) == ["ab.", "cd."] def test_chunk_text_slices_a_single_oversized_sentence(): - # A lone "sentence" longer than the budget (no terminators — the PDF case) is sliced - # so no single Generate frame can blow past the server's input ceiling. + # A lone "sentence" longer than the budget (no terminators or spaces — the PDF blob + # case) falls all the way to the bare-character separator so no single Generate frame + # can blow past the server's input ceiling. out = text.chunk_text("abcdefghij", max_chars=4) assert out == ["abcd", "efgh", "ij"] assert all(len(piece) <= 4 for piece in out) @@ -56,5 +47,6 @@ def test_chunk_text_every_chunk_within_budget_for_a_long_paragraph(): out = text.chunk_text(para, max_chars=120) assert len(out) > 1 # a long paragraph really is chunked assert all(len(piece) <= 120 for piece in out) - # No text is lost: rejoining the chunks recovers every word in order. + # No text is lost (and none is duplicated by overlap): rejoining the chunks recovers + # every word in order. assert out and " ".join(out).split() == para.split() diff --git a/uv.lock b/uv.lock index cf2218a..6673121 100644 --- a/uv.lock +++ b/uv.lock @@ -32,6 +32,7 @@ dependencies = [ { name = "langchain-mcp-adapters" }, { name = "langchain-openai" }, { name = "langchain-tavily" }, + { name = "langchain-text-splitters" }, { name = "langgraph" }, { name = "langgraph-checkpoint-sqlite" }, { name = "openai" }, @@ -96,6 +97,7 @@ requires-dist = [ { name = "langchain-mcp-adapters", specifier = ">=0.3.0" }, { name = "langchain-openai", specifier = ">=1.3.2" }, { name = "langchain-tavily", specifier = ">=0.2.18" }, + { name = "langchain-text-splitters", specifier = ">=1.0.0" }, { name = "langgraph", specifier = ">=1.2.2" }, { name = "langgraph-checkpoint-sqlite", specifier = ">=3.1.0" }, { name = "openai", specifier = ">=2.41.0" }, @@ -1654,6 +1656,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/9c/0c043e4434b1823f0ac194f66036cbb0569275a99dcb890e0891ecd34fb2/langchain_tavily-0.2.18-py3-none-any.whl", hash = "sha256:dccf3ad1c50e2cb2a89bec11727555805c9df8abd42c1f3ad42ccad86e28aa44", size = 30814, upload-time = "2026-04-16T15:23:23.424Z" }, ] +[[package]] +name = "langchain-text-splitters" +version = "1.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "langchain-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/9f/6c545900fefb7b00ddfa3f16b80d61338a0ec68c31c5451eeeab99082760/langchain_text_splitters-1.1.2.tar.gz", hash = "sha256:782a723db0a4746ac91e251c7c1d57fd23636e4f38ed733074e28d7a86f41627", size = 293580, upload-time = "2026-04-16T14:20:39.162Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/26/1ef06f56198d631296d646a6223de35bcc6cf9795ceb2442816bc963b84c/langchain_text_splitters-1.1.2-py3-none-any.whl", hash = "sha256:a2de0d799ff31886429fd6e2e0032df275b60ec817c19059a7b46181cc1c2f10", size = 35903, upload-time = "2026-04-16T14:20:38.243Z" }, +] + [[package]] name = "langgraph" version = "1.2.5"