From ddcfeedce3acef00a2ebe21a8a8b6973557a0043 Mon Sep 17 00:00:00 2001
From: mrveiss
Date: Mon, 23 Mar 2026 21:25:43 +0200
Subject: [PATCH] fix(rag): handle abbreviations in sentence splitter to
 prevent false splits (#2170)

---
 .../neural_mesh/evidence_extractor.py      | 25 ++++++++++++---
 .../neural_mesh/evidence_extractor_test.py | 32 +++++++++++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/autobot-backend/services/neural_mesh/evidence_extractor.py b/autobot-backend/services/neural_mesh/evidence_extractor.py
index 3c4b737b6..055137fbd 100644
--- a/autobot-backend/services/neural_mesh/evidence_extractor.py
+++ b/autobot-backend/services/neural_mesh/evidence_extractor.py
@@ -49,6 +49,21 @@ class Evidence:
     relevance: float
 
 
+# Negative-lookbehind anchors that prevent splitting after common abbreviations.
+# Each alternative is a fixed-width lookbehind (Python requires this).
+# Referenced by EvidenceExtractor._split_sentences (#2170).
+_ABBREVS = (
+    r"(?<!\bDr\.)"
+    r"(?<!\bMr\.)"
+    r"(?<!\bMrs\.)"
+    r"(?<!\bMs\.)"
+    r"(?<!\bSt\.)"
+    r"(?<!\bvs\.)"
+    r"(?<!\be\.g\.)"
+    r"(?<!\bi\.e\.)"
+    r"(?<!\bU\.S\.)"
+)
+
 
 class EvidenceExtractor:
     """Extract the most relevant evidence sentences for a query."""
@@ -116,16 +131,18 @@ def _split_sentences(self, text: str) -> list[str]:
     def _split_sentences(self, text: str) -> list[str]:
         """Split text into sentences on terminal punctuation.
 
-        Splits on a period, question mark, or exclamation mark followed
-        by whitespace or end-of-string.
+        Splits on a period, question mark, or exclamation mark followed by
+        whitespace, but NOT after common abbreviations such as Dr., Mr., Ms.,
+        Mrs., St., vs., e.g., i.e., or U.S. (#2170).
 
         Args:
             text: Raw paragraph or chunk content.
 
         Returns:
-            List of sentence strings (may include empty strings).
+            List of sentence strings with leading/trailing whitespace stripped.
""" - return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()] + pattern = _ABBREVS + r"(?<=[.!?])\s+" + return [s.strip() for s in re.split(pattern, text) if s.strip()] async def _score_sentences( self, query: str, sentences: list[tuple[str, str]] diff --git a/autobot-backend/services/neural_mesh/evidence_extractor_test.py b/autobot-backend/services/neural_mesh/evidence_extractor_test.py index e008435ce..2b97bd432 100644 --- a/autobot-backend/services/neural_mesh/evidence_extractor_test.py +++ b/autobot-backend/services/neural_mesh/evidence_extractor_test.py @@ -210,6 +210,38 @@ def test_split_sentences_empty_string_returns_empty(self): assert parts == [] + def test_split_does_not_break_on_dr(self): + """'Dr. Smith is here.' must not be split into two fragments (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("Dr. Smith is here.") + + assert parts == ["Dr. Smith is here."] + + def test_split_does_not_break_on_eg(self): + """'e.g. this example is valid.' must remain as one sentence (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("e.g. this example is valid.") + + assert parts == ["e.g. this example is valid."] + + def test_split_does_not_break_on_us(self): + """'U.S. is a country.' must not be split after the abbreviation (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("U.S. is a country.") + + assert parts == ["U.S. is a country."] + + def test_split_still_breaks_on_real_sentence_end(self): + """Two genuine sentences still split correctly even with fix applied (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("First sentence. 
Second sentence.") + + assert parts == ["First sentence.", "Second sentence."] + # ============================================================================= # Relevance ordering