From ddcfeedce3acef00a2ebe21a8a8b6973557a0043 Mon Sep 17 00:00:00 2001
From: mrveiss
Date: Mon, 23 Mar 2026 21:25:43 +0200
Subject: [PATCH] fix(rag): handle abbreviations in sentence splitter to
 prevent false splits (#2170)

---
 .../neural_mesh/evidence_extractor.py      | 25 ++++++++++++---
 .../neural_mesh/evidence_extractor_test.py | 32 +++++++++++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/autobot-backend/services/neural_mesh/evidence_extractor.py b/autobot-backend/services/neural_mesh/evidence_extractor.py
index 3c4b737b6..055137fbd 100644
--- a/autobot-backend/services/neural_mesh/evidence_extractor.py
+++ b/autobot-backend/services/neural_mesh/evidence_extractor.py
@@ -49,6 +49,21 @@ class Evidence:
     relevance: float
 
 
+# Negative-lookbehind anchors that prevent splitting after common abbreviations.
+# Each alternative is a fixed-width lookbehind (Python requires this).
+# Referenced by EvidenceExtractor._split_sentences (#2170).
+_ABBREVS = (
+    r"(?<!\bDr\.)"
+    r"(?<!\bMr\.)"
+    r"(?<!\bMrs\.)"
+    r"(?<!\bMs\.)"
+    r"(?<!\bSt\.)"
+    r"(?<!\bvs\.)"
+    r"(?<!\be\.g\.)"
+    r"(?<!\bi\.e\.)"
+    r"(?<!\bU\.S\.)"
+)
+
 
 class EvidenceExtractor:
     """Extract the most relevant evidence sentences for a query."""
@@ -116,16 +131,18 @@ def _split_sentences(self, text: str) -> list[str]:
     def _split_sentences(self, text: str) -> list[str]:
         """Split text into sentences on terminal punctuation.
 
-        Splits on a period, question mark, or exclamation mark followed
-        by whitespace or end-of-string.
+        Splits on a period, question mark, or exclamation mark followed by
+        whitespace, but NOT after common abbreviations such as Dr., Mr., Ms.,
+        Mrs., St., vs., e.g., i.e., or U.S. (#2170).
 
         Args:
             text: Raw paragraph or chunk content.
 
         Returns:
-            List of sentence strings (may include empty strings).
+            List of sentence strings with leading/trailing whitespace stripped.
""" - return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()] + pattern = _ABBREVS + r"(?<=[.!?])\s+" + return [s.strip() for s in re.split(pattern, text) if s.strip()] async def _score_sentences( self, query: str, sentences: list[tuple[str, str]] diff --git a/autobot-backend/services/neural_mesh/evidence_extractor_test.py b/autobot-backend/services/neural_mesh/evidence_extractor_test.py index e008435ce..2b97bd432 100644 --- a/autobot-backend/services/neural_mesh/evidence_extractor_test.py +++ b/autobot-backend/services/neural_mesh/evidence_extractor_test.py @@ -210,6 +210,38 @@ def test_split_sentences_empty_string_returns_empty(self): assert parts == [] + def test_split_does_not_break_on_dr(self): + """'Dr. Smith is here.' must not be split into two fragments (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("Dr. Smith is here.") + + assert parts == ["Dr. Smith is here."] + + def test_split_does_not_break_on_eg(self): + """'e.g. this example is valid.' must remain as one sentence (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("e.g. this example is valid.") + + assert parts == ["e.g. this example is valid."] + + def test_split_does_not_break_on_us(self): + """'U.S. is a country.' must not be split after the abbreviation (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("U.S. is a country.") + + assert parts == ["U.S. is a country."] + + def test_split_still_breaks_on_real_sentence_end(self): + """Two genuine sentences still split correctly even with fix applied (#2170).""" + extractor = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15) + + parts = extractor._split_sentences("First sentence. 
Second sentence.") + + assert parts == ["First sentence.", "Second sentence."] + # ============================================================================= # Relevance ordering