Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions autobot-backend/services/neural_mesh/evidence_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,21 @@ class Evidence:
relevance: float


# Negative-lookbehind anchors that prevent splitting after common abbreviations.
# Each alternative is a fixed-width lookbehind (Python requires this).
# Referenced by EvidenceExtractor._split_sentences (#2170).
_ABBREVS = (
r"(?<!Dr\.)"
r"(?<!Mr\.)"
r"(?<!Ms\.)"
r"(?<!Mrs\.)"
r"(?<!St\.)"
r"(?<!vs\.)"
r"(?<!e\.g\.)"
r"(?<!i\.e\.)"
r"(?<!U\.S\.)"
)

# =============================================================================
# Extractor
# =============================================================================
Expand Down Expand Up @@ -127,16 +142,18 @@ def _collect_sentences(self, chunks: list[dict]) -> list[tuple[str, str]]:
def _split_sentences(self, text: str) -> list[str]:
    """Split text into sentences on terminal punctuation.

    Splits on a period, question mark, or exclamation mark followed by
    whitespace, but NOT after common abbreviations such as Dr., Mr., Ms.,
    Mrs., St., vs., e.g., i.e., or U.S. (#2170).

    Args:
        text: Raw paragraph or chunk content.

    Returns:
        List of non-empty sentence strings with leading/trailing
        whitespace stripped.
    """
    # Abbreviation guards come first, then the terminal-punctuation
    # lookbehind; only the whitespace run is consumed by the split, so each
    # sentence keeps its closing punctuation.
    pattern = _ABBREVS + r"(?<=[.!?])\s+"
    return [s.strip() for s in re.split(pattern, text) if s.strip()]

async def _score_sentences(
self, query: str, sentences: list[tuple[str, str]]
Expand Down
32 changes: 32 additions & 0 deletions autobot-backend/services/neural_mesh/evidence_extractor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,38 @@ def test_split_sentences_empty_string_returns_empty(self):

assert parts == []

def test_split_does_not_break_on_dr(self):
    """The title 'Dr.' must not be treated as a sentence boundary (#2170)."""
    text = "Dr. Smith is here."
    splitter = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15)

    sentences = splitter._split_sentences(text)

    # The whole input must come back as one untouched sentence.
    assert sentences == [text]

def test_split_does_not_break_on_eg(self):
    """The abbreviation 'e.g.' must not be treated as a sentence boundary (#2170)."""
    text = "e.g. this example is valid."
    splitter = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15)

    sentences = splitter._split_sentences(text)

    # The whole input must come back as one untouched sentence.
    assert sentences == [text]

def test_split_does_not_break_on_us(self):
    """The abbreviation 'U.S.' must not be treated as a sentence boundary (#2170)."""
    text = "U.S. is a country."
    splitter = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15)

    sentences = splitter._split_sentences(text)

    # The whole input must come back as one untouched sentence.
    assert sentences == [text]

def test_split_still_breaks_on_real_sentence_end(self):
    """Ordinary sentence boundaries are still split with the fix applied (#2170)."""
    expected = ["First sentence.", "Second sentence."]
    splitter = EvidenceExtractor(reranker=AsyncMock(), max_evidence=15)

    sentences = splitter._split_sentences("First sentence. Second sentence.")

    assert sentences == expected


# =============================================================================
# Relevance ordering
Expand Down
Loading