From f97de938bea7d92cd6301d8387eecb4fbbf2a0b2 Mon Sep 17 00:00:00 2001 From: RohanExploit <178623867+RohanExploit@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:52:39 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20optimize=20RAG=20retrieval?= =?UTF-8?q?=20performance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pre-calculate token counts for policies during initialization. - Use `isdisjoint()` for fast early-exit on non-matching policies. - Use inclusion-exclusion principle to calculate union size mathematically, avoiding expensive `set.union()` allocations. --- .jules/bolt.md | 4 ++++ backend/rag_service.py | 26 ++++++++++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 02a6e1a2..2b5873d2 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -85,3 +85,7 @@ ## 2026-05-16 - Pre-processing for RAG Retrieval **Learning:** In RAG (Retrieval-Augmented Generation) systems with static or semi-static policy datasets, performing tokenization, regex substitution, and string formatting inside the retrieval loop is a significant bottleneck that scales with the number of policies. **Action:** Move all deterministic operations (tokenization, formatting, regex matching prep) to a one-time initialization step to ensure the retrieval hot-path only performs necessary set intersections and similarity calculations. + +## 2026-05-18 - Mathematical Set Operations for Jaccard Similarity +**Learning:** Calculating Jaccard similarity (|A ∩ B| / |A ∪ B|) using `set.union()` inside a retrieval loop incurs significant O(N) memory allocation and population overhead. Since |A ∪ B| = |A| + |B| - |A ∩ B|, the union size can be calculated via O(1) arithmetic if set sizes are pre-calculated. +**Action:** Pre-calculate set lengths for static data. In retrieval loops, use `isdisjoint()` for early exits and the inclusion-exclusion formula to avoid explicit set union operations. diff --git a/backend/rag_service.py b/backend/rag_service.py index 0793943d..6dc18398 100644 --- a/backend/rag_service.py +++ b/backend/rag_service.py @@ -46,10 +46,12 @@ def _prepare_policies(self): source = policy.get('source', 'Unknown') content = f"{title} {text}" + content_tokens = self._tokenize(content) self._prepared_policies.append({ 'title_tokens': self._tokenize(title), - 'content_tokens': self._tokenize(content), + 'content_tokens': content_tokens, + 'content_token_count': len(content_tokens), 'formatted': f"**{title}**: {text} (Source: {source})", 'original': policy }) @@ -65,12 +67,14 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]: """ Retrieve the most relevant policy based on Jaccard similarity of tokens. Returns the formatted policy string or None if below threshold. + Optimized: Uses pre-calculated token lengths and mathematical union to avoid O(N) union. """ if not query or not self._prepared_policies: return None query_tokens = self._tokenize(query) - if not query_tokens: + query_token_count = len(query_tokens) + if query_token_count == 0: return None best_score = 0.0 @@ -79,19 +83,21 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]: for prepared in self._prepared_policies: policy_tokens = prepared['content_tokens'] - if not policy_tokens: + # Performance: Use isdisjoint for fast early-exit when there is no overlap + if query_tokens.isdisjoint(policy_tokens): continue - # Jaccard Similarity - intersection = query_tokens.intersection(policy_tokens) - # Use pre-calculated set for union if possible? - # Union depends on query_tokens, so must be calculated. - union = query_tokens.union(policy_tokens) + # Jaccard Similarity: |A ∩ B| / |A ∪ B| + intersection_count = len(query_tokens.intersection(policy_tokens)) - if not union: + # Performance: Use mathematical formula for union length: |A ∪ B| = |A| + |B| - |A ∩ B| + # This avoids O(N) allocation and population of a new union set. + union_count = query_token_count + prepared['content_token_count'] - intersection_count + + if union_count == 0: continue - score = len(intersection) / len(union) + score = intersection_count / union_count # Boost score if title words match (weighted) title_tokens = prepared['title_tokens']