From 6f20dc95b494c0b67f9d876f8b1907d79c231f60 Mon Sep 17 00:00:00 2001 From: RohanExploit <178623867+RohanExploit@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:01:45 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20[performance=20improvement]?= =?UTF-8?q?=20Optimize=20RAG=20retrieval=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pre-calculate policy token lengths during initialization. - Implement isdisjoint() early exit for non-matching policies. - Optimize Jaccard similarity using mathematical union length formula to avoid set construction overhead. - Use isdisjoint() for faster title match bonus check. --- .jules/bolt.md | 8 ++++++++ backend/rag_service.py | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 02a6e1a2..7b60ebc6 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -85,3 +85,11 @@ ## 2026-05-16 - Pre-processing for RAG Retrieval **Learning:** In RAG (Retrieval-Augmented Generation) systems with static or semi-static policy datasets, performing tokenization, regex substitution, and string formatting inside the retrieval loop is a significant bottleneck that scales with the number of policies. **Action:** Move all deterministic operations (tokenization, formatting, regex matching prep) to a one-time initialization step to ensure the retrieval hot-path only performs necessary set intersections and similarity calculations. + +## 2025-05-18 - Optimized Jaccard Similarity for RAG +**Learning:** Calculating Jaccard similarity in a hot loop can be optimized by using the inclusion-exclusion principle (|A ∪ B| = |A| + |B| - |A ∩ B|) to avoid the overhead of set union construction. Combining this with `isdisjoint()` for early exits significantly reduces CPU cycles for non-matching documents. +**Action:** Use mathematical union length and `isdisjoint()` for set similarity comparisons in high-frequency retrieval paths. 
+ +## 2025-05-18 - Optimized Jaccard Similarity for RAG +**Learning:** Calculating Jaccard similarity in a hot loop can be optimized by using the inclusion-exclusion principle (|A ∪ B| = |A| + |B| - |A ∩ B|) to avoid the overhead of set union construction. Combining this with `isdisjoint()` for early exits significantly reduces CPU cycles for non-matching documents. +**Action:** Use mathematical union length and `isdisjoint()` for set similarity comparisons in high-frequency retrieval paths. diff --git a/backend/rag_service.py b/backend/rag_service.py index 0793943d..cb98a56f 100644 --- a/backend/rag_service.py +++ b/backend/rag_service.py @@ -46,10 +46,12 @@ def _prepare_policies(self): source = policy.get('source', 'Unknown') content = f"{title} {text}" + content_tokens = self._tokenize(content) self._prepared_policies.append({ 'title_tokens': self._tokenize(title), - 'content_tokens': self._tokenize(content), + 'content_tokens': content_tokens, + 'content_tokens_len': len(content_tokens), 'formatted': f"**{title}**: {text} (Source: {source})", 'original': policy }) @@ -73,30 +75,33 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]: if not query_tokens: return None + query_tokens_len = len(query_tokens) best_score = 0.0 best_formatted = None for prepared in self._prepared_policies: policy_tokens = prepared['content_tokens'] - if not policy_tokens: + # Optimization: Use isdisjoint() for fast early exit + if query_tokens.isdisjoint(policy_tokens): continue # Jaccard Similarity + # Optimization: Mathematical union length |A union B| = |A| + |B| - |A intersection B| + # This avoids the overhead of building a new set with .union() intersection = query_tokens.intersection(policy_tokens) - # Use pre-calculated set for union if possible? - # Union depends on query_tokens, so must be calculated. 
- union = query_tokens.union(policy_tokens) + intersection_len = len(intersection) - if not union: + union_len = query_tokens_len + prepared['content_tokens_len'] - intersection_len + + if union_len == 0: continue - score = len(intersection) / len(union) + score = intersection_len / union_len # Boost score if title words match (weighted) - title_tokens = prepared['title_tokens'] - title_match = len(query_tokens.intersection(title_tokens)) - if title_match > 0: + # Optimization: Use isdisjoint() for faster boolean check + if not query_tokens.isdisjoint(prepared['title_tokens']): score += 0.2 # Bonus for title match if score > best_score: