-
Notifications
You must be signed in to change notification settings - Fork 35
⚡ Bolt: optimize RAG retrieval performance #712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,10 +46,12 @@ def _prepare_policies(self): | |
| source = policy.get('source', 'Unknown') | ||
|
|
||
| content = f"{title} {text}" | ||
| content_tokens = self._tokenize(content) | ||
|
|
||
| self._prepared_policies.append({ | ||
| 'title_tokens': self._tokenize(title), | ||
| 'content_tokens': self._tokenize(content), | ||
| 'content_tokens': content_tokens, | ||
| 'content_token_count': len(content_tokens), | ||
| 'formatted': f"**{title}**: {text} (Source: {source})", | ||
| 'original': policy | ||
| }) | ||
|
|
@@ -65,12 +67,14 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]: | |
| """ | ||
| Retrieve the most relevant policy based on Jaccard similarity of tokens. | ||
| Returns the formatted policy string or None if below threshold. | ||
| Optimized: Uses pre-calculated token lengths and mathematical union to avoid O(N) union. | ||
| """ | ||
| if not query or not self._prepared_policies: | ||
| return None | ||
|
|
||
| query_tokens = self._tokenize(query) | ||
| if not query_tokens: | ||
| query_token_count = len(query_tokens) | ||
| if query_token_count == 0: | ||
| return None | ||
|
|
||
| best_score = 0.0 | ||
|
|
@@ -79,19 +83,21 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]: | |
| for prepared in self._prepared_policies: | ||
| policy_tokens = prepared['content_tokens'] | ||
|
|
||
| if not policy_tokens: | ||
| # Performance: Use isdisjoint for fast early-exit when there is no overlap | ||
| if query_tokens.isdisjoint(policy_tokens): | ||
| continue | ||
|
|
||
| # Jaccard Similarity | ||
| intersection = query_tokens.intersection(policy_tokens) | ||
| # Use pre-calculated set for union if possible? | ||
| # Union depends on query_tokens, so must be calculated. | ||
| union = query_tokens.union(policy_tokens) | ||
| # Jaccard Similarity: |A ∩ B| / |A ∪ B| | ||
| intersection_count = len(query_tokens.intersection(policy_tokens)) | ||
|
|
||
| if not union: | ||
| # Performance: Use mathematical formula for union length: |A ∪ B| = |A| + |B| - |A ∩ B| | ||
| # This avoids O(N) allocation and population of a new union set. | ||
| union_count = query_token_count + prepared['content_token_count'] - intersection_count | ||
|
Comment on lines
+90
to
+95
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replace the Unicode union symbol in the comment. Ruff is already flagging it. Suggested tweak: - # Jaccard Similarity: |A ∩ B| / |A ∪ B|
+ # Jaccard Similarity: |A ∩ B| / |A union B| 🧰 Tools 🪛 Ruff (0.15.12) [warning] 90-90: Comment contains ambiguous `∪` (RUF003) [warning] 93-93: Comment contains ambiguous `∪` (RUF003) 🤖 Prompt for AI Agents |
||
|
|
||
| if union_count == 0: | ||
| continue | ||
|
|
||
| score = len(intersection) / len(union) | ||
| score = intersection_count / union_count | ||
|
|
||
| # Boost score if title words match (weighted) | ||
| title_tokens = prepared['title_tokens'] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid future-dating this note.
Line 89 uses
2026-05-18, which is after this PR’s current date. That makes the note order look inconsistent and can confuse readers/tools that sort these entries chronologically. 🤖 Prompt for AI Agents