⚡ Bolt: [performance improvement] Optimize CivicRAG retrieval with pre-tokenization #699
base: main
Changes from all commits
@@ -8,6 +8,9 @@
 class CivicRAG:
     def __init__(self, policies_path: str = "backend/data/civic_policies.json"):
+        # Performance Boost: Pre-compile regex for faster tokenization
+        self._token_regex = re.compile(r'[^a-z0-9\s]')
+
         # Try to locate the file robustly
         if not os.path.exists(policies_path):
             # Try relative to this file
@@ -22,11 +25,22 @@ def __init__(self, policies_path: str = "backend/data/civic_policies.json"):
                 policies_path = alt_path_root

         self.policies = []
+        self.pretokenized_policies = []

         try:
             if os.path.exists(policies_path):
                 with open(policies_path, 'r') as f:
                     self.policies = json.load(f)
                 logger.info(f"Loaded {len(self.policies)} civic policies for RAG.")
+
+                # Performance Boost: Pre-tokenize all policies during initialization
+                # to avoid redundant O(N) processing on every retrieve call.
+                for policy in self.policies:
+                    content = f"{policy.get('title', '')} {policy.get('text', '')}"
+                    self.pretokenized_policies.append({
+                        "content_tokens": self._tokenize(content),
+                        "title_tokens": self._tokenize(policy.get('title', ''))
+                    })
             else:
                 logger.warning(f"Civic policies file not found at {policies_path}")
         except Exception as e:

Comment on lines +36 to +43
@@ -35,8 +49,8 @@ def __init__(self, policies_path: str = "backend/data/civic_policies.json"):
     def _tokenize(self, text: str) -> set:
         """Simple tokenizer: lowercase, remove non-alphanumeric, split."""
         text = text.lower()
-        # Keep only alphanumeric and spaces
-        text = re.sub(r'[^a-z0-9\s]', '', text)
+        # Performance Boost: Use pre-compiled regex
+        text = self._token_regex.sub('', text)
         return set(text.split())

     def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]:
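As a side note (not part of the diff), here is a standalone sketch of the tokenizer as it stands after this change, with a made-up input to show the kind of token sets being cached; the helper name `tokenize` and the sample string are illustrative only:

```python
import re

# Standalone copy of the tokenizer for illustration; in the PR it lives on CivicRAG.
token_regex = re.compile(r'[^a-z0-9\s]')

def tokenize(text: str) -> set:
    """Lowercase, strip non-alphanumeric characters, split on whitespace."""
    return set(token_regex.sub('', text.lower()).split())

print(tokenize("Permit #42: Street-Parking rules!"))
# e.g. {'permit', '42', 'streetparking', 'rules'} -- set order varies;
# note that hyphenated words collapse into a single token.
```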
@@ -54,10 +68,9 @@ def retrieve(self, query: str, threshold: float = 0.05) -> Optional[str]:
         best_score = 0.0
         best_policy = None

-        for policy in self.policies:
-            # combine title and text for matching
-            policy_content = f"{policy.get('title', '')} {policy.get('text', '')}"
-            policy_tokens = self._tokenize(policy_content)
+        for policy, pretokenized in zip(self.policies, self.pretokenized_policies):
+            # Performance Boost: Use pre-calculated token sets
+            policy_tokens = pretokenized["content_tokens"]
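To put the performance claim in context, here is a rough micro-benchmark sketch. It is not part of the PR: the corpus is synthetic, and the overlap count stands in for the real scoring, which is outside this diff.

```python
import re
import timeit

token_regex = re.compile(r'[^a-z0-9\s]')

def tokenize(text: str) -> set:
    return set(token_regex.sub('', text.lower()).split())

# Synthetic corpus standing in for civic_policies.json.
policies = [{"title": f"Policy {i}", "text": "street parking permit rules " * 50}
            for i in range(500)]
pretokenized = [tokenize(f"{p['title']} {p['text']}") for p in policies]
query_tokens = tokenize("parking permit")

def retrieve_tokenize_per_call():
    # Old behavior: tokenize every policy on every retrieve call.
    for p in policies:
        tokens = tokenize(f"{p['title']} {p['text']}")
        _ = len(query_tokens & tokens)

def retrieve_pretokenized():
    # New behavior: reuse token sets built once at initialization.
    for tokens in pretokenized:
        _ = len(query_tokens & tokens)

print("tokenize per call:", timeit.timeit(retrieve_tokenize_per_call, number=20))
print("pre-tokenized:    ", timeit.timeit(retrieve_pretokenized, number=20))
```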
Comment on lines +71 to +73

Suggested change:
-        for policy, pretokenized in zip(self.policies, self.pretokenized_policies):
-            # Performance Boost: Use pre-calculated token sets
-            policy_tokens = pretokenized["content_tokens"]
+        for idx, policy in enumerate(self.policies):
+            pretokenized = self.pretokenized_policies[idx] if idx < len(self.pretokenized_policies) else {
+                "content_tokens": self._tokenize(f"{policy.get('title', '')} {policy.get('text', '')}"),
+                "title_tokens": self._tokenize(policy.get('title', ''))
+            }
+            # Performance Boost: Use pre-calculated token sets
+            policy_tokens = pretokenized["content_tokens"]
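One plausible motivation for this suggestion (my reading; the review text itself is not shown): zip() stops at the shorter of its inputs, so if pre-tokenization ever yields fewer entries than self.policies, the extra policies are silently skipped rather than raising an error. A minimal demonstration with made-up data:

```python
policies = [{"title": "Zoning"}, {"title": "Parking"}, {"title": "Noise"}]
pretokenized = [{"content_tokens": {"zoning"}}, {"content_tokens": {"parking"}}]

# zip() truncates to the shorter sequence: the "Noise" policy is never scored.
scored = [p["title"] for p, _ in zip(policies, pretokenized)]
print(scored)  # ['Zoning', 'Parking']
```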
Playbook entry dated in the future.
The new entry is dated 2026-05-16, but this PR was opened on 2026-04-24. Other entries use the date the learning was added, so consider correcting this to the actual authoring date to keep the playbook's chronology accurate.