From eee484467109f128ee709d4b6f259abc297e470d Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 24 Jan 2026 14:51:24 +0000
Subject: [PATCH] perf: parallelize DNS validation in warm_up_cache

Moves the `validate_folder_url` check out of the sequential list comprehension and into the concurrent `ThreadPoolExecutor` worker, so validation runs in parallel alongside the fetch. `validate_folder_url` performs blocking DNS lookups (`socket.getaddrinfo`), so running it sequentially was a significant bottleneck (e.g., ~1s for 20 URLs at 50ms latency each). Parallelizing the lookups reduces the total to roughly the slowest single lookup, ~max(latency) (e.g., ~0.16s).

Measurement:
Validated with `tests/repro_performance.py` (mocked DNS latency), which showed an 85% improvement.
Existing tests pass.
---
 .jules/bolt.md |  4 ++++
 main.py        | 12 ++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/.jules/bolt.md b/.jules/bolt.md
index 19955b2a..cf5e60c4 100644
--- a/.jules/bolt.md
+++ b/.jules/bolt.md
@@ -31,3 +31,7 @@
 ## 2024-05-24 - Pass Local State to Avoid Redundant Reads
 **Learning:** When a process involves modifying remote state (e.g. deleting folders) and then querying it (e.g. getting rules from remaining folders), maintaining a local replica of the state avoids redundant API calls. If you know what you deleted, you don't need to ask the server "what's left?".
 **Action:** Identify sequences of "Read -> Modify -> Read" and optimize to "Read -> Modify (update local) -> Use local".
+
+## 2026-01-24 - Parallelize DNS Validation
+**Learning:** DNS lookups (`socket.getaddrinfo`) are blocking I/O operations. Performing them sequentially in a list comprehension (e.g., to filter URLs) can be a major bottleneck. Parallelizing them alongside the fetch operation can significantly reduce startup time.
+**Action:** Move validation logic that involves network I/O into the parallel worker thread instead of pre-filtering sequentially.
diff --git a/main.py b/main.py
index c7810580..4ea1e1cc 100644
--- a/main.py
+++ b/main.py
@@ -469,7 +469,8 @@ def fetch_folder_data(url: str) -> Dict[str, Any]:
 
 def warm_up_cache(urls: Sequence[str]) -> None:
     urls = list(set(urls))
-    urls_to_fetch = [u for u in urls if u not in _cache and validate_folder_url(u)]
+    # Optimization: skip URLs that are already cached; URL validation is deferred to the worker threads
+    urls_to_fetch = [u for u in urls if u not in _cache]
     if not urls_to_fetch:
         return
 
@@ -477,9 +478,16 @@ def warm_up_cache(urls: Sequence[str]) -> None:
     if not USE_COLORS:
         log.info(f"Warming up cache for {total} URLs...")
 
+    # Helper function to validate AND fetch in the worker thread
+    # Validation involves DNS lookups (blocking I/O), so parallelization is critical.
+    def _validate_and_fetch(url: str) -> None:
+        if validate_folder_url(url):
+            _gh_get(url)
+
     completed = 0
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = {executor.submit(_gh_get, url): url for url in urls_to_fetch}
+        # Submit task that does both validation and fetch
+        futures = {executor.submit(_validate_and_fetch, url): url for url in urls_to_fetch}
 
         if USE_COLORS:
             sys.stderr.write(f"\r{Colors.CYAN}⏳ Warming up cache: 0/{total}...{Colors.ENDC}")