Merged
4 changes: 4 additions & 0 deletions .jules/bolt.md
@@ -35,3 +35,7 @@
## 2024-05-24 - Parallelize DNS Validation
**Learning:** DNS lookups (`socket.getaddrinfo`) are blocking I/O operations. Performing them sequentially in a list comprehension (e.g., to filter URLs) can be a major bottleneck. Parallelizing them alongside the fetch operation can significantly reduce startup time.
**Action:** Move validation logic that involves network I/O into the parallel worker thread instead of pre-filtering sequentially.

## 2026-01-27 - Redundant Validation for Cached Data
**Learning:** Re-validating resource properties (like DNS/IP) when using *cached content* is pure overhead. If the content is served from memory (proven safe at fetch time), checking the *current* state of the source is disconnected from the data being used.
**Action:** When using a multi-stage pipeline (Warmup -> Process), ensure validation state persists alongside the data cache. Avoid clearing validation caches between stages if the data cache is not also cleared.
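One way to realize this (a sketch, not code from this PR) is to keep the payload and its validation state in the same structure, so clearing one necessarily clears the other:

```python
import threading
from typing import Any, Dict, Optional


class ValidatedCache:
    """Cache whose entries imply their source URL was validated at fetch time."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._entries: Dict[str, Any] = {}

    def put(self, url: str, data: Any) -> None:
        # Called only after validation succeeded at fetch time (warm-up stage).
        with self._lock:
            self._entries[url] = data

    def get(self, url: str) -> Optional[Any]:
        # A hit implies the content was validated when fetched; the process
        # stage need not re-check the source's *current* DNS/IP state.
        with self._lock:
            return self._entries.get(url)

    def clear(self) -> None:
        # Data and validation state live together, so they stay in sync.
        with self._lock:
            self._entries.clear()
```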
99 changes: 61 additions & 38 deletions main.py
@@ -24,6 +24,7 @@
import socket
import stat
import sys
import threading
import time
from functools import lru_cache
from typing import Any, Callable, Dict, List, Optional, Sequence, Set
@@ -315,13 +316,16 @@
# 3. Helpers
# --------------------------------------------------------------------------- #
_cache: Dict[str, Dict] = {}
# Use RLock (reentrant lock) to allow nested acquisitions by the same thread
# This prevents deadlocks when _fetch_if_valid calls fetch_folder_data which calls _gh_get
_cache_lock = threading.RLock()

@lru_cache(maxsize=128)
def validate_folder_url(url: str) -> bool:
"""
Validates a folder URL.
Cached to avoid repeated DNS lookups (socket.getaddrinfo) for the same URL
during warm-up and sync phases.
"""
if not url.startswith("https://"):
@@ -485,7 +489,7 @@
def _api_get(client: httpx.Client, url: str) -> httpx.Response:
return _retry_request(lambda: client.get(url))


def _api_delete(client: httpx.Client, url: str) -> httpx.Response:
def _api_delete(client: httpx.Client, url: str) -> httpx.Response:
return _retry_request(lambda: client.delete(url))

@@ -521,51 +525,62 @@
)
time.sleep(wait_time)


def _gh_get(url: str) -> Dict:
    # First check: Quick check without holding lock for long
    with _cache_lock:
        if url in _cache:
            return _cache[url]

    # Fetch data if not cached
    # Explicitly let HTTPError propagate (no need to catch just to re-raise)
    with _gh.stream("GET", url) as r:
        r.raise_for_status()

        # 1. Check Content-Length header if present
        cl = r.headers.get("Content-Length")
        if cl:
            try:
                if int(cl) > MAX_RESPONSE_SIZE:
                    raise ValueError(
                        f"Response too large from {sanitize_for_log(url)} "
                        f"({int(cl) / (1024 * 1024):.2f} MB)"
                    )
            except ValueError as e:
                # Only catch the conversion error, let the size error propagate
                if "Response too large" in str(e):
                    raise e
                log.warning(
                    f"Malformed Content-Length header from {sanitize_for_log(url)}: {cl!r}. "
                    "Falling back to streaming size check."
                )

        # 2. Stream and check actual size
        chunks = []
        current_size = 0
        for chunk in r.iter_bytes():
            current_size += len(chunk)
            if current_size > MAX_RESPONSE_SIZE:
                raise ValueError(
                    f"Response too large from {sanitize_for_log(url)} "
                    f"(> {MAX_RESPONSE_SIZE / (1024 * 1024):.2f} MB)"
                )
            chunks.append(chunk)

    try:
        data = json.loads(b"".join(chunks))
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid JSON response from {sanitize_for_log(url)}"
        ) from e

    # Double-checked locking: Check again after fetch to avoid duplicate fetches.
    # If another thread already cached it while we were fetching, use theirs
    # for consistency (return _cache[url] instead of data to ensure a single source of truth)
    with _cache_lock:
        if url not in _cache:
            _cache[url] = data
        return _cache[url]


def check_api_access(client: httpx.Client, profile_id: str) -> bool:
@@ -693,7 +708,8 @@

def warm_up_cache(urls: Sequence[str]) -> None:
urls = list(set(urls))
urls_to_process = [u for u in urls if u not in _cache]
with _cache_lock:
urls_to_process = [u for u in urls if u not in _cache]
if not urls_to_process:
return

@@ -719,7 +735,7 @@
for future in concurrent.futures.as_completed(futures):
completed += 1
render_progress_bar(completed, total, "Warming up cache", prefix="⏳")
try:
future.result()
except Exception as e:
if USE_COLORS:
@@ -1038,6 +1054,13 @@
# OPTIMIZATION: Move validation inside the thread pool to parallelize DNS lookups.
# Previously, sequential validation blocked the main thread.
def _fetch_if_valid(url: str):
# Optimization: If we already have the content in cache, return it directly.
# The content was validated at the time of fetch (warm_up_cache).
# Read directly from cache to avoid calling fetch_folder_data while holding lock.
with _cache_lock:
if url in _cache:
return _cache[url]
Comment on lines +1057 to +1062

Copilot AI Jan 27, 2026

Critical security vulnerability: This optimization bypasses validation that was explicitly cleared for security reasons. At line 1048, `validate_folder_url.cache_clear()` is called to prevent TOCTOU issues. However, this optimization returns cached content without re-validating the URL, defeating the security measure.

The comment claims "The content was validated at the time of fetch (`warm_up_cache`)", but that validation was already cleared by line 1048. This means:

1. If a domain's DNS changes between `warm_up_cache` and `sync_profile`, it won't be detected
2. If multiple profiles are synced, the second and later profiles will use cached content without any validation at all
3. An attacker could exploit this by timing DNS changes to bypass validation

The validation cache clear at line 1048 exists specifically to ensure fresh validation for each sync run. This optimization must either:

• Not skip validation when the validation cache has been cleared, OR
• Not clear the validation cache if using this optimization (which would require careful security analysis)

Simply removing the `cache_clear()` is not safe without understanding why it was added for TOCTOU prevention.


if validate_folder_url(url):
return fetch_folder_data(url)
return None