From a295b11f2c42d9d139d0081e710cd22229da091f Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 22 Feb 2026 15:01:33 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Optimize=20cache=20warm-up?= =?UTF-8?q?=20and=20fix=20validation=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Optimized `warm_up_cache` to skip DNS validation for fresh cache entries. - Fixed a bug in `validate_folder_data` where it returned `None` instead of `True`. - Fixed corrupted code in `main.py`. - Added `tests/test_warm_up_cache_perf.py` to verify optimization. Co-authored-by: abhimehro <84992105+abhimehro@users.noreply.github.com> --- .jules/bolt.md | 4 ++ main.py | 51 ++++++++++--- tests/test_warm_up_cache_perf.py | 118 +++++++++++++++++++++++++++++++ 3 files changed, 162 insertions(+), 11 deletions(-) create mode 100644 tests/test_warm_up_cache_perf.py diff --git a/.jules/bolt.md b/.jules/bolt.md index a538c1b2..a581afe3 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -69,3 +69,7 @@ ## 2024-03-24 - Thread Pool Churn **Learning:** Python's `ThreadPoolExecutor` incurs measurable overhead (thread creation/shutdown) when created/destroyed repeatedly inside loops, even with small worker counts. **Action:** Lift `ThreadPoolExecutor` creation to the highest possible scope and pass it down as a dependency (using `contextlib.nullcontext` for flexible ownership). + +## 2026-02-22 - [Stale Module References in Tests] +**Learning:** When tests manually reload modules (e.g. `del sys.modules['main']`), other test files that imported the module at the top level will hold references to the *stale* module object. `patch('module.func')` patches the *new* module in `sys.modules`, so the old functions (called by stale tests) remain unpatched. +**Action:** In test suites where module reloading occurs, avoid top-level imports of the reloaded module. Import it inside `setUp` or test methods using `sys.modules.get('main')` or `import main` locally. diff --git a/main.py b/main.py index 7e758616..41d2e634 100644 --- a/main.py +++ b/main.py @@ -680,6 +680,27 @@ def _api_client() -> httpx.Client: _rate_limit_lock = threading.Lock() # Protect _rate_limit_info updates +def _is_cache_fresh(url: str) -> bool: + """ + Checks if the URL is in the persistent cache and within TTL. + + This optimization allows skipping expensive DNS validation for + content that is already known to be safe (validated at fetch time). + """ + # Check in-memory cache first + with _cache_lock: + if url in _cache: + return True + + # Check disk cache + entry = _disk_cache.get(url) + if entry: + last_validated = entry.get("last_validated", 0) + if time.time() - last_validated < CACHE_TTL_SECONDS: + return True + return False + + def get_cache_dir() -> Path: """ Returns platform-specific cache directory for ctrld-sync. @@ -1131,19 +1152,22 @@ def validate_folder_data(data: Dict[str, Any], url: str) -> bool: ) return False if "rules" in rg: - if not isinstance (rg["rules"], list): - log. error ( - f"Invalid data from {sanitize_for_log(url)} : rule_groups[fil].rules must be a list." + if not isinstance(rg["rules"], list): + log.error( + f"Invalid data from {sanitize_for_log(url)}: rule_groups[{i}].rules must be a list." ) return False -# Ensure each rule within the group is an object (dict), -# because later code treats each rule as a mapping (e.g., rule.get(...)). -for j, rule in enumerate (rgi"rules"1): -if not isinstance (rule, dict): - log. error ( - f"Invalid data from {sanitize_for_log(u rl)}: rule_groups[fiłl.rules[kił] must be an object." - ) - return False + # Ensure each rule within the group is an object (dict), + # because later code treats each rule as a mapping (e.g., rule.get(...)). + for j, rule in enumerate(rg["rules"]): + if not isinstance(rule, dict): + log.error( + f"Invalid data from {sanitize_for_log(url)}: rule_groups[{i}].rules[{j}] must be an object." + ) + return False + + return True + # Lock to protect updates to _api_stats in multi-threaded contexts. # Without this, concurrent increments can lose updates because `+=` is not atomic. @@ -1788,6 +1812,11 @@ def warm_up_cache(urls: Sequence[str]) -> None: # OPTIMIZATION: Combine validation (DNS) and fetching (HTTP) in one task # to allow validation latency to be parallelized. def _validate_and_fetch(url: str): + # Optimization: Skip DNS validation if cache is fresh + # This saves blocking I/O for known-good content + if _is_cache_fresh(url): + return _gh_get(url) + if validate_folder_url(url): return _gh_get(url) return None diff --git a/tests/test_warm_up_cache_perf.py b/tests/test_warm_up_cache_perf.py new file mode 100644 index 00000000..0b50760f --- /dev/null +++ b/tests/test_warm_up_cache_perf.py @@ -0,0 +1,118 @@ +""" +Tests for the warm_up_cache optimization. + +This module verifies that warm_up_cache skips DNS validation when the +URL is already fresh in the disk cache. +""" +import unittest +from unittest.mock import patch, MagicMock +import sys +import os +import time + +# Add root to path to import main +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# We do NOT import main at top level to avoid holding a stale reference +# if other tests reload the module. + +class TestWarmUpCachePerf(unittest.TestCase): + def setUp(self): + """Reset cache state before each test.""" + # Get the current main module from sys.modules + # If not present, import it + if 'main' in sys.modules: + self.main = sys.modules['main'] + else: + import main + self.main = main + + self.main._cache.clear() + self.main._disk_cache.clear() + self.main.validate_folder_url.cache_clear() + + def tearDown(self): + """Clean up after each test.""" + self.main._cache.clear() + self.main._disk_cache.clear() + self.main.validate_folder_url.cache_clear() + + def test_warm_up_skips_validation_for_fresh_cache(self): + """ + Test that warm_up_cache does NOT call validate_folder_url + if the URL is present in _disk_cache and fresh (within TTL). + """ + test_url = "https://example.com/fresh.json" + test_data = {"group": {"group": "Fresh Folder"}, "rules": []} + + # Populate disk cache with a FRESH entry + with self.main._cache_lock: + self.main._disk_cache[test_url] = { + "data": test_data, + "fetched_at": time.time(), + "last_validated": time.time(), # Just now + "etag": "123", + "last_modified": "Tue, 15 Nov 1994 12:45:26 GMT" + } + + # Mock validate_folder_url to track calls + with patch('main.validate_folder_url') as mock_validate: + # Mock _gh_get to return data without network + with patch('main._gh_get', return_value=test_data) as mock_gh_get: + + self.main.warm_up_cache([test_url]) + + # VERIFICATION: validate_folder_url should NOT be called + mock_validate.assert_not_called() + + # _gh_get should still be called (it handles cache retrieval) + mock_gh_get.assert_called_with(test_url) + + def test_warm_up_calls_validation_for_stale_cache(self): + """ + Test that warm_up_cache DOES call validate_folder_url + if the URL is present in _disk_cache but STALE (expired TTL). + """ + test_url = "https://example.com/stale.json" + test_data = {"group": {"group": "Stale Folder"}, "rules": []} + + # Populate disk cache with a STALE entry (older than TTL) + stale_time = time.time() - (self.main.CACHE_TTL_SECONDS + 3600) # 1 hour past TTL + with self.main._cache_lock: + self.main._disk_cache[test_url] = { + "data": test_data, + "fetched_at": stale_time, + "last_validated": stale_time, + "etag": "123", + "last_modified": "Tue, 15 Nov 1994 12:45:26 GMT" + } + + with patch('main.validate_folder_url', return_value=True) as mock_validate: + with patch('main._gh_get', return_value=test_data): + + self.main.warm_up_cache([test_url]) + + # VERIFICATION: validate_folder_url SHOULD be called for stale entry + mock_validate.assert_called_with(test_url) + + def test_warm_up_calls_validation_for_missing_cache(self): + """ + Test that warm_up_cache DOES call validate_folder_url + if the URL is NOT present in _disk_cache. + """ + test_url = "https://example.com/missing.json" + test_data = {"group": {"group": "Missing Folder"}, "rules": []} + + # Ensure cache is empty + self.assertNotIn(test_url, self.main._disk_cache) + + with patch('main.validate_folder_url', return_value=True) as mock_validate: + with patch('main._gh_get', return_value=test_data): + + self.main.warm_up_cache([test_url]) + + # VERIFICATION: validate_folder_url SHOULD be called for missing entry + mock_validate.assert_called_with(test_url) + +if __name__ == '__main__': + unittest.main()