diff --git a/README.md b/README.md index 163a56ee..4947b32d 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,31 @@ https://controld.com/dashboard/profiles/741861frakbm/filters - Python 3.13+ - Runtime dependencies (install with `pip install -r requirements.txt` or `uv sync`) +## Blocklist Cache + +ctrld-sync maintains a persistent on-disk cache of downloaded blocklist data to speed up subsequent syncs. + +### Cache location + +| Platform | Default path | +|----------|-------------| +| Linux / Unix | `~/.cache/ctrld-sync/blocklists.json` (or `$XDG_CACHE_HOME/ctrld-sync/blocklists.json`) | +| macOS | `~/Library/Caches/ctrld-sync/blocklists.json` | +| Windows | `%LOCALAPPDATA%\ctrld-sync\cache\blocklists.json` | + +### How it works + +1. **Within TTL (24 hours):** cached data is returned immediately—no HTTP request is made. +2. **TTL expired:** a conditional request is sent using `If-None-Match` (ETag) or `If-Modified-Since`. A `304 Not Modified` response reuses the cached data with no download. +3. **Changed or missing:** the full blocklist is downloaded and the cache is updated. + +### Cache CLI flags + +```bash +python main.py --clear-cache # delete the cache file and exit +python main.py --no-cache # disable the cache for this run (data is fetched fresh, cache is not updated) +``` + ## Testing This project includes a comprehensive test suite to ensure code quality and correctness. 
diff --git a/main.py b/main.py index c835ff1b..0b327f4c 100644 --- a/main.py +++ b/main.py @@ -665,6 +665,7 @@ def _api_client() -> httpx.Client: CACHE_TTL_SECONDS = 24 * 60 * 60 # 24 hours: within TTL, serve from disk without HTTP request _disk_cache: Dict[str, Dict[str, Any]] = {} # Loaded from disk on startup _cache_stats = {"hits": 0, "misses": 0, "validations": 0, "errors": 0} +_no_cache: bool = False # Set to True when --no-cache flag is passed _api_stats = {"control_d_api_calls": 0, "blocklist_fetches": 0} # --------------------------------------------------------------------------- # @@ -788,7 +789,11 @@ def save_disk_cache() -> None: SECURITY: Creates cache directory with user-only permissions (0o700) to prevent other users from reading cached blocklist data. + + No-op when --no-cache is active. """ + if _no_cache: + return try: cache_dir = get_cache_dir() cache_dir.mkdir(parents=True, exist_ok=True) @@ -1377,13 +1382,14 @@ def _gh_get(url: str) -> Dict: last_modified = r_retry.headers.get("Last-Modified") # Update disk cache with new data and headers - _disk_cache[url] = { - "data": data, - "etag": etag, - "last_modified": last_modified, - "fetched_at": time.time(), - "last_validated": time.time(), - } + if not _no_cache: + _disk_cache[url] = { + "data": data, + "etag": etag, + "last_modified": last_modified, + "fetched_at": time.time(), + "last_validated": time.time(), + } _cache_stats["misses"] += 1 return data @@ -1444,13 +1450,14 @@ def _gh_get(url: str) -> Dict: last_modified = r.headers.get("Last-Modified") # Update disk cache with new data and headers - _disk_cache[url] = { - "data": data, - "etag": etag, - "last_modified": last_modified, - "fetched_at": time.time(), - "last_validated": time.time(), - } + if not _no_cache: + _disk_cache[url] = { + "data": data, + "etag": etag, + "last_modified": last_modified, + "fetched_at": time.time(), + "last_validated": time.time(), + } _cache_stats["misses"] += 1 @@ -2421,6 +2428,9 @@ def parse_args() -> 
argparse.Namespace: parser.add_argument( "--clear-cache", action="store_true", help="Clear the persistent blocklist cache and exit" ) + parser.add_argument( + "--no-cache", action="store_true", help="Disable the persistent blocklist cache for this run" + ) return parser.parse_args() @@ -2438,7 +2448,7 @@ def main(): check_env_permissions() load_dotenv() - global TOKEN + global TOKEN, _no_cache # Re-initialize TOKEN to pick up values from .env (since load_dotenv was delayed) TOKEN = _clean_env_kv(os.getenv("TOKEN"), "TOKEN") @@ -2449,9 +2459,14 @@ def main(): # argument errors do not perform unnecessary filesystem I/O or logging. load_disk_cache() + # Handle --no-cache: disable disk cache for this run + if args.no_cache: + _no_cache = True + _disk_cache.clear() + log.info("Persistent disk cache disabled for this run (--no-cache)") + # Handle --clear-cache: delete cache file and exit immediately if args.clear_cache: - global _disk_cache cache_file = get_cache_dir() / "blocklists.json" if cache_file.exists(): try: diff --git a/tests/test_disk_cache.py b/tests/test_disk_cache.py index 6dd0ec07..f82a6e99 100644 --- a/tests/test_disk_cache.py +++ b/tests/test_disk_cache.py @@ -34,6 +34,8 @@ def setUp(self): main.validate_folder_url.cache_clear() # Reset stats main._cache_stats = {"hits": 0, "misses": 0, "validations": 0, "errors": 0} + # Ensure no-cache flag is off for each test + main._no_cache = False # Create temporary cache directory for testing self.temp_dir = tempfile.mkdtemp() @@ -45,6 +47,8 @@ def tearDown(self): main._disk_cache.clear() main.validate_folder_url.cache_clear() main._cache_stats = {"hits": 0, "misses": 0, "validations": 0, "errors": 0} + # Restore no-cache flag + main._no_cache = False # Clean up temp directory import shutil @@ -401,6 +405,63 @@ def test_clear_cache_deletes_file(self): # In-memory disk cache should be empty self.assertEqual(len(main._disk_cache), 0) + def test_no_cache_skips_disk_cache_write(self): + """Test that --no-cache 
prevents writing to disk cache.""" + test_url = "https://example.com/no-cache-test.json" + test_data = {"group": {"group": "Test"}, "domains": ["example.com"]} + + original_no_cache = main._no_cache + try: + main._no_cache = True + + def mock_stream(method, url, headers=None): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raise_for_status = MagicMock() + mock_response.headers = { + "Content-Type": "application/json", + "ETag": "etag999", + } + json_bytes = json.dumps(test_data).encode() + mock_response.iter_bytes = MagicMock(return_value=[json_bytes]) + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + return mock_response + + with patch.object(main._gh, 'stream', side_effect=mock_stream): + result = main._gh_get(test_url) + + # Data should be returned correctly + self.assertEqual(result, test_data) + # Disk cache should NOT have been updated + self.assertNotIn(test_url, main._disk_cache) + finally: + main._no_cache = original_no_cache + + def test_no_cache_skips_save(self): + """Test that save_disk_cache() is a no-op when --no-cache is active.""" + cache_dir = Path(self.temp_dir) + main.get_cache_dir = lambda: cache_dir + + main._disk_cache["https://example.com/test.json"] = { + "data": {"group": {"group": "Test"}, "domains": ["test.com"]}, + "etag": "xyz", + "last_modified": None, + "fetched_at": 1234567890.0, + "last_validated": 1234567890.0, + } + + original_no_cache = main._no_cache + try: + main._no_cache = True + main.save_disk_cache() + finally: + main._no_cache = original_no_cache + + # Cache file should NOT have been created + cache_file = cache_dir / "blocklists.json" + self.assertFalse(cache_file.exists()) + if __name__ == '__main__': unittest.main()