class APIMetadataCache:
    """A lightweight, persistent metadata cache backed by sqlite3.

    Entries are keyed by ``(api_url, entity_type, entity_id)``.  Each entry
    stores the ``modified`` timestamp it was recorded with, and a lookup only
    hits when the caller supplies that same timestamp.

    The ``DANDI_CACHE`` environment variable is read once, at construction
    time: ``ignore`` turns every operation into a no-op (no directory or
    database is created), ``clear`` empties the table right after opening.

    An enabled instance holds an open sqlite connection.  Call :meth:`close`
    when done, or use the instance as a context manager.

    Parameters
    ----------
    db_path : Path or None
        Explicit path for the sqlite database.  When *None* (the default) the
        database is placed under ``platformdirs.user_cache_dir("dandi")``.
    """

    def __init__(self, db_path: Path | None = None) -> None:
        if db_path is None:
            db_path = Path(user_cache_dir("dandi")) / "api_metadata_cache.sqlite"
        self._db_path = db_path
        # None while the cache is disabled and again after close().
        self._con: sqlite3.Connection | None = None

        dandi_cache = os.environ.get("DANDI_CACHE", "").lower()
        if dandi_cache == "ignore":
            # Checked before any filesystem access: a disabled cache must not
            # create directories or database files as a side effect.
            lgr.debug("DANDI_CACHE=ignore: API metadata cache disabled")
            self._enabled = False
            return

        db_path.parent.mkdir(parents=True, exist_ok=True)
        con = sqlite3.connect(str(db_path), check_same_thread=False)
        try:
            con.execute("PRAGMA journal_mode=WAL;")
            con.execute(_SCHEMA)
            con.commit()
        except Exception:
            # Do not leak the handle if the database cannot be initialised.
            con.close()
            raise
        self._con = con
        self._enabled = True

        if dandi_cache == "clear":
            lgr.debug("DANDI_CACHE=clear: clearing API metadata cache")
            self.clear()

    def __enter__(self) -> APIMetadataCache:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(
        self,
        api_url: str,
        entity_type: str,
        entity_id: str,
        modified: str,
    ) -> dict | None:
        """Return cached metadata if *modified* matches, else ``None``.

        Parameters
        ----------
        api_url : str
            Base URL of the DANDI API server.
        entity_type : str
            ``"dandiset"`` or ``"asset"``.
        entity_id : str
            Unique identifier for the entity (asset UUID, or
            ``"<identifier>/<version_id>"`` for Dandiset versions).
        modified : str
            ISO-8601 timestamp of the entity's last modification.  A cache
            hit is only returned when this value matches the stored entry.

        Returns
        -------
        dict or None
            The cached metadata dict, or ``None`` on a cache miss.  A stored
            row that cannot be decoded into a dict counts as a miss.
        """
        con = self._con
        if not self._enabled or con is None:
            return None
        row = con.execute(
            "SELECT metadata FROM metadata_cache "
            "WHERE api_url = ? AND entity_type = ? AND entity_id = ? "
            "AND modified = ?",
            (api_url, entity_type, entity_id, modified),
        ).fetchone()
        if row is None:
            return None
        try:
            metadata = json.loads(row[0])
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError.  A damaged row must behave
            # like a miss so the caller refetches and overwrites it.
            metadata = None
        if not isinstance(metadata, dict):
            lgr.debug(
                "API cache entry unusable, ignoring: %s %s %s",
                entity_type,
                entity_id,
                modified,
            )
            return None
        lgr.debug("API cache hit: %s %s %s", entity_type, entity_id, modified)
        return metadata

    def set(
        self,
        api_url: str,
        entity_type: str,
        entity_id: str,
        modified: str,
        metadata: dict,
    ) -> None:
        """Insert or replace a cache entry.

        Parameters
        ----------
        api_url : str
            Base URL of the DANDI API server.
        entity_type : str
            ``"dandiset"`` or ``"asset"``.
        entity_id : str
            Unique identifier for the entity.
        modified : str
            ISO-8601 timestamp of the entity's last modification.
        metadata : dict
            The raw metadata dict to cache; it is stored as JSON text.
        """
        con = self._con
        if not self._enabled or con is None:
            return
        payload = json.dumps(metadata)
        # The connection context manager commits on success and rolls back if
        # the statement raises.
        with con:
            con.execute(
                "INSERT OR REPLACE INTO metadata_cache "
                "(api_url, entity_type, entity_id, modified, metadata) "
                "VALUES (?, ?, ?, ?, ?)",
                (api_url, entity_type, entity_id, modified, payload),
            )
        lgr.debug("API cache set: %s %s %s", entity_type, entity_id, modified)

    def clear(self) -> None:
        """Delete all cached entries from the database."""
        con = self._con
        if not self._enabled or con is None:
            return
        with con:
            con.execute("DELETE FROM metadata_cache")
        lgr.debug("API metadata cache cleared")

    def close(self) -> None:
        """Close the underlying sqlite connection.

        Safe to call more than once.  After closing, :meth:`get`, :meth:`set`
        and :meth:`clear` become no-ops, exactly as for a disabled cache.
        """
        con, self._con = self._con, None
        self._enabled = False
        if con is not None:
            con.close()
def get_raw_metadata(self) -> dict[str, Any]:
    """
    Fetch the metadata for this version of the Dandiset as an unprocessed
    `dict`.

    When the client was created with ``cache=True``, results are served
    from a persistent on-disk cache whenever the version's ``modified``
    timestamp has not changed.

    :raises NotFoundError: if the server answers the metadata request with
        a 404
    """
    cache = self.client.cache
    if cache is not None:
        entity_id = f"{self.identifier}/{self.version_id}"
        # Read the timestamp BEFORE requesting the metadata.  Reading it
        # afterwards could pair metadata fetched before an edit with the
        # post-edit timestamp, and that stale entry would then be served as
        # fresh.  Going through ``self.version`` also lets objects whose
        # ``_version`` is not populated yet take part in the lookup.
        # NOTE(review): ``self.version`` presumably contacts the server when
        # the version info has not been loaded yet -- confirm.
        modified = self.version.modified.isoformat()
        cached = cache.get(self.client.api_url, "dandiset", entity_id, modified)
        if cached is not None:
            return cached
    try:
        data = self.client.get(self.version_api_path)
    except HTTP404Error:
        raise NotFoundError(f"No such Dandiset version: {self}")
    assert isinstance(data, dict)
    if cache is not None:
        cache.set(self.client.api_url, "dandiset", entity_id, modified, data)
    return data
@pytest.mark.ai_generated
class TestAPIMetadataCache:
    """Behavioural checks for `APIMetadataCache`, each on a fresh database."""

    API_URL = "https://api.dandiarchive.org/api"
    ASSET_ID = "abc-123"
    EARLIER = "2024-01-01T00:00:00Z"
    LATER = "2024-06-15T12:00:00Z"

    @staticmethod
    def _fresh_cache(tmp_path: Path) -> APIMetadataCache:
        """Open a cache stored inside the per-test temporary directory."""
        return APIMetadataCache(db_path=tmp_path / "cache.sqlite")

    def test_cache_miss(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        assert store.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER) is None

    def test_set_then_get(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        payload = {"name": "test-asset", "size": 42}
        store.set(self.API_URL, "asset", self.ASSET_ID, self.EARLIER, payload)
        fetched = store.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER)
        assert fetched == payload

    def test_stale_modified_returns_none(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        payload = {"name": "test-asset", "size": 42}
        store.set(self.API_URL, "asset", self.ASSET_ID, self.EARLIER, payload)
        # Asking with another timestamp must not return the stored entry.
        assert store.get(self.API_URL, "asset", self.ASSET_ID, self.LATER) is None

    def test_update_replaces_entry(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        for stamp, payload in ((self.EARLIER, {"v": 1}), (self.LATER, {"v": 2})):
            store.set(self.API_URL, "asset", self.ASSET_ID, stamp, payload)
        # The first timestamp was overwritten ...
        assert store.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER) is None
        # ... and only the second one resolves.
        latest = store.get(self.API_URL, "asset", self.ASSET_ID, self.LATER)
        assert latest == {"v": 2}

    def test_clear(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        store.set(self.API_URL, "asset", self.ASSET_ID, self.EARLIER, {"a": 1})
        store.clear()
        assert store.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER) is None

    def test_different_entity_types(self, tmp_path: Path) -> None:
        store = self._fresh_cache(tmp_path)
        kinds = ("asset", "dandiset")
        for kind in kinds:
            store.set(self.API_URL, kind, "id1", self.EARLIER, {"type": kind})
        for kind in kinds:
            assert store.get(self.API_URL, kind, "id1", self.EARLIER) == {
                "type": kind
            }

    def test_dandi_cache_ignore(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.setenv("DANDI_CACHE", "ignore")
        store = self._fresh_cache(tmp_path)
        store.set(self.API_URL, "asset", self.ASSET_ID, self.EARLIER, {"a": 1})
        # A disabled cache drops writes, so the read still misses.
        assert store.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER) is None

    def test_dandi_cache_clear(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        # Fill the database while DANDI_CACHE is not in effect.
        populated = self._fresh_cache(tmp_path)
        populated.set(self.API_URL, "asset", self.ASSET_ID, self.EARLIER, {"a": 1})
        stored = populated.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER)
        assert stored == {"a": 1}

        # Reopening the same file with DANDI_CACHE=clear must drop that entry.
        monkeypatch.setenv("DANDI_CACHE", "clear")
        reopened = self._fresh_cache(tmp_path)
        assert (
            reopened.get(self.API_URL, "asset", self.ASSET_ID, self.EARLIER) is None
        )
autoclass:: APIMetadataCache + :members: diff --git a/docs/source/modref/dandiapi.rst b/docs/source/modref/dandiapi.rst index 8212711d0..002cf97eb 100644 --- a/docs/source/modref/dandiapi.rst +++ b/docs/source/modref/dandiapi.rst @@ -36,6 +36,35 @@ be passed to functions of pynwb etc. You can see more usages of DANDI API to assist with data streaming at `PyNWB: Streaming NWB files `_. +Metadata Caching +---------------- + +`DandiAPIClient` supports optional persistent caching of metadata returned by +`RemoteDandiset.get_raw_metadata()` and `BaseRemoteAsset.get_raw_metadata()`. +When enabled, metadata is stored in a local sqlite3 database and validated +against ``modified`` timestamps — no extra API calls are needed to check +freshness. + +Enable caching by passing ``cache=True`` when constructing the client:: + + from dandi.dandiapi import DandiAPIClient + + with DandiAPIClient("https://api.dandiarchive.org/api", cache=True) as client: + ds = client.get_dandiset("000027") + # First call hits the API and stores the result: + meta = ds.get_raw_metadata() + # Subsequent calls with the same modified timestamp are served from cache: + meta = ds.get_raw_metadata() + +The cache is stored at +``platformdirs.user_cache_dir("dandi") / "api_metadata_cache.sqlite"`` +and is controlled by the :envvar:`DANDI_CACHE` environment variable: + +* ``DANDI_CACHE=ignore`` — disables the cache entirely. +* ``DANDI_CACHE=clear`` — wipes existing entries when the cache is opened. + +See `~dandi.apicache.APIMetadataCache` for implementation details. + Client ------ diff --git a/docs/source/modref/index.rst b/docs/source/modref/index.rst index 373b48502..f218e10a3 100644 --- a/docs/source/modref/index.rst +++ b/docs/source/modref/index.rst @@ -49,6 +49,7 @@ Support functionality .. toctree:: + apicache consts utils support.digests