From aa84f4537f619b9a3eb6267a11a2cf7b44b0e18a Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 10:35:25 +0100
Subject: [PATCH 1/8] Refactor dataset handling in _dataset.py

---
 src/cellcharter/datasets/_dataset.py | 115 +++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 8 deletions(-)

diff --git a/src/cellcharter/datasets/_dataset.py b/src/cellcharter/datasets/_dataset.py
index 7e35b7f..f60f6ee 100644
--- a/src/cellcharter/datasets/_dataset.py
+++ b/src/cellcharter/datasets/_dataset.py
@@ -1,18 +1,117 @@
-from copy import copy
+from __future__ import annotations
 
-from squidpy.datasets._utils import AMetadata
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlparse
+from urllib.request import Request, urlopen
 
-_codex_mouse_spleen = AMetadata(
+import anndata as ad
+
+try:
+    from tqdm.auto import tqdm
+except ImportError:  # pragma: no cover - optional dependency
+    tqdm = None
+
+
+@dataclass(frozen=True)
+class DatasetMetadata:
+    name: str
+    doc_header: str
+    shape: tuple[int, int]
+    url: str
+    filename: str
+
+
+_codex_mouse_spleen = DatasetMetadata(
     name="codex_mouse_spleen",
     doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
     "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
     shape=(707474, 29),
-    url="https://figshare.com/ndownloader/files/38538101",
+    url="https://ndownloader.figshare.com/files/38538101",
+    filename="codex_mouse_spleen.h5ad",
 )
 
-for name, var in copy(locals()).items():
-    if isinstance(var, AMetadata):
-        var._create_function(name, globals())
+
+def _default_datasets_dir() -> Path:
+    return Path(os.environ.get("CELLCHARTER_DATA_DIR", Path.home() / ".cache" / "cellcharter" / "datasets"))
+
+
+def _is_hdf5_file(path: Path) -> bool:
+    if not path.exists() or path.stat().st_size < 8:
+        return False
+    with path.open("rb") as input_file:
+        return input_file.read(8) == b"\x89HDF\r\n\x1a\n"
+
+
+def _normalize_figshare_url(url: str) -> str:
+    parsed = urlparse(url)
+    if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"):
+        return f"https://ndownloader.figshare.com{parsed.path}"
+    return url
+
+
+def _download_file(url: str, destination: Path) -> None:
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")
+
+    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
+    progress = None
+    try:
+        with urlopen(request) as response, tmp_destination.open("wb") as output:
+            content_length = response.headers.get("Content-Length")
+            total = int(content_length) if content_length and content_length.isdigit() else None
+            if tqdm is not None:
+                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")
+
+            chunk_size = 1024 * 1024
+            while True:
+                chunk = response.read(chunk_size)
+                if not chunk:
+                    break
+                output.write(chunk)
+                if progress is not None:
+                    progress.update(len(chunk))
+    except (HTTPError, URLError) as error:
+        if tmp_destination.exists():
+            tmp_destination.unlink()
+        raise RuntimeError(f"Failed to download dataset from {url}.") from error
+    finally:
+        if progress is not None:
+            progress.close()
+
+    if not _is_hdf5_file(tmp_destination):
+        tmp_destination.unlink(missing_ok=True)
+        raise RuntimeError(
+            f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL."
+        )
+
+    tmp_destination.replace(destination)
+
+
+def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
+    base_dir = _default_datasets_dir() if path is None else Path(path)  # None -> default cache directory
+    if base_dir.suffix and not base_dir.is_dir():  # existing dirs may contain dots in their name
+        return base_dir
+    return base_dir / metadata.filename
+
+
+def _fetch_dataset_file(
+    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
+) -> Path:
+    dataset_path = _resolve_dataset_path(metadata, path=path)
+    should_download = force_download or not dataset_path.exists() or not _is_hdf5_file(dataset_path)
+    if should_download:
+        dataset_path.unlink(missing_ok=True)
+        _download_file(metadata.url, dataset_path)
+    return dataset_path
+
+
+def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
+    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__."""
+    dataset_path = _fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download)
+    return ad.read_h5ad(dataset_path)
 
 
-__all__ = ["codex_mouse_spleen"]  # noqa: F822
+__all__ = ["codex_mouse_spleen"]

From aa683cc32db0a5a70f0efcab3348b356e548bae1 Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:07:07 +0100
Subject: [PATCH 2/8] Enhance Figshare URL normalization and improve error
 handling in _dataset.py

---
 src/cellcharter/datasets/_dataset.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/cellcharter/datasets/_dataset.py b/src/cellcharter/datasets/_dataset.py
index f60f6ee..4c1d841 100644
--- a/src/cellcharter/datasets/_dataset.py
+++ b/src/cellcharter/datasets/_dataset.py
@@ -48,7 +48,8 @@ def _is_hdf5_file(path: Path) -> bool:
 def _normalize_figshare_url(url: str) -> str:
     parsed = urlparse(url)
     if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"):
-        return f"https://ndownloader.figshare.com{parsed.path}"
+        # Strip the /ndownloader prefix, keeping only /files/...
+        return f"https://ndownloader.figshare.com{parsed.path.removeprefix('/ndownloader')}"
     return url
 
 
@@ -59,7 +60,7 @@ def _download_file(url: str, destination: Path) -> None:
     request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
     progress = None
     try:
-        with urlopen(request) as response, tmp_destination.open("wb") as output:
+        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
             content_length = response.headers.get("Content-Length")
             total = int(content_length) if content_length and content_length.isdigit() else None
             if tqdm is not None:
@@ -73,10 +74,12 @@ def _download_file(url: str, destination: Path) -> None:
                 output.write(chunk)
                 if progress is not None:
                     progress.update(len(chunk))
-    except (HTTPError, URLError) as error:
+    except Exception as error:  # noqa: B902
         if tmp_destination.exists():
             tmp_destination.unlink()
-        raise RuntimeError(f"Failed to download dataset from {url}.") from error
+        if isinstance(error, (HTTPError, URLError)):
+            raise RuntimeError(f"Failed to download dataset from {url}.") from error
+        raise  # bare re-raise: propagate non-network errors with their original traceback
     finally:
         if progress is not None:
             progress.close()

From dbf29b6b4b63efe14f02ff6302464ccbb6bdd5a7 Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:08:00 +0100
Subject: [PATCH 3/8] Update dependencies and bump to 0.3.6

---
 pyproject.toml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d6a4fe4..6923457 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,10 +10,10 @@ exclude = [
 
 [project]
 name = "cellcharter"
-version = "0.3.5"
+version = "0.3.6"
 description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
 readme = "README.md"
-requires-python = ">=3.10,<3.14"
+requires-python = ">=3.11,<3.14" # include 3.13 once spatialdata allows it (waiting for multicale-spatial-image to allow it)
 license = {file = "LICENSE"}
 authors = [
     {name = "CSO group"},
@@ -31,10 +31,10 @@ dependencies = [
     "torchgmm >= 0.1.2",
     # for debug logging (referenced from the issue template)
     "session-info",
-    "spatialdata",
-    "spatialdata-plot",
     "rasterio",
     "sknw",
+    "spatialdata",
+    "spatialdata-plot",
 ]
 
 [project.optional-dependencies]

From 2dd3f8b31838ab4a368d1cfeefa7e43eef74abe5 Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:15:21 +0100
Subject: [PATCH 4/8] Update torchgmm dependency version to 0.1.4 in
 pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6923457..d160d90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "anndata",
     "scikit-learn",
     "squidpy >= 1.6.3",
-    "torchgmm >= 0.1.2",
+    "torchgmm >= 0.1.4",
     # for debug logging (referenced from the issue template)
     "session-info",
     "rasterio",

From 0c680f3f672bf98a6599279bc94546b4e2f4abbd Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:20:32 +0100
Subject: [PATCH 5/8] Update Python versions in GitHub Actions workflow to 3.11
 and 3.13

---
 .github/workflows/test.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 18f29b3..3c03f45 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -24,11 +24,11 @@ jobs:
       matrix:
         include:
           - os: ubuntu-latest
-            python: "3.10"
+            python: "3.11"
           - os: ubuntu-latest
-            python: "3.12"
+            python: "3.13"
           - os: ubuntu-latest
-            python: "3.12"
+            python: "3.13"
             pip-flags: "--pre"
             name: PRE-RELEASE DEPENDENCIES
 

From 68959945fb4299acdbdee7d0ae55fb12250ca27b Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:25:12 +0100
Subject: [PATCH 6/8] Remove comment

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d160d90..de7718a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ name = "cellcharter"
 version = "0.3.6"
 description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
 readme = "README.md"
-requires-python = ">=3.11,<3.14" # include 3.13 once spatialdata allows it (waiting for multicale-spatial-image to allow it)
+requires-python = ">=3.11,<3.14"
 license = {file = "LICENSE"}
 authors = [
     {name = "CSO group"},

From 9e7cc2841789a9ccf648b85c020b497b20c4adbf Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:42:36 +0100
Subject: [PATCH 7/8] Add functionality to download and validate codex_adata
 fixture in tests

- Introduced _is_hdf5_file and _download_codex functions to handle HDF5 file validation and downloading.
- Updated codex_adata fixture to check for the existence of the codex_adata file and download it if necessary.
- Enhanced error handling to include URLError and OSError during data retrieval.
---
 tests/conftest.py | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 4dec851..c43c2c3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,7 @@
 import time
-from urllib.error import HTTPError
+from pathlib import Path
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
 
 import anndata as ad
 import numpy as np
@@ -9,6 +11,35 @@
 
 _adata = sc.read("tests/_data/test_data.h5ad")
 _adata.raw = _adata.copy()
+_CODEX_PATH = Path("tests/_data/codex_adata.h5ad")
+_CODEX_URL = "https://ndownloader.figshare.com/files/46832722"
+
+
+def _is_hdf5_file(path: Path) -> bool:
+    if not path.exists() or path.stat().st_size < 8:
+        return False
+    with path.open("rb") as input_file:
+        return input_file.read(8) == b"\x89HDF\r\n\x1a\n"
+
+
+def _download_codex(path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = path.with_suffix(f"{path.suffix}.part")
+    request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})
+
+    with urlopen(request) as response, tmp_path.open("wb") as output:
+        chunk_size = 1024 * 1024
+        while True:
+            chunk = response.read(chunk_size)
+            if not chunk:
+                break
+            output.write(chunk)
+
+    if not _is_hdf5_file(tmp_path):
+        tmp_path.unlink(missing_ok=True)
+        raise OSError("Downloaded codex fixture is not a valid HDF5 file.")
+
+    tmp_path.replace(path)
 
 
 @pytest.fixture()
@@ -31,12 +62,13 @@ def codex_adata() -> ad.AnnData:
 
     for attempt in range(max_retries):
         try:
-            adata = sc.read(
-                "tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722"
-            )
+            if not _is_hdf5_file(_CODEX_PATH):
+                _CODEX_PATH.unlink(missing_ok=True)
+                _download_codex(_CODEX_PATH)
+            adata = ad.read_h5ad(_CODEX_PATH)
             adata.obs_names_make_unique()
             return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
-        except HTTPError as e:
+        except (HTTPError, URLError, OSError) as e:
             if attempt == max_retries - 1:  # Last attempt
                 pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
             time.sleep(retry_delay)

From 27cd77d5ca2271d2c38bdf6ac9ef216ab8a67ad2 Mon Sep 17 00:00:00 2001
From: Marco Varrone <marco.varrone@unil.ch>
Date: Thu, 12 Feb 2026 11:50:46 +0100
Subject: [PATCH 8/8] Increase timeout for codex download and ensure
 re-download on failure

---
 tests/conftest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c43c2c3..70c245d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -27,7 +27,7 @@ def _download_codex(path: Path) -> None:
     tmp_path = path.with_suffix(f"{path.suffix}.part")
     request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})
 
-    with urlopen(request) as response, tmp_path.open("wb") as output:
+    with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output:
         chunk_size = 1024 * 1024
         while True:
             chunk = response.read(chunk_size)
@@ -69,6 +69,7 @@ def codex_adata() -> ad.AnnData:
             adata.obs_names_make_unique()
             return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
         except (HTTPError, URLError, OSError) as e:
+            _CODEX_PATH.unlink(missing_ok=True)  # Force re-download on next attempt
             if attempt == max_retries - 1:  # Last attempt
                 pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
             time.sleep(retry_delay)