From aa84f4537f619b9a3eb6267a11a2cf7b44b0e18a Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 10:35:25 +0100 Subject: [PATCH 1/8] Refactor dataset handling in _dataset.py --- src/cellcharter/datasets/_dataset.py | 115 +++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 8 deletions(-) diff --git a/src/cellcharter/datasets/_dataset.py b/src/cellcharter/datasets/_dataset.py index 7e35b7f..f60f6ee 100644 --- a/src/cellcharter/datasets/_dataset.py +++ b/src/cellcharter/datasets/_dataset.py @@ -1,18 +1,117 @@ -from copy import copy +from __future__ import annotations -from squidpy.datasets._utils import AMetadata +import os +from dataclasses import dataclass +from pathlib import Path +from urllib.error import HTTPError, URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen -_codex_mouse_spleen = AMetadata( +import anndata as ad + +try: + from tqdm.auto import tqdm +except ImportError: # pragma: no cover - optional dependency + tqdm = None + + +@dataclass(frozen=True) +class DatasetMetadata: + name: str + doc_header: str + shape: tuple[int, int] + url: str + filename: str + + +_codex_mouse_spleen = DatasetMetadata( name="codex_mouse_spleen", doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al " "`__.", shape=(707474, 29), - url="https://figshare.com/ndownloader/files/38538101", + url="https://ndownloader.figshare.com/files/38538101", + filename="codex_mouse_spleen.h5ad", ) -for name, var in copy(locals()).items(): - if isinstance(var, AMetadata): - var._create_function(name, globals()) + +def _default_datasets_dir() -> Path: + return Path(os.environ.get("CELLCHARTER_DATA_DIR", Path.home() / ".cache" / "cellcharter" / "datasets")) + + +def _is_hdf5_file(path: Path) -> bool: + if not path.exists() or path.stat().st_size < 8: + return False + with path.open("rb") as input_file: + return input_file.read(8) == b"\x89HDF\r\n\x1a\n" + + +def _normalize_figshare_url(url: str) -> str: + parsed = urlparse(url) + if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"): + return f"https://ndownloader.figshare.com{parsed.path}" + return url + + +def _download_file(url: str, destination: Path) -> None: + destination.parent.mkdir(parents=True, exist_ok=True) + tmp_destination = destination.with_suffix(f"{destination.suffix}.part") + + request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"}) + progress = None + try: + with urlopen(request) as response, tmp_destination.open("wb") as output: + content_length = response.headers.get("Content-Length") + total = int(content_length) if content_length and content_length.isdigit() else None + if tqdm is not None: + progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}") + + chunk_size = 1024 * 1024 + while True: + chunk = response.read(chunk_size) + if not chunk: + break + output.write(chunk) + if progress is not None: + progress.update(len(chunk)) + except (HTTPError, URLError) as error: + if tmp_destination.exists(): + tmp_destination.unlink() + raise RuntimeError(f"Failed to download dataset from {url}.") from error + finally: + if progress is not None: + progress.close() + + if not _is_hdf5_file(tmp_destination): + tmp_destination.unlink(missing_ok=True) + raise RuntimeError( + f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL." + ) + + tmp_destination.replace(destination) + + +def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path: + base_dir = _default_datasets_dir() if path is None else Path(path) + if base_dir.suffix: + return base_dir + return base_dir / metadata.filename + + +def _fetch_dataset_file( + metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False +) -> Path: + dataset_path = _resolve_dataset_path(metadata, path=path) + should_download = force_download or not dataset_path.exists() or not _is_hdf5_file(dataset_path) + if should_download: + dataset_path.unlink(missing_ok=True) + _download_file(metadata.url, dataset_path) + return dataset_path + + +def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData: + """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al `__.""" + dataset_path = _fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download) + return ad.read_h5ad(dataset_path) -__all__ = ["codex_mouse_spleen"] # noqa: F822 +__all__ = ["codex_mouse_spleen"] From aa683cc32db0a5a70f0efcab3348b356e548bae1 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:07:07 +0100 Subject: [PATCH 2/8] Enhance Figshare URL normalization and improve error handling in _dataset.py --- src/cellcharter/datasets/_dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cellcharter/datasets/_dataset.py b/src/cellcharter/datasets/_dataset.py index f60f6ee..4c1d841 100644 --- a/src/cellcharter/datasets/_dataset.py +++ b/src/cellcharter/datasets/_dataset.py @@ -48,7 +48,8 @@ def _is_hdf5_file(path: Path) -> bool: def _normalize_figshare_url(url: str) -> str: parsed = urlparse(url) if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"): - return f"https://ndownloader.figshare.com{parsed.path}" + # Strip the /ndownloader prefix, keeping only /files/... + return f"https://ndownloader.figshare.com{parsed.path.removeprefix('/ndownloader')}" return url @@ -59,7 +60,7 @@ def _download_file(url: str, destination: Path) -> None: request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"}) progress = None try: - with urlopen(request) as response, tmp_destination.open("wb") as output: + with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output: content_length = response.headers.get("Content-Length") total = int(content_length) if content_length and content_length.isdigit() else None if tqdm is not None: @@ -73,10 +74,12 @@ def _download_file(url: str, destination: Path) -> None: output.write(chunk) if progress is not None: progress.update(len(chunk)) - except (HTTPError, URLError) as error: + except Exception as error: # noqa: B902 if tmp_destination.exists(): tmp_destination.unlink() - raise RuntimeError(f"Failed to download dataset from {url}.") from error + if isinstance(error, (HTTPError, URLError)): + raise RuntimeError(f"Failed to download dataset from {url}.") from error + raise error finally: if progress is not None: progress.close() From dbf29b6b4b63efe14f02ff6302464ccbb6bdd5a7 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:08:00 +0100 Subject: [PATCH 3/8] Update dependencies and bump to 0.3.6 --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6a4fe4..6923457 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,10 +10,10 @@ exclude = [ [project] name = "cellcharter" -version = "0.3.5" +version = "0.3.6" description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data." readme = "README.md" -requires-python = ">=3.10,<3.14" +requires-python = ">=3.11,<3.14" # include 3.13 once spatialdata allows it (waiting for multicale-spatial-image to allow it) license = {file = "LICENSE"} authors = [ {name = "CSO group"}, @@ -31,10 +31,10 @@ dependencies = [ "torchgmm >= 0.1.2", # for debug logging (referenced from the issue template) "session-info", - "spatialdata", - "spatialdata-plot", "rasterio", "sknw", + "spatialdata", + "spatialdata-plot", ] [project.optional-dependencies] From 2dd3f8b31838ab4a368d1cfeefa7e43eef74abe5 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:15:21 +0100 Subject: [PATCH 4/8] Update torchgmm dependency version to 0.1.4 in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6923457..d160d90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "anndata", "scikit-learn", "squidpy >= 1.6.3", - "torchgmm >= 0.1.2", + "torchgmm >= 0.1.4", # for debug logging (referenced from the issue template) "session-info", "rasterio", From 0c680f3f672bf98a6599279bc94546b4e2f4abbd Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:20:32 +0100 Subject: [PATCH 5/8] Update Python versions in GitHub Actions workflow to 3.11 and 3.13 --- .github/workflows/test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 18f29b3..3c03f45 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,11 +24,11 @@ jobs: matrix: include: - os: ubuntu-latest - python: "3.10" + python: "3.11" - os: ubuntu-latest - python: "3.12" + python: "3.13" - os: ubuntu-latest - python: "3.12" + python: "3.13" pip-flags: "--pre" name: PRE-RELEASE DEPENDENCIES From 68959945fb4299acdbdee7d0ae55fb12250ca27b Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:25:12 +0100 Subject: [PATCH 6/8] Remove comment --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d160d90..de7718a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ name = "cellcharter" version = "0.3.6" description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data." readme = "README.md" -requires-python = ">=3.11,<3.14" # include 3.13 once spatialdata allows it (waiting for multicale-spatial-image to allow it) +requires-python = ">=3.11,<3.14" license = {file = "LICENSE"} authors = [ {name = "CSO group"}, From 9e7cc2841789a9ccf648b85c020b497b20c4adbf Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:42:36 +0100 Subject: [PATCH 7/8] Add functionality to download and validate codex_adata fixture in tests - Introduced _is_hdf5_file and _download_codex functions to handle HDF5 file validation and downloading. - Updated codex_adata fixture to check for the existence of the codex_adata file and download it if necessary. - Enhanced error handling to include URLError and OSError during data retrieval. --- tests/conftest.py | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4dec851..c43c2c3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,7 @@ import time -from urllib.error import HTTPError +from pathlib import Path +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen import anndata as ad import numpy as np @@ -9,6 +11,35 @@ _adata = sc.read("tests/_data/test_data.h5ad") _adata.raw = _adata.copy() +_CODEX_PATH = Path("tests/_data/codex_adata.h5ad") +_CODEX_URL = "https://ndownloader.figshare.com/files/46832722" + + +def _is_hdf5_file(path: Path) -> bool: + if not path.exists() or path.stat().st_size < 8: + return False + with path.open("rb") as input_file: + return input_file.read(8) == b"\x89HDF\r\n\x1a\n" + + +def _download_codex(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp_path = path.with_suffix(f"{path.suffix}.part") + request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"}) + + with urlopen(request) as response, tmp_path.open("wb") as output: + chunk_size = 1024 * 1024 + while True: + chunk = response.read(chunk_size) + if not chunk: + break + output.write(chunk) + + if not _is_hdf5_file(tmp_path): + tmp_path.unlink(missing_ok=True) + raise OSError("Downloaded codex fixture is not a valid HDF5 file.") + + tmp_path.replace(path) @pytest.fixture() @@ -31,12 +62,13 @@ def codex_adata() -> ad.AnnData: for attempt in range(max_retries): try: - adata = sc.read( - "tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722" - ) + if not _is_hdf5_file(_CODEX_PATH): + _CODEX_PATH.unlink(missing_ok=True) + _download_codex(_CODEX_PATH) + adata = ad.read_h5ad(_CODEX_PATH) adata.obs_names_make_unique() return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy() - except HTTPError as e: + except (HTTPError, URLError, OSError) as e: if attempt == max_retries - 1: # Last attempt pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}") time.sleep(retry_delay) From 27cd77d5ca2271d2c38bdf6ac9ef216ab8a67ad2 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 11:50:46 +0100 Subject: [PATCH 8/8] Increase timeout for codex download and ensure re-download on failure --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index c43c2c3..70c245d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,7 +27,7 @@ def _download_codex(path: Path) -> None: tmp_path = path.with_suffix(f"{path.suffix}.part") request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"}) - with urlopen(request) as response, tmp_path.open("wb") as output: + with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output: chunk_size = 1024 * 1024 while True: chunk = response.read(chunk_size) @@ -69,6 +69,7 @@ def codex_adata() -> ad.AnnData: adata.obs_names_make_unique() return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy() except (HTTPError, URLError, OSError) as e: + _CODEX_PATH.unlink(missing_ok=True) # Force re-download on next attempt if attempt == max_retries - 1: # Last attempt pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}") time.sleep(retry_delay)