diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 18f29b3..3c03f45 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -24,11 +24,11 @@ jobs:
 
       matrix:
         include:
           - os: ubuntu-latest
-            python: "3.10"
+            python: "3.11"
           - os: ubuntu-latest
-            python: "3.12"
+            python: "3.13"
           - os: ubuntu-latest
-            python: "3.12"
+            python: "3.13"
            pip-flags: "--pre"
            name: PRE-RELEASE DEPENDENCIES
diff --git a/pyproject.toml b/pyproject.toml
index d6a4fe4..de7718a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,10 +10,10 @@ exclude = [
 
 [project]
 name = "cellcharter"
-version = "0.3.5"
+version = "0.3.6"
 description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
 readme = "README.md"
-requires-python = ">=3.10,<3.14"
+requires-python = ">=3.11,<3.14"
 license = {file = "LICENSE"}
 authors = [
     {name = "CSO group"},
@@ -28,13 +28,13 @@ dependencies = [
     "anndata",
     "scikit-learn",
     "squidpy >= 1.6.3",
-    "torchgmm >= 0.1.2",
+    "torchgmm >= 0.1.4",
     # for debug logging (referenced from the issue template)
     "session-info",
-    "spatialdata",
-    "spatialdata-plot",
     "rasterio",
     "sknw",
+    "spatialdata",
+    "spatialdata-plot",
 ]
 
 [project.optional-dependencies]
diff --git a/src/cellcharter/datasets/_dataset.py b/src/cellcharter/datasets/_dataset.py
index 7e35b7f..4c1d841 100644
--- a/src/cellcharter/datasets/_dataset.py
+++ b/src/cellcharter/datasets/_dataset.py
@@ -1,18 +1,120 @@
-from copy import copy
+from __future__ import annotations
 
-from squidpy.datasets._utils import AMetadata
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlparse
+from urllib.request import Request, urlopen
 
-_codex_mouse_spleen = AMetadata(
+import anndata as ad
+
+try:
+    from tqdm.auto import tqdm
+except ImportError:  # pragma: no cover - optional dependency
+    tqdm = None
+
+
+@dataclass(frozen=True)
+class DatasetMetadata:
+    name: str
+    doc_header: str
+    shape: tuple[int, int]
+    url: str
+    filename: str
+
+
+_codex_mouse_spleen = DatasetMetadata(
     name="codex_mouse_spleen",
     doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
     "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
     shape=(707474, 29),
-    url="https://figshare.com/ndownloader/files/38538101",
+    url="https://ndownloader.figshare.com/files/38538101",
+    filename="codex_mouse_spleen.h5ad",
 )
 
-for name, var in copy(locals()).items():
-    if isinstance(var, AMetadata):
-        var._create_function(name, globals())
+
+def _default_datasets_dir() -> Path:
+    return Path(os.environ.get("CELLCHARTER_DATA_DIR", Path.home() / ".cache" / "cellcharter" / "datasets"))
+
+
+def _is_hdf5_file(path: Path) -> bool:
+    if not path.exists() or path.stat().st_size < 8:
+        return False
+    with path.open("rb") as input_file:
+        return input_file.read(8) == b"\x89HDF\r\n\x1a\n"
+
+
+def _normalize_figshare_url(url: str) -> str:
+    parsed = urlparse(url)
+    if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"):
+        # Strip the /ndownloader prefix, keeping only /files/...
+        return f"https://ndownloader.figshare.com{parsed.path.removeprefix('/ndownloader')}"
+    return url
+
+
+def _download_file(url: str, destination: Path) -> None:
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")
+
+    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
+    progress = None
+    try:
+        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
+            content_length = response.headers.get("Content-Length")
+            total = int(content_length) if content_length and content_length.isdigit() else None
+            if tqdm is not None:
+                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")
+
+            chunk_size = 1024 * 1024
+            while True:
+                chunk = response.read(chunk_size)
+                if not chunk:
+                    break
+                output.write(chunk)
+                if progress is not None:
+                    progress.update(len(chunk))
+    except Exception as error:  # noqa: B902
+        if tmp_destination.exists():
+            tmp_destination.unlink()
+        if isinstance(error, (HTTPError, URLError)):
+            raise RuntimeError(f"Failed to download dataset from {url}.") from error
+        raise error
+    finally:
+        if progress is not None:
+            progress.close()
+
+    if not _is_hdf5_file(tmp_destination):
+        tmp_destination.unlink(missing_ok=True)
+        raise RuntimeError(
+            f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL."
+        )
+
+    tmp_destination.replace(destination)
+
+
+def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
+    base_dir = _default_datasets_dir() if path is None else Path(path)
+    if base_dir.suffix:
+        return base_dir
+    return base_dir / metadata.filename
+
+
+def _fetch_dataset_file(
+    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
+) -> Path:
+    dataset_path = _resolve_dataset_path(metadata, path=path)
+    should_download = force_download or not dataset_path.exists() or not _is_hdf5_file(dataset_path)
+    if should_download:
+        dataset_path.unlink(missing_ok=True)
+        _download_file(metadata.url, dataset_path)
+    return dataset_path
+
+
+def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
+    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__."""
+    dataset_path = _fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download)
+    return ad.read_h5ad(dataset_path)
 
 
-__all__ = ["codex_mouse_spleen"]  # noqa: F822
+__all__ = ["codex_mouse_spleen"]
diff --git a/tests/conftest.py b/tests/conftest.py
index 4dec851..70c245d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,7 @@
 import time
-from urllib.error import HTTPError
+from pathlib import Path
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
 
 import anndata as ad
 import numpy as np
@@ -9,6 +11,35 @@
 _adata = sc.read("tests/_data/test_data.h5ad")
 _adata.raw = _adata.copy()
 
+_CODEX_PATH = Path("tests/_data/codex_adata.h5ad")
+_CODEX_URL = "https://ndownloader.figshare.com/files/46832722"
+
+
+def _is_hdf5_file(path: Path) -> bool:
+    if not path.exists() or path.stat().st_size < 8:
+        return False
+    with path.open("rb") as input_file:
+        return input_file.read(8) == b"\x89HDF\r\n\x1a\n"
+
+
+def _download_codex(path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = path.with_suffix(f"{path.suffix}.part")
+    request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})
+
+    with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output:
+        chunk_size = 1024 * 1024
+        while True:
+            chunk = response.read(chunk_size)
+            if not chunk:
+                break
+            output.write(chunk)
+
+    if not _is_hdf5_file(tmp_path):
+        tmp_path.unlink(missing_ok=True)
+        raise OSError("Downloaded codex fixture is not a valid HDF5 file.")
+
+    tmp_path.replace(path)
+
 
 @pytest.fixture()
@@ -31,12 +62,14 @@ def codex_adata() -> ad.AnnData:
     for attempt in range(max_retries):
         try:
-            adata = sc.read(
-                "tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722"
-            )
+            if not _is_hdf5_file(_CODEX_PATH):
+                _CODEX_PATH.unlink(missing_ok=True)
+                _download_codex(_CODEX_PATH)
+            adata = ad.read_h5ad(_CODEX_PATH)
             adata.obs_names_make_unique()
             return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
-        except HTTPError as e:
+        except (HTTPError, URLError, OSError) as e:
+            _CODEX_PATH.unlink(missing_ok=True)  # Force re-download on next attempt
             if attempt == max_retries - 1:  # Last attempt
                 pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
             time.sleep(retry_delay)
 