Skip to content
Merged
6 changes: 3 additions & 3 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ jobs:
matrix:
include:
- os: ubuntu-latest
python: "3.10"
python: "3.11"
- os: ubuntu-latest
python: "3.12"
python: "3.13"
- os: ubuntu-latest
python: "3.12"
python: "3.13"
pip-flags: "--pre"
name: PRE-RELEASE DEPENDENCIES

Expand Down
10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ exclude = [

[project]
name = "cellcharter"
version = "0.3.5"
version = "0.3.6"
description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
readme = "README.md"
requires-python = ">=3.10,<3.14"
requires-python = ">=3.11,<3.14"
license = {file = "LICENSE"}
authors = [
{name = "CSO group"},
Expand All @@ -28,13 +28,13 @@ dependencies = [
"anndata",
"scikit-learn",
"squidpy >= 1.6.3",
"torchgmm >= 0.1.2",
"torchgmm >= 0.1.4",
# for debug logging (referenced from the issue template)
"session-info",
"spatialdata",
"spatialdata-plot",
"rasterio",
"sknw",
"spatialdata",
"spatialdata-plot",
]

[project.optional-dependencies]
Expand Down
118 changes: 110 additions & 8 deletions src/cellcharter/datasets/_dataset.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,120 @@
from copy import copy
from __future__ import annotations

from squidpy.datasets._utils import AMetadata
import os
from dataclasses import dataclass
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen

_codex_mouse_spleen = AMetadata(
import anndata as ad

try:
from tqdm.auto import tqdm
except ImportError: # pragma: no cover - optional dependency
tqdm = None


@dataclass(frozen=True)
class DatasetMetadata:
    """Immutable description of one downloadable dataset."""

    # Dataset identifier; matches the public loader function name (e.g. codex_mouse_spleen).
    name: str
    # One-line reST description of the dataset — presumably used for generated docs; verify.
    doc_header: str
    # Expected (n_obs, n_vars) of the AnnData — informational; not checked by the download code here.
    shape: tuple[int, int]
    # Direct-download URL of the .h5ad payload (normalized by _normalize_figshare_url before use).
    url: str
    # Filename used for the dataset inside the local cache directory.
    filename: str


# Registry entry for the CODEX mouse spleen dataset (Goltsev et al., 2018).
# NOTE: a diff artifact left two `url=` keyword arguments here (a SyntaxError);
# keep only the direct ndownloader host, which _normalize_figshare_url would
# produce from the legacy figshare.com/ndownloader form anyway.
_codex_mouse_spleen = DatasetMetadata(
    name="codex_mouse_spleen",
    doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
    "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
    shape=(707474, 29),
    url="https://ndownloader.figshare.com/files/38538101",
    filename="codex_mouse_spleen.h5ad",
)

for name, var in copy(locals()).items():
if isinstance(var, AMetadata):
var._create_function(name, globals())

def _default_datasets_dir() -> Path:
return Path(os.environ.get("CELLCHARTER_DATA_DIR", Path.home() / ".cache" / "cellcharter" / "datasets"))


def _is_hdf5_file(path: Path) -> bool:
if not path.exists() or path.stat().st_size < 8:
return False
with path.open("rb") as input_file:
return input_file.read(8) == b"\x89HDF\r\n\x1a\n"


def _normalize_figshare_url(url: str) -> str:
parsed = urlparse(url)
if parsed.netloc == "figshare.com" and parsed.path.startswith("/ndownloader/files/"):
# Strip the /ndownloader prefix, keeping only /files/...
return f"https://ndownloader.figshare.com{parsed.path.removeprefix('/ndownloader')}"
return url


def _download_file(url: str, destination: Path) -> None:
    """Download *url* to *destination* atomically.

    The payload is streamed into a sibling ``.part`` file first and only
    renamed into place after passing an HDF5 magic-byte check, so a partial
    or bogus download never masquerades as a cached dataset.

    Raises
    ------
    RuntimeError
        If the network request fails (HTTPError/URLError) or the downloaded
        bytes are not a valid HDF5 file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    # e.g. "codex_mouse_spleen.h5ad" -> "codex_mouse_spleen.h5ad.part"
    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")

    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
    progress = None
    try:
        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
            # Content-Length may be absent or non-numeric; the progress bar then has no total.
            content_length = response.headers.get("Content-Length")
            total = int(content_length) if content_length and content_length.isdigit() else None
            if tqdm is not None:  # tqdm is optional; skip progress reporting if unavailable
                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")

            # Stream in 1 MiB chunks instead of reading the whole body into memory.
            chunk_size = 1024 * 1024
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                output.write(chunk)
                if progress is not None:
                    progress.update(len(chunk))
    except Exception as error:  # noqa: B902
        # Never leave a truncated .part file behind — it would poison retries.
        if tmp_destination.exists():
            tmp_destination.unlink()
        # Wrap network-level failures in a friendlier error; re-raise everything else as-is.
        if isinstance(error, (HTTPError, URLError)):
            raise RuntimeError(f"Failed to download dataset from {url}.") from error
        raise error
    finally:
        if progress is not None:
            progress.close()

    # Reject payloads that are not HDF5 (e.g. an HTML error page served with status 200).
    if not _is_hdf5_file(tmp_destination):
        tmp_destination.unlink(missing_ok=True)
        raise RuntimeError(
            f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL."
        )

    # Atomic rename: the destination only ever exists as a complete, validated file.
    tmp_destination.replace(destination)


def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
base_dir = _default_datasets_dir() if path is None else Path(path)
if base_dir.suffix:
return base_dir
return base_dir / metadata.filename


def _fetch_dataset_file(
    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
) -> Path:
    """Return the local path of the dataset, downloading it first unless a valid cached copy exists."""
    target = _resolve_dataset_path(metadata, path=path)
    # A cache hit requires the file to exist AND to look like real HDF5.
    have_valid_cache = target.exists() and _is_hdf5_file(target)
    if force_download or not have_valid_cache:
        target.unlink(missing_ok=True)
        _download_file(metadata.url, target)
    return target


def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__."""
    # Resolve (and download, if needed) the cached .h5ad, then load it.
    local_file = _fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download)
    return ad.read_h5ad(local_file)


# Diff artifact left two __all__ assignments (old line carried a stale noqa); keep one.
__all__ = ["codex_mouse_spleen"]
43 changes: 38 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import time
from urllib.error import HTTPError
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

import anndata as ad
import numpy as np
Expand All @@ -9,6 +11,35 @@

_adata = sc.read("tests/_data/test_data.h5ad")
_adata.raw = _adata.copy()
_CODEX_PATH = Path("tests/_data/codex_adata.h5ad")
_CODEX_URL = "https://ndownloader.figshare.com/files/46832722"


def _is_hdf5_file(path: Path) -> bool:
if not path.exists() or path.stat().st_size < 8:
return False
with path.open("rb") as input_file:
return input_file.read(8) == b"\x89HDF\r\n\x1a\n"


def _download_codex(path: Path) -> None:
    """Download the CODEX test fixture to *path*, atomically via a ``.part`` file.

    The payload is validated against the HDF5 magic bytes before being renamed
    into place, and the partial file is removed on any failure so the next
    retry starts clean (matching _download_file in the datasets module).

    Raises
    ------
    OSError
        If the downloaded payload is not a valid HDF5 file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_suffix(f"{path.suffix}.part")
    request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})

    try:
        with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output:
            # Stream in 1 MiB chunks rather than loading the whole body in memory.
            chunk_size = 1024 * 1024
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                output.write(chunk)
    except Exception:
        # Fix: previously a failed transfer left a truncated .part file behind.
        tmp_path.unlink(missing_ok=True)
        raise

    if not _is_hdf5_file(tmp_path):
        tmp_path.unlink(missing_ok=True)
        raise OSError("Downloaded codex fixture is not a valid HDF5 file.")

    # Atomic rename: the fixture path only ever holds a complete, validated file.
    tmp_path.replace(path)


@pytest.fixture()
Expand All @@ -31,12 +62,14 @@ def codex_adata() -> ad.AnnData:

for attempt in range(max_retries):
try:
adata = sc.read(
"tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722"
)
if not _is_hdf5_file(_CODEX_PATH):
_CODEX_PATH.unlink(missing_ok=True)
_download_codex(_CODEX_PATH)
adata = ad.read_h5ad(_CODEX_PATH)
adata.obs_names_make_unique()
return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
except HTTPError as e:
except (HTTPError, URLError, OSError) as e:
_CODEX_PATH.unlink(missing_ok=True) # Force re-download on next attempt
if attempt == max_retries - 1: # Last attempt
pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
time.sleep(retry_delay)
Loading