Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions mussel/cli/aggregate_sample_features.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import logging
import os
from dataclasses import dataclass, field
from typing import List, Optional

import h5py
import hydra
import numpy as np
from hydra.conf import HelpConf, HydraConf
from hydra.core.config_store import ConfigStore

from mussel.utils.feature_extract import aggregate_sample_features
from mussel.utils.file import save_hdf5

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -97,16 +101,39 @@ def main(cfg: AggregateSampleFeaturesConfig):
cfg.seed,
)

aggregate_sample_features(
patch_features_h5_paths=list(cfg.patch_features_h5_paths),
sample_ids=list(cfg.sample_ids),
output_dir=cfg.output_dir,
output_h5_suffix=cfg.output_h5_suffix,
patch_features_h5_paths = list(cfg.patch_features_h5_paths)
sample_ids = list(cfg.sample_ids)

if len(patch_features_h5_paths) != len(sample_ids):
raise ValueError(
f"patch_features_h5_paths ({len(patch_features_h5_paths)}) and "
f"sample_ids ({len(sample_ids)}) must have the same length."
)

features_list = []
coords_list = []
for h5_path in patch_features_h5_paths:
with h5py.File(h5_path, "r") as h5:
features_list.append(np.array(h5["features"]))
coords_list.append(h5["coords"][:])

results = aggregate_sample_features(
features_list=features_list,
coords_list=coords_list,
sample_ids=sample_ids,
Comment on lines +113 to +123
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CLI now loads all slide feature arrays into memory upfront (features_list / coords_list) before grouping/aggregating. Previously, aggregation read H5s within the per-sample loop, so peak memory scaled with the largest sample rather than the entire input set. For large cohorts this can be a substantial memory regression and risk OOM; consider grouping indices by sample_id first, then reading/aggregating one sample at a time (or otherwise streaming) so only one sample’s slides are resident at once.

Copilot uses AI. Check for mistakes.
max_tiles=cfg.max_tiles,
subsampling_strategy=cfg.subsampling_strategy,
seed=cfg.seed,
)

os.makedirs(cfg.output_dir, exist_ok=True)
for sample_id, (features, coords) in results.items():
out_path = os.path.join(cfg.output_dir, f"{sample_id}.{cfg.output_h5_suffix}")
save_hdf5(out_path, {"features": features, "coords": coords}, mode="w")
logger.info(
"Wrote %s (%d tiles, dim=%d)", out_path, len(features), features.shape[1]
)

logger.info("Done.")


Expand Down
58 changes: 28 additions & 30 deletions mussel/utils/feature_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -1904,50 +1904,49 @@ def subsample_tiles(


def aggregate_sample_features(
patch_features_h5_paths: List[str],
features_list: List[np.ndarray],
coords_list: List[np.ndarray],
sample_ids: List[str],
output_dir: str,
output_h5_suffix: str = "features.h5",
max_tiles: Optional[int] = None,
subsampling_strategy: str = "random",
seed: int = 42,
) -> None:
"""Concatenate per-slide patch features into one H5 per sample.
) -> dict:
"""Concatenate per-slide patch features into one array per sample.
Comment on lines 1906 to +1914
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type annotation ) -> dict: (and groups: dict / results: dict) is very generic and doesn’t match the documented API (“Ordered dict mapping each unique sample_id to a (features, coords) tuple”). Please tighten the typing to something like collections.OrderedDict[str, tuple[np.ndarray, np.ndarray]] (or dict[str, tuple[np.ndarray, np.ndarray]] if you don’t want to expose OrderedDict) to make the new in-memory API easier/safer to consume.

Copilot uses AI. Check for mistakes.

Reads per-slide feature H5 files (each with ``features`` (N_i, D) and
``coords`` (N_i, 2) datasets), groups them by ``sample_id``, concatenates
on the tile axis, optionally subsamples to ``max_tiles``, and writes one
output H5 per unique sample.
Groups slides by ``sample_id``, concatenates their features and coordinates
on the tile axis, and optionally subsamples to ``max_tiles``.

Args:
patch_features_h5_paths: Paths to per-slide feature H5 files
(produced by ``extract_features``).
features_list: Per-slide feature arrays, each of shape ``(N_i, D)``.
coords_list: Per-slide coordinate arrays, each of shape ``(N_i, 2)``.
Must have the same length as ``features_list``.
sample_ids: Sample identifier for each slide (same length as
``patch_features_h5_paths``). Slides with the same ``sample_id``
are concatenated together.
output_dir: Directory where one ``{sample_id}.{output_h5_suffix}`` file
is written per unique sample.
output_h5_suffix: Filename suffix for output files (default
``"features.h5"``).
``features_list``). Slides with the same ``sample_id`` are
concatenated together.
max_tiles: If set, subsample each sample to at most this many tiles
after concatenation. ``None`` keeps all tiles.
subsampling_strategy: Strategy when subsampling — ``"random"``,
``"proportional"``, or ``"equal"``. Ignored when ``max_tiles``
is ``None`` or total tiles ≤ ``max_tiles``.
seed: Random seed for subsampling reproducibility (default ``42``).

Returns:
Ordered dict mapping each unique ``sample_id`` to a
``(features, coords)`` tuple of concatenated (and optionally
subsampled) arrays.
"""
if len(patch_features_h5_paths) != len(sample_ids):
if not (len(features_list) == len(coords_list) == len(sample_ids)):
raise ValueError(
f"patch_features_h5_paths ({len(patch_features_h5_paths)}) and "
f"sample_ids ({len(sample_ids)}) must have the same length."
f"features_list ({len(features_list)}), coords_list ({len(coords_list)}), "
f"and sample_ids ({len(sample_ids)}) must all have the same length."
)

os.makedirs(output_dir, exist_ok=True)

groups: dict = collections.OrderedDict()
for idx, sid in enumerate(sample_ids):
groups.setdefault(sid, []).append(idx)

results: dict = collections.OrderedDict()

for sample_id, indices in groups.items():
logger.info("Aggregating sample %s from %d slide(s)", sample_id, len(indices))

Expand All @@ -1956,14 +1955,12 @@ def aggregate_sample_features(
slide_sizes = []

for i in indices:
h5_path = patch_features_h5_paths[i]
with h5py.File(h5_path, "r") as h5:
feats = np.array(h5["features"])
coords = h5["coords"][:]
feats = features_list[i]
coords = coords_list[i]
all_features.append(feats)
all_coords.append(coords)
slide_sizes.append(len(feats))
logger.debug(" slide %d: %d tiles from %s", i, len(feats), h5_path)
logger.debug(" slide %d: %d tiles", i, len(feats))
Comment on lines 1957 to +1963
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aggregate_sample_features now assumes each features_list[i] and coords_list[i] pair is consistent, but it never validates shapes/lengths. If a slide has len(coords) != len(feats) (or unexpected dimensionality), np.concatenate / subsample_tiles can later fail with a less-informative error or silently produce inconsistent outputs. Consider validating per-slide that feats.ndim == 2, coords.ndim == 2, coords.shape[1] == 2, and len(coords) == len(feats) and raising a ValueError that includes the offending slide index/sample_id.

Copilot uses AI. Check for mistakes.

features = np.concatenate(all_features, axis=0)
coords = np.concatenate(all_coords, axis=0)
Expand All @@ -1984,11 +1981,12 @@ def aggregate_sample_features(
max_tiles,
)

out_path = os.path.join(output_dir, f"{sample_id}.{output_h5_suffix}")
save_hdf5(out_path, {"features": features, "coords": coords}, mode="w")
logger.info(
"Wrote %s (%d tiles, dim=%d)", out_path, len(features), features.shape[1]
"Sample %s: %d tiles, dim=%d", sample_id, len(features), features.shape[1]
)
results[sample_id] = (features, coords)

return results


@timed
Expand Down
87 changes: 38 additions & 49 deletions tests/mussel/cli/test_aggregate_sample_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,88 +125,77 @@ def test_subsample_tiles_invalid_strategy():

def test_aggregate_sample_features_single_slide(tmp_path):
"""One slide per sample — output equals input."""
h5_a = tmp_path / "slide_a.h5"
feats_a, coords_a = _write_fake_h5(h5_a, n_tiles=30)
feats_a, coords_a = _make_data(30)

_aggregate_sample_features(
patch_features_h5_paths=[str(h5_a)],
results = _aggregate_sample_features(
features_list=[feats_a],
coords_list=[coords_a],
sample_ids=["sample1"],
output_dir=str(tmp_path / "out"),
output_h5_suffix="features.h5",
max_tiles=None,
subsampling_strategy="random",
seed=42,
)

out_h5 = tmp_path / "out" / "sample1.features.h5"
assert out_h5.exists()
with h5py.File(out_h5) as f:
np.testing.assert_array_equal(f["features"][:], feats_a)
np.testing.assert_array_equal(f["coords"][:], coords_a)
assert "sample1" in results
np.testing.assert_array_equal(results["sample1"][0], feats_a)
np.testing.assert_array_equal(results["sample1"][1], coords_a)


def test_aggregate_sample_features_multi_slide(tmp_path):
"""Two slides per sample — features are concatenated."""
h5_a = tmp_path / "slide_a.h5"
h5_b = tmp_path / "slide_b.h5"
_write_fake_h5(h5_a, n_tiles=20, seed=1)
_write_fake_h5(h5_b, n_tiles=15, seed=2)

_aggregate_sample_features(
patch_features_h5_paths=[str(h5_a), str(h5_b)],
rng = np.random.default_rng(0)
feats_a = rng.random((20, 4)).astype(np.float32)
coords_a = rng.integers(0, 1000, (20, 2))
feats_b = rng.random((15, 4)).astype(np.float32)
coords_b = rng.integers(0, 1000, (15, 2))

results = _aggregate_sample_features(
features_list=[feats_a, feats_b],
coords_list=[coords_a, coords_b],
sample_ids=["sampleX", "sampleX"],
output_dir=str(tmp_path / "out"),
max_tiles=None,
)

out_h5 = tmp_path / "out" / "sampleX.features.h5"
assert out_h5.exists()
with h5py.File(out_h5) as f:
assert f["features"].shape == (35, 4)
assert f["coords"].shape == (35, 2)
assert results["sampleX"][0].shape == (35, 4)
assert results["sampleX"][1].shape == (35, 2)
Comment on lines +152 to +160
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_aggregate_sample_features_multi_slide only asserts output shapes. Since this PR changes aggregate_sample_features behavior significantly, it would be more robust to also assert that the returned arrays equal the expected concatenation (e.g., np.concatenate([feats_a, feats_b]) / np.concatenate([coords_a, coords_b])) so ordering/content regressions are caught.

Copilot uses AI. Check for mistakes.


def test_aggregate_sample_features_two_samples(tmp_path):
"""Three slides, two samples — two output files."""
paths = [tmp_path / f"s{i}.h5" for i in range(3)]
for i, p in enumerate(paths):
_write_fake_h5(p, n_tiles=10, seed=i)
"""Three slides, two samples — two entries in result."""
rng = np.random.default_rng(0)
slides = [(rng.random((10, 4)).astype(np.float32), rng.integers(0, 1000, (10, 2))) for _ in range(3)]
features_list = [f for f, _ in slides]
coords_list = [c for _, c in slides]

_aggregate_sample_features(
patch_features_h5_paths=[str(p) for p in paths],
results = _aggregate_sample_features(
features_list=features_list,
coords_list=coords_list,
sample_ids=["sA", "sA", "sB"],
output_dir=str(tmp_path / "out"),
max_tiles=None,
)

out_a = tmp_path / "out" / "sA.features.h5"
out_b = tmp_path / "out" / "sB.features.h5"
assert out_a.exists() and out_b.exists()
with h5py.File(out_a) as f:
assert f["features"].shape[0] == 20
with h5py.File(out_b) as f:
assert f["features"].shape[0] == 10
assert results["sA"][0].shape[0] == 20
assert results["sB"][0].shape[0] == 10


def test_aggregate_sample_features_with_subsampling(tmp_path):
"""Subsampling reduces output to max_tiles."""
h5_a = tmp_path / "s0.h5"
h5_b = tmp_path / "s1.h5"
_write_fake_h5(h5_a, n_tiles=80, seed=0)
_write_fake_h5(h5_b, n_tiles=60, seed=1)

_aggregate_sample_features(
patch_features_h5_paths=[str(h5_a), str(h5_b)],
rng = np.random.default_rng(0)
feats_a = rng.random((80, 4)).astype(np.float32)
coords_a = rng.integers(0, 1000, (80, 2))
feats_b = rng.random((60, 4)).astype(np.float32)
coords_b = rng.integers(0, 1000, (60, 2))

results = _aggregate_sample_features(
features_list=[feats_a, feats_b],
coords_list=[coords_a, coords_b],
sample_ids=["big", "big"],
output_dir=str(tmp_path / "out"),
max_tiles=50,
subsampling_strategy="random",
seed=99,
)

out_h5 = tmp_path / "out" / "big.features.h5"
with h5py.File(out_h5) as f:
assert f["features"].shape[0] == 50
assert results["big"][0].shape[0] == 50
Comment on lines 181 to +198
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_aggregate_sample_features_with_subsampling currently only checks that the output tile count is reduced to max_tiles. Consider also validating that subsampling is reproducible given seed and that features and coords remain aligned (e.g., by using easily-identifiable rows/coords and asserting they correspond) to catch potential indexing bugs in the aggregation path.

Copilot uses AI. Check for mistakes.


# =============================================================================
Expand Down
Loading