-
Notifications
You must be signed in to change notification settings - Fork 0
refactor: aggregate_sample_features accepts arrays instead of file paths #115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1904,50 +1904,49 @@ def subsample_tiles( | |
|
|
||
|
|
||
| def aggregate_sample_features( | ||
| patch_features_h5_paths: List[str], | ||
| features_list: List[np.ndarray], | ||
| coords_list: List[np.ndarray], | ||
| sample_ids: List[str], | ||
| output_dir: str, | ||
| output_h5_suffix: str = "features.h5", | ||
| max_tiles: Optional[int] = None, | ||
| subsampling_strategy: str = "random", | ||
| seed: int = 42, | ||
| ) -> None: | ||
| """Concatenate per-slide patch features into one H5 per sample. | ||
| ) -> dict: | ||
| """Concatenate per-slide patch features into one array per sample. | ||
|
Comment on lines
1906
to
+1914
|
||
|
|
||
| Reads per-slide feature H5 files (each with ``features`` (N_i, D) and | ||
| ``coords`` (N_i, 2) datasets), groups them by ``sample_id``, concatenates | ||
| on the tile axis, optionally subsamples to ``max_tiles``, and writes one | ||
| output H5 per unique sample. | ||
| Groups slides by ``sample_id``, concatenates their features and coordinates | ||
| on the tile axis, and optionally subsamples to ``max_tiles``. | ||
|
|
||
| Args: | ||
| patch_features_h5_paths: Paths to per-slide feature H5 files | ||
| (produced by ``extract_features``). | ||
| features_list: Per-slide feature arrays, each of shape ``(N_i, D)``. | ||
| coords_list: Per-slide coordinate arrays, each of shape ``(N_i, 2)``. | ||
| Must have the same length as ``features_list``. | ||
| sample_ids: Sample identifier for each slide (same length as | ||
| ``patch_features_h5_paths``). Slides with the same ``sample_id`` | ||
| are concatenated together. | ||
| output_dir: Directory where one ``{sample_id}.{output_h5_suffix}`` file | ||
| is written per unique sample. | ||
| output_h5_suffix: Filename suffix for output files (default | ||
| ``"features.h5"``). | ||
| ``features_list``). Slides with the same ``sample_id`` are | ||
| concatenated together. | ||
| max_tiles: If set, subsample each sample to at most this many tiles | ||
| after concatenation. ``None`` keeps all tiles. | ||
| subsampling_strategy: Strategy when subsampling — ``"random"``, | ||
| ``"proportional"``, or ``"equal"``. Ignored when ``max_tiles`` | ||
| is ``None`` or total tiles ≤ ``max_tiles``. | ||
| seed: Random seed for subsampling reproducibility (default ``42``). | ||
|
|
||
| Returns: | ||
| Ordered dict mapping each unique ``sample_id`` to a | ||
| ``(features, coords)`` tuple of concatenated (and optionally | ||
| subsampled) arrays. | ||
| """ | ||
| if len(patch_features_h5_paths) != len(sample_ids): | ||
| if not (len(features_list) == len(coords_list) == len(sample_ids)): | ||
| raise ValueError( | ||
| f"patch_features_h5_paths ({len(patch_features_h5_paths)}) and " | ||
| f"sample_ids ({len(sample_ids)}) must have the same length." | ||
| f"features_list ({len(features_list)}), coords_list ({len(coords_list)}), " | ||
| f"and sample_ids ({len(sample_ids)}) must all have the same length." | ||
| ) | ||
|
|
||
| os.makedirs(output_dir, exist_ok=True) | ||
|
|
||
| groups: dict = collections.OrderedDict() | ||
| for idx, sid in enumerate(sample_ids): | ||
| groups.setdefault(sid, []).append(idx) | ||
|
|
||
| results: dict = collections.OrderedDict() | ||
|
|
||
| for sample_id, indices in groups.items(): | ||
| logger.info("Aggregating sample %s from %d slide(s)", sample_id, len(indices)) | ||
|
|
||
|
|
@@ -1956,14 +1955,12 @@ def aggregate_sample_features( | |
| slide_sizes = [] | ||
|
|
||
| for i in indices: | ||
| h5_path = patch_features_h5_paths[i] | ||
| with h5py.File(h5_path, "r") as h5: | ||
| feats = np.array(h5["features"]) | ||
| coords = h5["coords"][:] | ||
| feats = features_list[i] | ||
| coords = coords_list[i] | ||
| all_features.append(feats) | ||
| all_coords.append(coords) | ||
| slide_sizes.append(len(feats)) | ||
| logger.debug(" slide %d: %d tiles from %s", i, len(feats), h5_path) | ||
| logger.debug(" slide %d: %d tiles", i, len(feats)) | ||
|
Comment on lines
1957
to
+1963
|
||
|
|
||
| features = np.concatenate(all_features, axis=0) | ||
| coords = np.concatenate(all_coords, axis=0) | ||
|
|
@@ -1984,11 +1981,12 @@ def aggregate_sample_features( | |
| max_tiles, | ||
| ) | ||
|
|
||
| out_path = os.path.join(output_dir, f"{sample_id}.{output_h5_suffix}") | ||
| save_hdf5(out_path, {"features": features, "coords": coords}, mode="w") | ||
| logger.info( | ||
| "Wrote %s (%d tiles, dim=%d)", out_path, len(features), features.shape[1] | ||
| "Sample %s: %d tiles, dim=%d", sample_id, len(features), features.shape[1] | ||
| ) | ||
| results[sample_id] = (features, coords) | ||
|
|
||
| return results | ||
|
|
||
|
|
||
| @timed | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -125,88 +125,77 @@ def test_subsample_tiles_invalid_strategy(): | |
|
|
||
| def test_aggregate_sample_features_single_slide(tmp_path): | ||
| """One slide per sample — output equals input.""" | ||
| h5_a = tmp_path / "slide_a.h5" | ||
| feats_a, coords_a = _write_fake_h5(h5_a, n_tiles=30) | ||
| feats_a, coords_a = _make_data(30) | ||
|
|
||
| _aggregate_sample_features( | ||
| patch_features_h5_paths=[str(h5_a)], | ||
| results = _aggregate_sample_features( | ||
| features_list=[feats_a], | ||
| coords_list=[coords_a], | ||
| sample_ids=["sample1"], | ||
| output_dir=str(tmp_path / "out"), | ||
| output_h5_suffix="features.h5", | ||
| max_tiles=None, | ||
| subsampling_strategy="random", | ||
| seed=42, | ||
| ) | ||
|
|
||
| out_h5 = tmp_path / "out" / "sample1.features.h5" | ||
| assert out_h5.exists() | ||
| with h5py.File(out_h5) as f: | ||
| np.testing.assert_array_equal(f["features"][:], feats_a) | ||
| np.testing.assert_array_equal(f["coords"][:], coords_a) | ||
| assert "sample1" in results | ||
| np.testing.assert_array_equal(results["sample1"][0], feats_a) | ||
| np.testing.assert_array_equal(results["sample1"][1], coords_a) | ||
|
|
||
|
|
||
| def test_aggregate_sample_features_multi_slide(tmp_path): | ||
| """Two slides per sample — features are concatenated.""" | ||
| h5_a = tmp_path / "slide_a.h5" | ||
| h5_b = tmp_path / "slide_b.h5" | ||
| _write_fake_h5(h5_a, n_tiles=20, seed=1) | ||
| _write_fake_h5(h5_b, n_tiles=15, seed=2) | ||
|
|
||
| _aggregate_sample_features( | ||
| patch_features_h5_paths=[str(h5_a), str(h5_b)], | ||
| rng = np.random.default_rng(0) | ||
| feats_a = rng.random((20, 4)).astype(np.float32) | ||
| coords_a = rng.integers(0, 1000, (20, 2)) | ||
| feats_b = rng.random((15, 4)).astype(np.float32) | ||
| coords_b = rng.integers(0, 1000, (15, 2)) | ||
|
|
||
| results = _aggregate_sample_features( | ||
| features_list=[feats_a, feats_b], | ||
| coords_list=[coords_a, coords_b], | ||
| sample_ids=["sampleX", "sampleX"], | ||
| output_dir=str(tmp_path / "out"), | ||
| max_tiles=None, | ||
| ) | ||
|
|
||
| out_h5 = tmp_path / "out" / "sampleX.features.h5" | ||
| assert out_h5.exists() | ||
| with h5py.File(out_h5) as f: | ||
| assert f["features"].shape == (35, 4) | ||
| assert f["coords"].shape == (35, 2) | ||
| assert results["sampleX"][0].shape == (35, 4) | ||
| assert results["sampleX"][1].shape == (35, 2) | ||
|
Comment on lines
+152
to
+160
|
||
|
|
||
|
|
||
| def test_aggregate_sample_features_two_samples(tmp_path): | ||
| """Three slides, two samples — two output files.""" | ||
| paths = [tmp_path / f"s{i}.h5" for i in range(3)] | ||
| for i, p in enumerate(paths): | ||
| _write_fake_h5(p, n_tiles=10, seed=i) | ||
| """Three slides, two samples — two entries in result.""" | ||
| rng = np.random.default_rng(0) | ||
| slides = [(rng.random((10, 4)).astype(np.float32), rng.integers(0, 1000, (10, 2))) for _ in range(3)] | ||
| features_list = [f for f, _ in slides] | ||
| coords_list = [c for _, c in slides] | ||
|
|
||
| _aggregate_sample_features( | ||
| patch_features_h5_paths=[str(p) for p in paths], | ||
| results = _aggregate_sample_features( | ||
| features_list=features_list, | ||
| coords_list=coords_list, | ||
| sample_ids=["sA", "sA", "sB"], | ||
| output_dir=str(tmp_path / "out"), | ||
| max_tiles=None, | ||
| ) | ||
|
|
||
| out_a = tmp_path / "out" / "sA.features.h5" | ||
| out_b = tmp_path / "out" / "sB.features.h5" | ||
| assert out_a.exists() and out_b.exists() | ||
| with h5py.File(out_a) as f: | ||
| assert f["features"].shape[0] == 20 | ||
| with h5py.File(out_b) as f: | ||
| assert f["features"].shape[0] == 10 | ||
| assert results["sA"][0].shape[0] == 20 | ||
| assert results["sB"][0].shape[0] == 10 | ||
|
|
||
|
|
||
| def test_aggregate_sample_features_with_subsampling(tmp_path): | ||
| """Subsampling reduces output to max_tiles.""" | ||
| h5_a = tmp_path / "s0.h5" | ||
| h5_b = tmp_path / "s1.h5" | ||
| _write_fake_h5(h5_a, n_tiles=80, seed=0) | ||
| _write_fake_h5(h5_b, n_tiles=60, seed=1) | ||
|
|
||
| _aggregate_sample_features( | ||
| patch_features_h5_paths=[str(h5_a), str(h5_b)], | ||
| rng = np.random.default_rng(0) | ||
| feats_a = rng.random((80, 4)).astype(np.float32) | ||
| coords_a = rng.integers(0, 1000, (80, 2)) | ||
| feats_b = rng.random((60, 4)).astype(np.float32) | ||
| coords_b = rng.integers(0, 1000, (60, 2)) | ||
|
|
||
| results = _aggregate_sample_features( | ||
| features_list=[feats_a, feats_b], | ||
| coords_list=[coords_a, coords_b], | ||
| sample_ids=["big", "big"], | ||
| output_dir=str(tmp_path / "out"), | ||
| max_tiles=50, | ||
| subsampling_strategy="random", | ||
| seed=99, | ||
| ) | ||
|
|
||
| out_h5 = tmp_path / "out" / "big.features.h5" | ||
| with h5py.File(out_h5) as f: | ||
| assert f["features"].shape[0] == 50 | ||
| assert results["big"][0].shape[0] == 50 | ||
|
Comment on lines
181
to
+198
|
||
|
|
||
|
|
||
| # ============================================================================= | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The CLI now loads all slide feature arrays into memory upfront (`features_list`/`coords_list`) before grouping/aggregating. Previously, aggregation read the H5 files inside the per-sample loop, so peak memory scaled with the largest sample rather than with the entire input set. For large cohorts this can be a substantial memory regression and risks OOM; consider grouping indices by `sample_id` first, then reading/aggregating one sample at a time (or otherwise streaming), so that only one sample's slides are resident in memory at once.