From cff6c4030363469c06b07467e2f7d79b767f4dda Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Thu, 12 Feb 2026 12:25:52 +0100 Subject: [PATCH] Refactor connected components logic in _build.py to improve handling of sparse matrices --- src/cellcharter/gr/_build.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/cellcharter/gr/_build.py b/src/cellcharter/gr/_build.py index 6fe431a..bbf5bf4 100644 --- a/src/cellcharter/gr/_build.py +++ b/src/cellcharter/gr/_build.py @@ -139,11 +139,15 @@ def remove_intra_cluster_links( def _connected_components(adj: sps.spmatrix, min_cells: int = 250, count: int = 0) -> np.ndarray: + # scipy.sparse.csgraph works reliably with sparse matrices; convert sparse arrays/views if needed. + if sps.issparse(adj) and not isinstance(adj, sps.spmatrix): + adj = sps.csr_matrix(adj) + n_components, labels = sps.csgraph.connected_components(adj, return_labels=True) components, counts = np.unique(labels, return_counts=True) small_components = components[counts < min_cells] - small_components_idxs = np.in1d(labels, small_components) + small_components_idxs = np.isin(labels, small_components) labels[small_components_idxs] = -1 labels[~small_components_idxs] = pd.factorize(labels[~small_components_idxs])[0] + count @@ -191,12 +195,11 @@ def connected_components( cluster_values = adata.obs[cluster_key].unique() for cluster in cluster_values: - adata_cluster = adata[adata.obs[cluster_key] == cluster] + cluster_mask = (adata.obs[cluster_key] == cluster).values + adj_cluster = adata.obsp[connectivity_key][cluster_mask, :][:, cluster_mask] - labels, n_components = _connected_components( - adj=adata_cluster.obsp[connectivity_key], min_cells=min_cells, count=count - ) - output[adata.obs[cluster_key] == cluster] = labels + labels, n_components = _connected_components(adj=adj_cluster, min_cells=min_cells, count=count) + output[cluster_mask] = labels count += n_components else: labels, n_components = _connected_components(