From 1b9d0b760cc3a38929d97aa3d1c3a4698cf3632a Mon Sep 17 00:00:00 2001 From: weslewis Date: Sat, 28 Feb 2026 14:12:39 -0500 Subject: [PATCH 1/7] Match notebook p_final defaults: w_size=0.05, w_sens=0.12; sens clip/behavior; smoothing retained --- locat/locat_condensed_git.py | 1399 ++++++++++++++++++++++++++++++++++ 1 file changed, 1399 insertions(+) create mode 100644 locat/locat_condensed_git.py diff --git a/locat/locat_condensed_git.py b/locat/locat_condensed_git.py new file mode 100644 index 0000000..c4da29d --- /dev/null +++ b/locat/locat_condensed_git.py @@ -0,0 +1,1399 @@ +import math + +import numba +import numpy as np +from loguru import logger +from scanpy import AnnData +from scipy.interpolate import PchipInterpolator +from scipy.special import logsumexp +from scipy.stats import binom, betabinom, chi2 +from sklearn.metrics.pairwise import euclidean_distances as edist +from tqdm import tqdm +from sklearn.neighbors import NearestNeighbors +from scipy.stats import norm +from locat.wgmm import WGMM +from locat.wgmms import wgmm +from locat.rgmm import softbootstrap_gmm + +class LOCATNullDistribution: + mean = None + std = None + + def __init__(self, mean_func, std_func): + self.mean = mean_func + self.std = std_func + + def to_zscore(self, raw_score, p): + return (raw_score - self.mean(p)) / self.std(p) + + @classmethod + def from_estimates(cls, p, means, stds): + return cls( + mean_func=PchipInterpolator(p, means), + std_func=PchipInterpolator(p, stds), + ) + + +class LOCAT: + _cell_dist = None + _min_dist = None + _knn = None + _background_gmm: WGMM = None + _background_pdf: np.ndarray = None + _background_logpdf: np.ndarray = None + _null_distribution: LOCATNullDistribution = None + _X = None + _n_components_waypoints = None + _disable_progress_info = True + + def __init__( + self, + adata: AnnData, + cell_embedding: np.ndarray, + k: int, + n_bootstrap_inits: int, + show_progress=False, + wgmm_dtype: str = "same", # "same" | "float32" | "float64" + knn=None, # <-- NEW: precomputed adjacency/connectivities + knn_k: int | None = None, # <-- NEW: if computing, how many neighbors (defaults to k) + knn_mode: str = "binary", # <-- NEW: "binary" or "connectivity" + ): + self._disable_progress_info = not show_progress + self._adata = adata + + emb = np.asarray(cell_embedding) + emb = (emb - emb.mean(0)) / (emb.std(0) + emb.dtype.type(1e-6)) + self._embedding = emb + self._dtype = self._embedding.dtype + self._wgmm_dtype = wgmm_dtype + + self._k = k + self.n_cells, self.n_genes = self._adata.shape + self.n_dims = self._embedding.shape[1] + self.n_bootstrap_inits = n_bootstrap_inits + self._knn = None + self._knn_k = int(knn_k) if knn_k is not None else int(k) + self._knn_mode = str(knn_mode) + + self._reg_covar = None + if knn is not None: + self.set_knn(knn) + # ------------------------------------------------------------------ + # Basic geometry / regularization + # ------------------------------------------------------------------ + @property + def cell_dist(self) -> np.ndarray: + if self._cell_dist is None: + if not self._disable_progress_info: + logger.info("recomputing cell-cell distance") + self._cell_dist = edist(self._embedding) + return self._cell_dist + + @property + def min_dist(self) -> float: + if self._min_dist is None: + if not self._disable_progress_info: + logger.info("recomputing min cell-cell distance") + self._min_dist = np.mean( + np.nanmin( + np.where(self.cell_dist > 0, self.cell_dist, np.nan), + axis=1, + ) + ) + return self._min_dist + + def reg_covar(self, sample_size=None): + if self._reg_covar is not None: + return self._dtype.type(self._reg_covar) + + base = (self.min_dist ** (2 / self.n_dims)) / 6 + if sample_size is None: + rc = base + else: + c = 1 - 2 / self.n_dims + adj = np.sqrt((self.n_cells + c) / (sample_size + c)) + rc = base * adj + + rc = max(rc, 1e-4) + return self._dtype.type(rc) + + # ------------------------------------------------------------------ + # Data access / options + # ------------------------------------------------------------------ + @property + def X(self): + if self._X is None: + X = self._adata.X + if not isinstance(X, np.ndarray): + X = X.toarray() + self._X = X.astype(self._dtype, copy=False) + return self._X + + def show_progress(self, show_progress=True): + self._disable_progress_info = not show_progress + + # ------------------------------------------------------------------ + # Background GMM and LTST null + # ------------------------------------------------------------------ + def background_n_components_init(self, weights_transform=None, min_points=10, n_reps=30): + if not self._disable_progress_info: + logger.info("Estimating number of GMM components") + + self._n_components_waypoints = np.zeros(shape=(3, 2)) + self._n_components_waypoints[0, 0] = min_points + self._n_components_waypoints[1, 0] = int(np.sqrt(self.n_cells)) + self._n_components_waypoints[2, 0] = self.n_cells + + self._n_components_waypoints[0, 1] = 1 + + bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 + + if weights_transform is not None: + Xdense = self.X.copy() + for i in range(self.n_genes): + Xdense[:, i] = weights_transform(Xdense[:, i]) + weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) + else: + weights = np.ones((self.n_cells,), dtype=self._dtype) + + weights = np.clip(weights, self._dtype.type(1e-6), np.inf).astype( + self._dtype, copy=False + ) + s = float(weights.sum()) + weights = (weights / s) if s > 0 else np.full_like(weights, 1.0 / len(weights)) + weights[~np.isfinite(weights)] = self._dtype.type(1.0 / len(weights)) + + # Full-data BIC search for big-N waypoint + min_search = 1 + max_search = 10 + keep_searching = True + bic = [] + while keep_searching: + for i in tqdm( + range(min_search, max_search), + desc=f"estimating BIC for {self._n_components_waypoints[2, 0]:.0f} cells", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + p = self.fit_wgmm(n_comp=i, weights=weights).pdf(self._embedding) + bic.append( + bic_component_cost * i * np.log(self.n_cells) + - np.sum(2 * np.log(p[p > 0])) + ) + self._n_components_waypoints[2, 1] = np.argmin(bic) + 1 + if self._n_components_waypoints[2, 1] >= len(bic) - 2: + min_search = max_search + max_search = max_search + 5 + else: + keep_searching = False + + # sqrt(n_cells) waypoint + n = np.max([min_points + 1, int(np.ceil(np.sqrt(self.n_cells)))]) + bic = np.zeros(shape=(int(self._n_components_waypoints[2, 1]), n_reps)) + + o = np.arange(self.n_cells) + wgs = np.zeros(shape=(self.n_cells, n_reps)) + for i_rep in range(n_reps): + np.random.shuffle(o) + wgs[o < n, i_rep] = weights[o < n] + wgs = wgs / np.sum(wgs, axis=0) + + for i_rep in tqdm( + range(n_reps), + desc=f"estimating BIC for {self._n_components_waypoints[1, 0]:.0f} cells", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + for i in range(1, bic.shape[0] + 1): + p = self.fit_wgmm(n_comp=i, weights=wgs[:, i_rep]).pdf(self._embedding) + bic[i - 1, i_rep] = ( + bic_component_cost * i * np.log(n) - np.sum(2 * np.log(p[p > 0])) + ) + self._n_components_waypoints[1, 1] = np.argmin(np.median(bic, axis=1)) + 1 + + def auto_bkg_components(self, n_points, weights_transform=None): + if self._n_components_waypoints is None: + self.background_n_components_init(weights_transform=weights_transform) + + n_components = np.interp( + np.log(n_points), + xp=np.log(self._n_components_waypoints[:, 0]), + fp=np.log(self._n_components_waypoints[:, 1]), + ) + return int(np.round(np.exp(n_components))) + + def _auto_n_effective_weights(self, weights=None, min_dist_cutoff=None): + if min_dist_cutoff is None: + min_dist_cutoff = self.min_dist * 3 + + rweights = np.zeros(shape=(self.n_cells,)) + if weights is None: + wg0 = rweights == 0 + else: + wg0 = weights > 0 + + if np.sum(wg0) > 5: + # only compute the components to describe the cells that are close enough + cell_include = ( + np.min(self.cell_dist[wg0, :][:, wg0] + np.eye(np.sum(wg0)) * 10, axis=1) + < min_dist_cutoff + ) + rweights[wg0] = cell_include.astype(float) + return rweights + return None + + def auto_n_components(self, coords, weights=None, indices=None, min_points_fraction=0.95): + if weights is None: + weights = np.ones(shape=(coords.shape[0],)) + + if indices is None: + if coords.shape[0] != self.n_cells: + raise ValueError("A selection was made but the indices were not passed") + indices = np.full_like(weights, fill_value=True) + + logpdf = self._background_logpdf[indices, :] * weights[:, None] + id_component = np.argmax(logpdf, axis=1) + + component_counts = np.bincount( + id_component[weights > 0], minlength=self._background_gmm.n_comp + ) + n_counts = np.sum(component_counts) + component_counts[component_counts < 5] = 0 + if np.sum(component_counts) < 10: + return 1 + else: + component_counts = component_counts / np.sum(component_counts) + estimated_n_components = ( + np.min( + np.flatnonzero( + np.cumsum(np.sort(component_counts)[::-1]) > min_points_fraction + ) + ) + + 1 + ) + estimated_n_components = int(np.floor(np.sqrt(estimated_n_components + 1)) + 1) + return int( + np.max( + [ + 1, + np.min( + [ + self._background_gmm.n_comp, + n_counts / 5, + estimated_n_components, + ] + ), + ] + ) + ) + def set_knn(self, knn): + """ + Store a precomputed KNN graph. + + Accepts: + - scipy sparse (csr/csc/coo) adjacency or connectivities + - dense numpy array adjacency/connectivities + + Expected shape: (n_cells, n_cells) + """ + if sp.issparse(knn): + K = knn.tocsr() + else: + K = np.asarray(knn) + + if K.shape != (self.n_cells, self.n_cells): + raise ValueError(f"knn must have shape {(self.n_cells, self.n_cells)}, got {K.shape}") + + # zero diagonal to avoid self-neighbor artifacts + if sp.issparse(K): + K = K.tolil() + K.setdiag(0.0) + K = K.tocsr() + K.eliminate_zeros() + else: + np.fill_diagonal(K, 0.0) + + self._knn = K + + + def knn(self): + """ + Return a KNN adjacency/connectivity matrix. + + If a KNN was provided at init (or via set_knn), returns it. + Otherwise computes one from the embedding. + """ + if self._knn is not None: + return self._knn + + k = int(self._knn_k) + if k <= 0: + raise ValueError("knn_k must be >= 1") + + # Build kNN from embedding (exclude self by taking k+1 then dropping self) + nbrs = NearestNeighbors(n_neighbors=min(k + 1, self.n_cells), metric="euclidean") + nbrs.fit(self._embedding) + dists, inds = nbrs.kneighbors(self._embedding, return_distance=True) + + # Drop self neighbor (usually first column) + inds = inds[:, 1: k + 1] + + rows = np.repeat(np.arange(self.n_cells), inds.shape[1]) + cols = inds.reshape(-1) + + if self._knn_mode == "binary": + data = np.ones_like(cols, dtype=self._dtype) + elif self._knn_mode == "connectivity": + # simple distance-based weights; you can swap for something else + dd = dists[:, 1: k + 1].reshape(-1) + data = (1.0 / (dd + 1e-6)).astype(self._dtype, copy=False) + else: + raise ValueError("knn_mode must be 'binary' or 'connectivity'") + + K = sp.csr_matrix((data, (rows, cols)), shape=(self.n_cells, self.n_cells)) + K.eliminate_zeros() + self._knn = K + return self._knn + + + def background_pdf( + self, + n_comp=None, + reps=10, + total_counts_weight=True, + weights_transform=None, + force_refresh=False, + ): + if (self._background_pdf is None) or force_refresh: + if not self._disable_progress_info: + logger.info("fitting background PDF") + + if n_comp is None: + if self._n_components_waypoints is None: + self.background_n_components_init(weights_transform=weights_transform) + n_comp = np.interp( + np.log(self.n_cells), + xp=np.log(self._n_components_waypoints[:, 0]), + fp=np.log(self._n_components_waypoints[:, 1]), + ) + n_comp = int(np.round(np.exp(n_comp))) + if not self._disable_progress_info: + logger.info(f"Using {n_comp} components") + + if weights_transform is not None: + Xdense = self.X.copy() + for i in range(self.n_genes): + Xdense[:, i] = weights_transform(Xdense[:, i]) + weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) + else: + weights = np.ones((self.n_cells,), dtype=self._dtype) + + weights = np.clip(weights, self._dtype.type(1e-6), np.inf).astype( + self._dtype, copy=False + ) + s = float(weights.sum()) + weights = (weights / s) if s > 0 else np.full_like(weights, 1.0 / len(weights)) + weights[~np.isfinite(weights)] = self._dtype.type(1.0 / len(weights)) + + self._background_pdf = np.zeros(shape=(self.n_cells,)) + self._background_logpdf = np.zeros(shape=(self.n_cells, n_comp)) + + background_gmm = None + for _ in tqdm( + range(reps), + desc="fitting background", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + background_gmm = self.fit_wgmm(n_comp, weights=weights) + self._background_pdf += background_gmm.pdf(self._embedding) + self._background_logpdf += background_gmm.loglikelihood_by_component( + self._embedding, np.ones(shape=self.n_cells) + ) + self._background_gmm = background_gmm + self._background_pdf /= reps + self._background_logpdf /= reps + + self.estimate_null_parameters() + return self._background_pdf + + def signal_gmm(self, weights, n_comp=None): + if n_comp is None: + comp_weights = self._auto_n_effective_weights(weights) + if comp_weights is None: + n_comp = 1 + else: + loc_indices = comp_weights > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + weights[loc_indices], + indices=loc_indices, + ) + + return self.fit_wgmm(n_comp, weights=weights) + + def fit_wgmm(self, n_comp, weights=None) -> WGMM: + if weights is None: + weights = np.ones(shape=(self.n_cells,)) + + pis, mus, sigmas, _ = wgmm( + self._embedding, + raw_weights=weights, + n_components=n_comp, + n_inits=1, + reg_covar=self.reg_covar(), + ) + pis = np.array(pis) + mus = np.array(mus) + sigmas = np.array(sigmas) + return WGMM(pis, mus, sigmas) + + def estimate_null_parameters(self, fractions=None, n_reps=50): + """ + Estimate LTST null mean/std as a function of expression fraction p + using random pseudo-genes: + - pick random expressing cells at frequency p + - fit signal GMM with the *same* pipeline as real genes + - compute LTST exactly as in gmm_scan_new + """ + if fractions is None: + fractions = 10 ** np.linspace(np.log10(10 / self.n_cells), 0, 7) + + f0 = self.background_pdf() + scores = [] + + for frac in tqdm( + fractions, + desc="null distribution parameters (perm. pseudo-genes)", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + n_pos = max(5, int(round(frac * self.n_cells))) + if n_pos >= self.n_cells: + n_pos = self.n_cells - 1 + + ltst_vals = [] + + for _ in range(n_reps): + mask = np.zeros(self.n_cells, dtype=bool) + mask[np.random.choice(self.n_cells, n_pos, replace=False)] = True + gene_prior = mask.astype(self._dtype) + + comp_gene_prior = self._auto_n_effective_weights(gene_prior) + if comp_gene_prior is None: + n_comp = 1 + else: + loc_indices = comp_gene_prior > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + gene_prior[loc_indices], + loc_indices, + ) + gmm1 = self.signal_gmm(weights=gene_prior, n_comp=n_comp) + + i1 = gene_prior > 0 + loc_f1 = gmm1.pdf(self._embedding[i1, :]) + p1 = float(np.mean(i1)) + + ltst_score = ltst_score_func(f0[i1], loc_f1, p1) + + w_expr = gene_prior[i1] + w_expr = w_expr / (w_expr.sum() if w_expr.sum() > 0 else 1.0) + ltst_vals.append(float(np.dot(w_expr, ltst_score))) + + ltst_vals = np.asarray(ltst_vals, dtype=float) + scores.append( + [ + frac, + float(np.mean(ltst_vals)), + float(np.std(ltst_vals) + 1e-9), + ] + ) + + scores = np.asarray(scores) + self._null_distribution = LOCATNullDistribution.from_estimates( + p=scores[:, 0], + means=scores[:, 1], + stds=scores[:, 2], + ) + + # ------------------------------------------------------------------ + # Depletion-style localization scan + # ------------------------------------------------------------------ + def localization_pval_dep_scan( + self, + gmm1, + gene_prior, + *, + c_values=None, + soft_bound=None, # default computed from n: max((n-1)/n, 0.99) + min_p0_abs=0.10, + min_expected=30, + min_abs_deficit=0.02, + n_trials_cap=500, + weight_mode="amount", + p_floor=1e-12, + n_eff_scale=1.0, + rho_bb=0.2, # >0 enables Beta–Binomial tail + eps_rel=0.01, + debug=False, + debug_store_masks=False, + debug_max_cells=5000, + ): + if self._background_gmm is None: + _ = self.background_pdf() + + X = self._embedding + n = int(X.shape[0]) + if soft_bound is None: + soft_bound = max((n - 1) / max(n, 1), 0.99) + + gp = np.asarray(gene_prior, float) + if weight_mode == "binary": + w_obs = (gp > 0).astype(float) + elif weight_mode == "amount": + w_obs = np.clip(gp, 0.0, np.inf) + else: + raise ValueError("weight_mode must be 'amount' or 'binary'") + + # Kish n_eff with tempering + cap + sw = float(np.sum(w_obs)) + n_eff = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) + n_eff *= float(n_eff_scale) + if n_trials_cap is not None: + n_eff = min(n_eff, float(n_trials_cap)) + n_eff = max(1.0, n_eff) + n_trials = int(round(n_eff)) + + f0_x = np.clip(self._background_gmm.pdf(X), 1e-300, np.inf) + f1_x = np.clip(gmm1.pdf(X), 1e-300, np.inf) + w0 = f0_x / float(np.sum(f0_x)) + + w_obs_alpha = w_obs / (sw if sw > 0 else 1.0) + w0_alpha = w0 + + if c_values is None: + c_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) + + best_logp = None + best = {"c": None, "k_obs": None, "p0": None, "obs_prop": None} + scanned = 0 + tested = 0 + + per_c = [] if debug else None + + for c in c_values: + c = float(c) + in_R_mask = f0_x > c * f1_x + + p0_abs = float(np.sum(w0_alpha * in_R_mask)) + reason = None + if p0_abs < min_p0_abs: + reason = "fail:min_p0_abs" + if n_eff * p0_abs < min_expected: + reason = "fail:min_expected" + if p0_abs > soft_bound: + reason = "fail:soft_bound" + + obs_prop = float(np.sum(w_obs_alpha * in_R_mask)) + if reason is None: + if (p0_abs - obs_prop) < min_abs_deficit: + reason = "fail:min_abs_deficit" + elif obs_prop > (p0_abs / c) * (1.0 - float(eps_rel)): + reason = "fail:c_bound" #this checks whether observed f1 density in the region is at an equal or lower proportion than the observed f0 density in the region * c (where c is the contrast). If the region is larger than expectation, reject the gene. + + if debug: + rec = { + "c": c, + "p0_abs": p0_abs, + "obs_prop": obs_prop, + "n_eff": float(n_eff), + "n_eff_expected": float(n_eff * p0_abs), + "reason": reason, + } + if debug_store_masks: + ncap = min(int(debug_max_cells), n) + rec["in_R_idx"] = np.flatnonzero(in_R_mask[:ncap]).astype(int) + if weight_mode == "binary": + expr_mask = gp > 0 + rec["expr_in_R_count"] = int(np.sum(expr_mask & in_R_mask)) + per_c.append(rec) + + if reason is not None: + continue + + scanned += 1 + tested += 1 + + k_eff = int(np.rint(obs_prop * n_trials)) + p0_clip = np.clip(p0_abs, 1e-12, 1 - 1e-12) + + if rho_bb and rho_bb > 0.0: + ab_sum = max(1.0 / float(rho_bb) - 1.0, 2.0) + alpha = float(p0_clip * ab_sum) + beta = float((1.0 - p0_clip) * ab_sum) + p_raw = float(betabinom.cdf(k_eff, n_trials, alpha, beta)) + logp_raw = np.log(max(p_raw, np.finfo(float).tiny)) + else: + lFkm1 = binom.logcdf(k_eff - 1, n_trials, p0_clip) + lpk = binom.logpmf(k_eff, n_trials, p0_clip) + np.log(0.5) + logp_raw = logsumexp([lFkm1, lpk]) + + if (best_logp is None) or (logp_raw < best_logp): + best_logp = float(logp_raw) + best.update( + {"c": c, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} + ) + + if tested == 0: + out = { + "p_value": 1.0, + "raw_min_p": 1.0, + "log_p_single": 0.0, + "log_p_sidak": 0.0, + "neglog10_p_single": 0.0, + "neglog10_p_sidak": 0.0, + "best_c": None, + "k_obs_eff": None, + "p0_abs": None, + "obs_prop": None, + "scanned": int(scanned), + "tested": int(tested), + "sidak_penalty": 1, + "n": n, + "n_eff": float(n_eff), + "guards": { + "min_p0_abs": float(min_p0_abs), + "min_expected": float(min_expected), + "min_abs_deficit": float(min_abs_deficit), + "soft_bound": float(soft_bound), + "eps_rel": float(eps_rel), + }, + "n_eff_scale": float(n_eff_scale), + "rho_bb": float(rho_bb), + } + if debug: + out["per_c"] = per_c + return out + + m_eff = tested + sidak_logp = _logsidak_from_logp(best_logp, m_eff) + + p_value = float(np.exp(sidak_logp)) + raw_min_p = float(np.exp(best_logp)) + p_value = _safe_p(max(p_value, p_floor)) + raw_min_p = _safe_p(max(raw_min_p, p_floor)) + + out = { + "p_value": p_value, + "raw_min_p": raw_min_p, + "log_p_single": float(best_logp), + "log_p_sidak": float(sidak_logp), + "neglog10_p_single": float(-best_logp / np.log(10)), + "neglog10_p_sidak": float(-sidak_logp / np.log(10)), + "best_c": best["c"], + "k_obs_eff": best["k_obs"], + "p0_abs": best["p0"], + "obs_prop": best["obs_prop"], + "scanned": int(scanned), + "tested": int(tested), + "sidak_penalty": 1, + "n": n, + "n_eff": float(n_eff), + "guards": { + "min_p0_abs": float(min_p0_abs), + "min_expected": float(min_expected), + "min_abs_deficit": float(min_abs_deficit), + "soft_bound": float(soft_bound), + "eps_rel": float(eps_rel), + }, + "n_eff_scale": float(n_eff_scale), + "rho_bb": float(rho_bb), + } + if debug: + out["per_c"] = per_c + return out + + # ------------------------------------------------------------------ + # Main scan used in practice + # ------------------------------------------------------------------ + def bic_score(self, gmm1, gene_prior): + bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 + p = gmm1.pdf(self._embedding[gene_prior > 0, :]) + n_cells = np.sum(gene_prior > 0) + bic = ( + bic_component_cost * gmm1.n_comp * np.log(np.sum(gene_prior > 0)) + - 2 * np.sum(p[p > 0]) + ) + return bic / n_cells + + def gmm_scan_new( + self, + genes=None, + weights_transform=None, + zscore_thresh=None, + max_freq=0.5, + verbose=False, + n_bootstrap_inits=None, + # Depletion-scan defaults + rc_c_values=None, # default inside method + rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated + rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated + rc_min_abs_deficit=0.02, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region + rc_n_trials_cap=500, #maximum effective sample size + rc_soft_bound=1.0, #this is unused/can be removed + rc_n_eff_scale=0.5, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes + rc_p_floor=1e-12, # this is just model precision, can be ignored + rc_rho_bb=0.02, #this is the strength of the beta binomial (0.0 is standard binomial, set at 0.02-0.05 for wider tails) + rc_weight_mode="amount", + rc_eps_rel=0.01, + ): + if verbose: + print("gmm_scan_new: using depletion scan for localization_pval (localization_pval_dep_scan)") + + if n_bootstrap_inits is not None: + self.n_bootstrap_inits = int(n_bootstrap_inits) + + locally_enriched = dict() + gzeros, freqzeros, zzeros = [], [], [] + inclgenes = self.get_genes_indices(genes) + + f0 = self.background_pdf(weights_transform=weights_transform) + for i_gene in tqdm( + inclgenes, + desc="scanning genes", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + gene_prior = self.background_pdf(i_gene, weights_transform) + try: + if np.sum(gene_prior) == 0: + gzeros.append(self._adata.var_names[i_gene]) + continue + if np.mean(gene_prior > 0) > max_freq: + freqzeros.append(self._adata.var_names[i_gene]) + continue + + comp_gene_prior = self._auto_n_effective_weights(gene_prior) + if comp_gene_prior is None: + n_comp = 1 + else: + loc_indices = comp_gene_prior > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + gene_prior[loc_indices], + loc_indices, + ) + gmm1 = self.signal_gmm(weights=gene_prior, n_comp=n_comp) + + i1 = gene_prior > 0 + loc_f1 = gmm1.pdf(self._embedding[i1, :]) + p1 = np.mean(i1) + sample_size = p1 * self.n_cells + + ltst_score = ltst_score_func(f0[i1], loc_f1, p1) + sens_score = sens_score_func(f0[i1], loc_f1, i1[i1]) + + zscore = np.dot(gene_prior[i1], ltst_score) / np.sum(gene_prior[i1]) + zscore = self._null_distribution.to_zscore(zscore, p1) + if (zscore_thresh is not None) and (zscore < zscore_thresh): + zzeros.append(self._adata.var_names[i_gene]) + continue + + cs_res = self.localization_pval_dep_scan( + gmm1, + gene_prior, + debug=True, + c_values=( + rc_c_values + if rc_c_values is not None + else np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) + ), + soft_bound=rc_soft_bound, + min_p0_abs=rc_min_p0_abs, + min_expected=rc_min_expected, + min_abs_deficit=rc_min_abs_deficit, + n_trials_cap=rc_n_trials_cap, + weight_mode=rc_weight_mode, + p_floor=rc_p_floor, + n_eff_scale=rc_n_eff_scale, + rho_bb=rc_rho_bb, + eps_rel=rc_eps_rel, + ) + localization_pval = _safe_p(cs_res["p_value"]) + + concentration_pval = _safe_p(float(normal_sf(zscore, 0.0, 1.0))) + + p_cauchy = cauchy_combine([localization_pval, concentration_pval]) + p_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) + # match notebook behavior: clip sens then use (1 - sens) (sens_power==1.0) + sens = np.clip(sens_score, 1e-6, 1.0) + p_sens = _safe_p(1.0 - sens) + # use notebook weights (w_size=0.05, w_sens=0.12) + p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * p_size) * (1.0 - 0.12 * p_sens) + p_final = float(smooth_qvals(np.array([_safe_p(p_final)]))[0]) + + locally_enriched[self._adata.var_names[i_gene]] = { + "bic": self.bic_score(gmm1, gene_prior), + "zscore": zscore, + "sens_score": sens_score, + "localization_pval": localization_pval, + "concentration_pval": concentration_pval, + "pval": p_final, + "n_components": n_comp, + "sample_size": sample_size, + "depl_scan": cs_res, + } + + except ValueError as e: + if verbose: + logger.info(e) + + if verbose: + print("gzeros:", len(gzeros), "freqzeros:", len(freqzeros), "zzeros:", len(zzeros)) + return locally_enriched + + # ------------------------------------------------------------------ + # Small helpers still used by gmm_scan_new + # ------------------------------------------------------------------ + def get_genes_indices(self, genes): + inclgenes = range(self.n_genes) + list_genes = self._adata.var_names.tolist() + if genes is not None: + inclgenes = [list_genes.index(i) for i in genes] + return inclgenes + + def get_gene_prior(self, i_gene, weights_transform): + gene_prior = self.X[:, i_gene] + if weights_transform is not None: + gene_prior = weights_transform(gene_prior) + return gene_prior + + def signal_pdf(self, weights, n_comp=None): + gmm = self.signal_gmm(weights=weights, n_comp=n_comp, ) + return gmm.pdf(self._embedding) + + def gmm_loglikelihoodtest(self, genes=None, weights_transform=None, max_freq=0.5): + res = dict() + bkg_df = self.auto_bkg_components(self.n_cells) + log_bkg_pdf = self.background_pdf(weights_transform=weights_transform) + bkg_pdf_gt0 = log_bkg_pdf > 0 + log_bkg_pdf = np.where(bkg_pdf_gt0, np.log(log_bkg_pdf), 0) + + inclgenes = self.get_genes_indices(genes) + + for i_gene in tqdm(inclgenes, desc=f'scanning genes', + position=0, leave=True, disable=self._disable_progress_info): + gene_prior = self.get_gene_prior(i_gene, weights_transform) + gene_prior_gt0 = gene_prior > 0 + n_gene_prior = np.sum(gene_prior_gt0) + pv = 1.0 + df = None + lr = None + n_comp = 0 + sample_size = None + + if (n_gene_prior > 5) & ((n_gene_prior / len(gene_prior)) < max_freq): + comp_gene_prior = self._auto_n_effective_weights(gene_prior) + if comp_gene_prior is None: + n_comp = 1 + else: + loc_indices = comp_gene_prior > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + gene_prior[loc_indices], + indices=loc_indices, # <-- required + ) + + + f1 = self.signal_pdf(weights=gene_prior, n_comp=n_comp) + + df = (bkg_df - n_comp) + 1 + ix = gene_prior_gt0 & (f1 > 0) + if np.sum(ix) > 5: + ix = ix & bkg_pdf_gt0 + sample_size = np.sum(ix) + + if sample_size == 0: + pv = 0 + + elif sample_size > 5: + lr = self.calc_lratio(f1, ix, log_bkg_pdf, sample_size) + pv = 1 - chi2.cdf(lr, df) + + res[self._adata.var_names[i_gene]] = { + 'llratio_pvalue': pv, + 'llratio_sample_size': sample_size, + 'llratio_stat': lr, + 'llratio_df': df, + 'n_comp': n_comp, + } + + return res + + def gmm_local_pvalue( + self, + genes=None, + n_comp=None, + weights_transform=None, + alpha=0.05, + n_inits=100, + normalize_knn=True, + eps=1e-12, + ): + from scipy.stats import rankdata + from sklearn.metrics import roc_curve + import scipy.sparse as sp + import numpy as np + + f0 = np.asarray(self.background_pdf(weights_transform=weights_transform), dtype=self._dtype).ravel() + K = self.knn() + + # Optional: row-normalize K so every cell has comparable neighborhood "mass" + if normalize_knn: + if sp.issparse(K): + deg = np.asarray(K.sum(axis=1)).ravel() + inv = 1.0 / np.maximum(deg, eps) + Kuse = sp.diags(inv).dot(K) + else: + deg = K.sum(axis=1) + Kuse = K / np.maximum(deg[:, None], eps) + else: + Kuse = K + + locally_enriched = {} + inclgenes = self.get_genes_indices(genes) + + for i_gene in tqdm( + inclgenes, + desc="scanning genes", + position=0, + leave=True, + disable=self._disable_progress_info, + ): + gene_prior = self.get_gene_prior(i_gene, weights_transform) + gp = np.asarray(gene_prior, dtype=self._dtype).ravel() + + sw = float(gp.sum()) + if sw <= 0: + continue # or store null result if you prefer + + # signal pdf + LTST + f1 = np.asarray(self.signal_pdf(weights=gp, n_comp=n_comp), dtype=self._dtype).ravel() + p = float(np.mean(gp > 0)) + ltst_score = ltst_score_func(f0, f1, p) # (n_cells,) + + zscore = float(np.dot(gp, ltst_score) / sw) + + # random pseudo-genes (must return f2: (n_cells, n_inits), rweights: (n_cells, n_inits)) + f2, rweights = self.random_pdf(weights=gp, n_comp=n_comp, n_inits=n_inits) + f2 = np.asarray(f2, dtype=self._dtype) + rweights = np.asarray(rweights, dtype=self._dtype) + + # p2 is per-init expression fraction + p2 = np.mean(rweights > 0, axis=0).astype(self._dtype, copy=False) # (n_inits,) + + ltst_score2 = ltst_score_func(f0[:, None], f2, p2) # (n_cells, n_inits) + + # empirical zscore p-value (avoid 0/1 extremes) + z2 = (np.sum(rweights * ltst_score2, axis=0) / sw).astype(np.float64) # (n_inits,) + z_p = (1.0 + np.sum(z2 >= zscore)) / (n_inits + 1.0) + + # neighborhood-smoothed statistics + if sp.issparse(Kuse): + wstat1 = np.asarray(Kuse.dot(ltst_score)).ravel() + wstat2 = np.asarray(Kuse.dot(ltst_score2)) + else: + wstat1 = (Kuse @ ltst_score).ravel() + wstat2 = (Kuse @ ltst_score2) + + # empirical p-values per cell: rank among [wstat1, wstat2...] + # shape: (n_cells, n_inits+1) + M = np.concatenate([wstat1[:, None], wstat2], axis=1) + ranks = rankdata(-M, axis=1, method="average")[:, 0] # rank of observed in each row + emp_p = ranks / (n_inits + 1.0) # in (0,1] + + # pick a cutoff from ROC if there is signal + empirical_h1 = emp_p < (alpha / self.n_cells) + + wstat1_cutoff = 0.0 + if np.any(empirical_h1) and np.any(~empirical_h1): + fpr, tpr, cuts = roc_curve(empirical_h1.astype(int), wstat1) + # maximize balanced accuracy ( (tpr + (1-fpr)) / 2 ) + j = np.argmax(((1 - fpr) + tpr) / 2.0) + wstat1_cutoff = float(cuts[j]) + + local_res = { + "wstat_cutoff": wstat1_cutoff, + "wstat_alpha": float(alpha), + "wstat_pvalues": emp_p.astype(np.float32, copy=False), + "zscore": float(zscore), + "zscore_pvalue": float(z_p), + "wstat_repetitions": int(n_inits), + } + + sig_idx = np.flatnonzero(wstat1 > wstat1_cutoff) + local_res["wstat_significant"] = sig_idx if sig_idx.size else None + local_res["wstat_significant_clusters"] = None # placeholder + + locally_enriched[self._adata.var_names[i_gene]] = local_res + + return locally_enriched + + + def gmm_local_scan(self, + genes=None, + weights_transform=None, + zscore_thresh=None, + max_freq=0.5): + + locally_enriched = dict() + if zscore_thresh is None: + zscore_thresh = 1.0 + K = self.knn() + + if sp.issparse(K): + knn_neis = np.asarray(K.sum(axis=1)).ravel() + inv = 1.0 / np.maximum(knn_neis, 1e-12) + # row-normalize: D^{-1} K + knn_norm = sp.diags(inv).dot(K) + else: + knn_neis = K.sum(axis=1) + knn_norm = K / np.maximum(knn_neis[:, None], 1e-12) + + inclgenes = self.get_genes_indices(genes) + + f0 = self.background_pdf(weights_transform=weights_transform) + f0 = np.asarray(f0, dtype=self._dtype) + for i_gene in tqdm(inclgenes, desc=f'scanning genes', + position=0, leave=True, disable=self._disable_progress_info): + gene_prior = self.get_gene_prior(i_gene, weights_transform) + try: + if np.sum(gene_prior) == 0: + continue + + if np.mean(gene_prior > 0) > max_freq: + continue + + comp_gene_prior = self._auto_n_effective_weights(gene_prior) + if comp_gene_prior is None: + n_comp = 1 + else: + loc_indices = comp_gene_prior > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + gene_prior[loc_indices], + indices=loc_indices, # <-- THIS is the missing piece + ) + + + f1 = self.signal_pdf(weights=gene_prior, n_comp=n_comp) + expr = (gene_prior > 0).astype(self._dtype, copy=False) + + # p1 = local expression fraction around each cell + if sp.issparse(knn_norm): + p1 = knn_norm.dot(expr) + else: + p1 = knn_norm @ expr + p1 = np.asarray(p1, dtype=self._dtype).ravel() + f1 = np.asarray(f1, dtype=self._dtype).ravel() + # compute ltst per-cell then smooth by knn + ltst = ltst_score_func( + np.asarray(f0, dtype=self._dtype), + np.asarray(f1, dtype=self._dtype), + p1, + ) + + if sp.issparse(knn_norm): + local_zscore = knn_norm.dot(ltst) + else: + local_zscore = knn_norm @ ltst + local_zscore = np.asarray(local_zscore, dtype=self._dtype).ravel() + + local_zscore = self._null_distribution.to_zscore(local_zscore, p1) + + K = self.knn() + expr_mask = (gene_prior > 0).astype(np.float32) # (n,) + sig_mask = (f1 > f0).astype(np.float32) # (n,) + + if sp.issparse(K): + expr_neighbors = K.dot(expr_mask) # how many expressing neighbors (weighted) + sig_expr_neighbors = K.dot(expr_mask * sig_mask) # how many expressing-and-signal neighbors + else: + expr_neighbors = K @ expr_mask + sig_expr_neighbors = K @ (expr_mask * sig_mask) + + local_lscore = sig_expr_neighbors / np.maximum(expr_neighbors, 1e-12) + local_lscore = np.asarray(local_lscore).ravel() + + + # rate + # local_zscore = local_zscore * knn_neis / self.n_cells + assert K.shape == (self.n_cells, self.n_cells), (K.shape, self.n_cells) + assert f0.shape == (self.n_cells,), (f0.shape, self.n_cells) + assert f1.shape == (self.n_cells,), (f1.shape, self.n_cells) + + + localization_pval = localization_pvalue_nn_func(gene_prior, f1, f0, K) + concentration_pval = norm.sf(local_zscore) + #localization_pval = 1.0 + #concentration_pval = norm.sf(local_zscore) + + + if np.any(local_zscore > zscore_thresh): + locally_enriched[self._adata.var_names[i_gene]] = { + 'enriched': np.flatnonzero(local_zscore > zscore_thresh), + 'local_zscore': local_zscore, + 'local_lscore': local_lscore, + 'localization_pval': localization_pval, + 'concentration_pval': concentration_pval, + 'local_pvalue': localization_pval + concentration_pval - localization_pval*concentration_pval, + 'n_components': n_comp + } + except Exception as e: + logger.exception(e) # prints full traceback + raise # optional: stop on first failure + + return locally_enriched + + @staticmethod + def calc_lratio(f1, ix, log_bkg_pdf, sample_size, eps=1e-300): + """ + Per-cell LRT contribution on expressing cells: + -2 * sum_{i in ix} ( log f0(i) - log f1(i) ) / sample_size + """ + # ensure 1D arrays + f1 = np.asarray(f1).ravel() + ix = np.asarray(ix, dtype=bool).ravel() + log_bkg_pdf = np.asarray(log_bkg_pdf).ravel() + + # log f1 safely (no mutation) + log_f1 = np.log(np.clip(f1, eps, np.inf)) + + denom = float(sample_size) if sample_size > 0 else 1.0 + return float((-2.0 * np.sum((log_bkg_pdf[ix] - log_f1[ix])) ) / denom) + + def random_pdf( + self, + weights, + n_comp=None, + n_inits=300, + buckets=None, + ): + import numpy as np + from scipy.stats import multivariate_normal as mnorm # <-- add back + + weights = np.asarray(weights, dtype=self._dtype).ravel() + + if n_comp is None: + comp_weights = self._auto_n_effective_weights(weights) + if comp_weights is None: + n_comp = 1 + else: + loc_indices = comp_weights > 0 + n_comp = self.auto_n_components( + self._embedding[loc_indices, :], + weights[loc_indices], + indices=loc_indices, # <-- IMPORTANT + ) + + pis, mus, sigmas, rweights = softbootstrap_gmm( + self._embedding, + raw_weights=weights, + n_components=n_comp, + reg_covar=self.reg_covar(), + n_inits=n_inits, + buckets=buckets, + ) + + pis = np.asarray(pis) + mus = np.asarray(mus) + sigmas = np.asarray(sigmas) + rweights = np.asarray(rweights) + + dtot = np.zeros((self.n_cells, n_inits), dtype=np.float64) + + # NOTE: this loop is slow but correct; optimize later if needed + for j in range(n_inits): + for i in range(n_comp): + w = pis[j, i] + if w == 0: + continue + c0 = mnorm(mean=mus[j, i], cov=sigmas[j, i], allow_singular=True) + dtot[:, j] += w * c0.pdf(self._embedding) + + return dtot.astype(self._dtype, copy=False), rweights.astype(self._dtype, copy=False) + + +# ---------------------------------------------------------------------- +# JIT-accelerated scoring helpers +# ---------------------------------------------------------------------- +@numba.jit(nopython=True) +def ltst_score_func(f0, f1, p): + q = np.sqrt(p) + return (q * (1 - q)) * (f1 - f0) / (f1 + f0) + + +@numba.jit(nopython=True) +def sens_score_func(f0, f1, i): + return np.mean(f1[i > 0] > f0[i > 0]) + +import numpy as np +import scipy.sparse as sp + +def localization_pvalue_nn_func(x1, f1, f0, nn): + """ + Sparse-safe rewrite of the original localization_pvalue_nn_func. + + Preserves: + - i1/obs1/obs2 definitions + - n and o as neighborhood (weighted) counts / signed balance + - f2 weighting for global mu_hat, p1, p0 + - effective_n, sd_hat + - per-cell p via normal_sf(o[i], mu_hat, sd_hat[i]) + + Works for: + - nn sparse CSR/CSC/COO + - nn dense numpy array + """ + x1 = np.asarray(x1) + f1 = np.asarray(f1) + f0 = np.asarray(f0) + + # boolean masks + i1 = (x1 > 0) + obs1 = ((f1 > f0) & i1) + obs2 = ((f1 < f0) & i1) + + # convert to float vectors for dot products + i1_f = i1.astype(np.float32) + obs1_f = obs1.astype(np.float32) + obs2_f = obs2.astype(np.float32) + + # neighborhood weighted counts + if sp.issparse(nn): + # ensure CSR for fast dot + nn = nn.tocsr() + n = nn.dot(i1_f) + o = nn.dot(obs1_f) - nn.dot(obs2_f) + else: + n = nn @ i1_f + o = (nn @ obs1_f) - (nn @ obs2_f) + + n = np.asarray(n).ravel() + o = np.asarray(o).ravel() + + # normalize signed balance + o = o / np.clip(n, 1.0, np.inf) + + # same f2 + global mu_hat logic as original + f2 = 2.0 * (f1 * f0) / np.clip(f1 + f0, 1e-12, np.inf) + + denom = float(np.sum(f2)) + if denom <= 0 or not np.isfinite(denom): + # degenerate fallback: no information => p=1 everywhere + return np.ones(shape=(nn.shape[0],), dtype=np.float64) + + p1 = float(np.sum((f1 > f0) * f2) / denom) + p0 = float(np.sum((f1 < f0) * f2) / denom) + mu_hat = p1 - p0 + + effective_n = np.clip(n * (1.0 + 2.0 * np.abs(o / 2.0)), 1.0, np.inf) + sd_hat = np.sqrt(((p1 + p0) - ((p1 - p0) ** 2)) / effective_n) + + # compute p only where n>0 (same behavior as your loop) + p = np.ones(shape=(nn.shape[0],), dtype=np.float64) + idx = np.flatnonzero(n > 0) + + # vectorized normal_sf (your normal_sf is scalar-numba; keep loop or vectorize in numpy) + # We'll keep the loop for correctness with your numba normal_sf signature. + for i in idx: + p[i] = normal_sf(float(o[i]), mu=mu_hat, sigma=float(sd_hat[i])) + + return p + + +@numba.njit +def normal_sf(x, mu, sigma): + """ + Normal survival function SF = 1 - CDF, numba-jitted. + """ + z = (x - mu) / sigma + return 0.5 * math.erfc(z * 0.7071067811865476) # = 1 - Phi(z) + + +def _logsidak_from_logp(logp_min: float, m_eff: int) -> float: + """ + Sidák combine in log-space: + log p_sidák = log(1 - (1 - p_min)^m_eff) + = log(1 - exp(m_eff * log(1 - p_min))) + with log(1 - p_min) = log1p(-exp(logp_min)). + """ + if m_eff <= 1: + return float(logp_min) + l1mp = np.log1p(-np.exp(logp_min)) + return float(np.log1p(-np.exp(m_eff * l1mp))) + + +@numba.njit +def smooth_qvals(x): + return np.where( + x <= 0.99, + x, + 1 - 0.01 / (1 + 100 * (x - 0.99) + 10000 * (x - 0.99) ** 2), + ) + + +def _safe_p(p, eps=1e-15): + if not np.isfinite(p): + return 1.0 - eps + return float(min(max(p, eps), 1.0 - eps)) + + +def cauchy_combine(pvals, weights=None): + """ + Robust p-value combiner for dependent tests (Cauchy combination). + Liu & Xie (2020). + """ + ps = np.array([_safe_p(p) for p in pvals], dtype=np.float64) + if weights is None: + weights = np.ones_like(ps) + w = np.asarray(weights, dtype=np.float64) + w = w / (w.sum() if w.sum() > 0 else 1.0) + + t = np.sum(w * np.tan((0.5 - ps) * np.pi)) + pc = 0.5 - np.arctan(t) / np.pi + return _safe_p(pc) + + +def summarize_rc_debug(cs_res, top=8): + """ + Convenience helper to inspect per-threshold diagnostics from + localization_pval_dep_scan(..., debug=True). + """ + if "per_c" not in cs_res: + print("No per-threshold diagnostics captured. Run with debug=True.") + return + rows = cs_res["per_c"] + if not rows: + print("No thresholds scanned.") + return + from collections import Counter + + reasons = Counter(r.get("reason") for r in rows) + print("Reason counts:", dict(reasons)) + + cand = [r for r in rows if r.get("reason") is not None] + cand.sort(key=lambda r: r["n_eff_expected"], reverse=True) + print(f"\nTop {min(top, len(cand))} failing thresholds by n_eff_expected:") + for r in cand[:top]: + print( + f" c={r['c']:.3f} p0_abs={r['p0_abs']:.3f} " + f"obs_prop={r['obs_prop']:.3f} n_eff_expected={r['n_eff_expected']:.1f} " + f"reason={r['reason']}" + ) From abd2923abe5c12a1f3f556532c8a25be87371f3f Mon Sep 17 00:00:00 2001 From: weslewis Date: Sat, 28 Feb 2026 16:34:58 -0500 Subject: [PATCH 2/7] Set gmm_scan_new default rc_weight_mode to binary --- locat/locat_condensed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locat/locat_condensed.py b/locat/locat_condensed.py index b3e35df..cbd852f 100755 --- a/locat/locat_condensed.py +++ b/locat/locat_condensed.py @@ -754,7 +754,7 @@ def gmm_scan_new( rc_n_eff_scale=0.5, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes rc_p_floor=1e-12, # this is just model precision, can be ignored rc_rho_bb=0.02, #this is the strength of the beta binomial (0.0 is standard binomial, set at 0.02-0.05 for wider tails) - rc_weight_mode="amount", + rc_weight_mode="binary", rc_eps_rel=0.01, ): if verbose: From 0844d8868a0ffd7d422fe2165196e8088305ffee Mon Sep 17 00:00:00 2001 From: weslewis Date: Sat, 28 Feb 2026 17:39:26 -0500 Subject: [PATCH 3/7] Set default rc_min_abs_deficit to 0.04 --- locat/locat_condensed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locat/locat_condensed.py b/locat/locat_condensed.py index cbd852f..9e91458 100755 --- a/locat/locat_condensed.py +++ b/locat/locat_condensed.py @@ -748,7 +748,7 @@ def gmm_scan_new( rc_c_values=None, # default inside method rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated - rc_min_abs_deficit=0.02, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region + rc_min_abs_deficit=0.04, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region rc_n_trials_cap=500, #maximum effective sample size rc_soft_bound=1.0, #this is unused/can be removed rc_n_eff_scale=0.5, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes From 8478e84924ddd029a42de4d841714d727f0b7d08 Mon Sep 17 00:00:00 2001 From: weslewis Date: Mon, 2 Mar 2026 11:22:29 -0500 Subject: [PATCH 4/7] Align LOCAT naming with manuscript notation --- locat/locat_condensed.py | 125 ++++++++++++++++------------ locat/locat_condensed_git.py | 123 +++++++++++++++------------ locat/plotting_and_other_methods.py | 5 +- 3 files changed, 141 insertions(+), 112 deletions(-) diff --git a/locat/locat_condensed.py b/locat/locat_condensed.py index 9e91458..82341ea 100755 --- a/locat/locat_condensed.py +++ b/locat/locat_condensed.py @@ -126,7 +126,7 @@ def reg_covar(self, sample_size=None): # Data access / options # ------------------------------------------------------------------ @property - def X(self): + def W(self): if self._X is None: X = self._adata.X if not isinstance(X, np.ndarray): @@ -134,6 +134,11 @@ def X(self): self._X = X.astype(self._dtype, copy=False) return self._X + @property + def X(self): + # Backward-compatible alias for previous property name. + return self.W + def show_progress(self, show_progress=True): self._disable_progress_info = not show_progress @@ -154,7 +159,7 @@ def background_n_components_init(self, weights_transform=None, min_points=10, n_ bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 if weights_transform is not None: - Xdense = self.X.copy() + Xdense = self.W.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -388,7 +393,7 @@ def background_pdf( logger.info(f"Using {n_comp} components") if weights_transform is not None: - Xdense = self.X.copy() + Xdense = self.W.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -529,12 +534,12 @@ def estimate_null_parameters(self, fractions=None, n_reps=50): # ------------------------------------------------------------------ # Depletion-style localization scan # ------------------------------------------------------------------ - def localization_pval_dep_scan( + def depletion_pval_scan( self, gmm1, gene_prior, *, - c_values=None, + lambda_values=None, soft_bound=None, # default computed from n: max((n-1)/n, 0.99) min_p0_abs=0.10, min_expected=30, @@ -565,14 +570,14 @@ def localization_pval_dep_scan( else: raise ValueError("weight_mode must be 'amount' or 'binary'") - # Kish n_eff with tempering + cap + # Kish n_eff_g with tempering + cap sw = float(np.sum(w_obs)) - n_eff = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) - n_eff *= float(n_eff_scale) + n_eff_g = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) + n_eff_g *= float(n_eff_scale) if n_trials_cap is not None: - n_eff = min(n_eff, float(n_trials_cap)) - n_eff = max(1.0, n_eff) - n_trials = int(round(n_eff)) + n_eff_g = min(n_eff_g, float(n_trials_cap)) + n_eff_g = max(1.0, n_eff_g) + n_trials_eff = int(round(n_eff_g)) f0_x = np.clip(self._background_gmm.pdf(X), 1e-300, np.inf) f1_x = np.clip(gmm1.pdf(X), 1e-300, np.inf) @@ -581,25 +586,25 @@ def localization_pval_dep_scan( w_obs_alpha = w_obs / (sw if sw > 0 else 1.0) w0_alpha = w0 - if c_values is None: - c_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) + if lambda_values is None: + lambda_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) best_logp = None - best = {"c": None, "k_obs": None, "p0": None, "obs_prop": None} + best = {"lambda": None, "k_obs": None, "p0": None, "obs_prop": None} scanned = 0 tested = 0 - per_c = [] if debug else None + per_lambda = [] if debug else None - for c in c_values: - c = float(c) - in_R_mask = f0_x > c * f1_x + for lambda_ in lambda_values: + lambda_ = float(lambda_) + in_R_mask = f0_x > lambda_ * f1_x p0_abs = float(np.sum(w0_alpha * in_R_mask)) reason = None if p0_abs < min_p0_abs: reason = "fail:min_p0_abs" - if n_eff * p0_abs < min_expected: + if n_eff_g * p0_abs < min_expected: reason = "fail:min_expected" if p0_abs > soft_bound: reason = "fail:soft_bound" @@ -608,16 +613,16 @@ def localization_pval_dep_scan( if reason is None: if (p0_abs - obs_prop) < min_abs_deficit: reason = "fail:min_abs_deficit" - elif obs_prop > (p0_abs / c) * (1.0 - float(eps_rel)): + elif obs_prop > (p0_abs / lambda_) * (1.0 - float(eps_rel)): reason = "fail:c_bound" #this checks whether observed f1 density in the region is at an equal or lower proportion than the observed f0 density in the region * c (where c is the contrast). If the region is larger than expectation, reject the gene. if debug: rec = { - "c": c, + "lambda": lambda_, "p0_abs": p0_abs, "obs_prop": obs_prop, - "n_eff": float(n_eff), - "n_eff_expected": float(n_eff * p0_abs), + "n_eff_g": float(n_eff_g), + "n_eff_expected": float(n_eff_g * p0_abs), "reason": reason, } if debug_store_masks: @@ -626,7 +631,7 @@ def localization_pval_dep_scan( if weight_mode == "binary": expr_mask = gp > 0 rec["expr_in_R_count"] = int(np.sum(expr_mask & in_R_mask)) - per_c.append(rec) + per_lambda.append(rec) if reason is not None: continue @@ -634,24 +639,24 @@ def localization_pval_dep_scan( scanned += 1 tested += 1 - k_eff = int(np.rint(obs_prop * n_trials)) + k_eff = int(np.rint(obs_prop * n_trials_eff)) p0_clip = np.clip(p0_abs, 1e-12, 1 - 1e-12) if rho_bb and rho_bb > 0.0: ab_sum = max(1.0 / float(rho_bb) - 1.0, 2.0) alpha = float(p0_clip * ab_sum) beta = float((1.0 - p0_clip) * ab_sum) - p_raw = float(betabinom.cdf(k_eff, n_trials, alpha, beta)) + p_raw = float(betabinom.cdf(k_eff, n_trials_eff, alpha, beta)) logp_raw = np.log(max(p_raw, np.finfo(float).tiny)) else: - lFkm1 = binom.logcdf(k_eff - 1, n_trials, p0_clip) - lpk = binom.logpmf(k_eff, n_trials, p0_clip) + np.log(0.5) + lFkm1 = binom.logcdf(k_eff - 1, n_trials_eff, p0_clip) + lpk = binom.logpmf(k_eff, n_trials_eff, p0_clip) + np.log(0.5) logp_raw = logsumexp([lFkm1, lpk]) if (best_logp is None) or (logp_raw < best_logp): best_logp = float(logp_raw) best.update( - {"c": c, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} + {"lambda": lambda_, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} ) if tested == 0: @@ -662,7 +667,7 @@ def localization_pval_dep_scan( "log_p_sidak": 0.0, "neglog10_p_single": 0.0, "neglog10_p_sidak": 0.0, - "best_c": None, + "best_lambda": None, "k_obs_eff": None, "p0_abs": None, "obs_prop": None, @@ -670,7 +675,8 @@ def localization_pval_dep_scan( "tested": int(tested), "sidak_penalty": 1, "n": n, - "n_eff": float(n_eff), + "n_eff_g": float(n_eff_g), + "n_trials_eff": int(n_trials_eff), "guards": { "min_p0_abs": float(min_p0_abs), "min_expected": float(min_expected), @@ -682,7 +688,7 @@ def localization_pval_dep_scan( "rho_bb": float(rho_bb), } if debug: - out["per_c"] = per_c + out["per_lambda"] = per_lambda return out m_eff = tested @@ -700,7 +706,7 @@ def localization_pval_dep_scan( "log_p_sidak": float(sidak_logp), "neglog10_p_single": float(-best_logp / np.log(10)), "neglog10_p_sidak": float(-sidak_logp / np.log(10)), - "best_c": best["c"], + "best_lambda": best["lambda"], "k_obs_eff": best["k_obs"], "p0_abs": best["p0"], "obs_prop": best["obs_prop"], @@ -708,7 +714,8 @@ def localization_pval_dep_scan( "tested": int(tested), "sidak_penalty": 1, "n": n, - "n_eff": float(n_eff), + "n_eff_g": float(n_eff_g), + "n_trials_eff": int(n_trials_eff), "guards": { "min_p0_abs": float(min_p0_abs), "min_expected": float(min_expected), @@ -720,9 +727,13 @@ def localization_pval_dep_scan( "rho_bb": float(rho_bb), } if debug: - out["per_c"] = per_c + out["per_lambda"] = per_lambda return out + # Backward-compatible alias + def localization_pval_dep_scan(self, *args, **kwargs): + return self.depletion_pval_scan(*args, **kwargs) + # ------------------------------------------------------------------ # Main scan used in practice # ------------------------------------------------------------------ @@ -745,7 +756,7 @@ def gmm_scan_new( verbose=False, n_bootstrap_inits=None, # Depletion-scan defaults - rc_c_values=None, # default inside method + rc_lambda_values=None, # default inside method rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated rc_min_abs_deficit=0.04, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region @@ -758,7 +769,7 @@ def gmm_scan_new( rc_eps_rel=0.01, ): if verbose: - print("gmm_scan_new: using depletion scan for localization_pval (localization_pval_dep_scan)") + print("gmm_scan_new: using depletion scan for depletion_pval (depletion_pval_scan)") if n_bootstrap_inits is not None: self.n_bootstrap_inits = int(n_bootstrap_inits) @@ -810,13 +821,13 @@ def gmm_scan_new( zzeros.append(self._adata.var_names[i_gene]) continue - cs_res = self.localization_pval_dep_scan( + cs_res = self.depletion_pval_scan( gmm1, gene_prior, debug=True, - c_values=( - rc_c_values - if rc_c_values is not None + lambda_values=( + rc_lambda_values + if rc_lambda_values is not None else np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) ), soft_bound=rc_soft_bound, @@ -830,15 +841,15 @@ def gmm_scan_new( rho_bb=rc_rho_bb, eps_rel=rc_eps_rel, ) - localization_pval = _safe_p(cs_res["p_value"]) + depletion_pval = _safe_p(cs_res["p_value"]) concentration_pval = _safe_p(float(normal_sf(zscore, 0.0, 1.0))) - p_cauchy = cauchy_combine([localization_pval, concentration_pval]) - p_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) - p_sens = _safe_p(1.0 - (sens_score + 1e-9)) - p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * p_size) * ( - 1.0 - 0.12 * p_sens + p_cauchy = cauchy_combine([depletion_pval, concentration_pval]) + h_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) + h_sens = _safe_p(1.0 - (sens_score + 1e-9)) + p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * h_size) * ( + 1.0 - 0.12 * h_sens ) p_final = float(smooth_qvals(np.array([_safe_p(p_final)]))[0]) @@ -846,11 +857,15 @@ def gmm_scan_new( "bic": self.bic_score(gmm1, gene_prior), "zscore": zscore, "sens_score": sens_score, - "localization_pval": localization_pval, + "depletion_pval": depletion_pval, + "localization_pval": depletion_pval, # backward-compatible alias "concentration_pval": concentration_pval, + "h_size": h_size, + "h_sens": h_sens, "pval": p_final, - "n_components": n_comp, + "K_components": n_comp, "sample_size": sample_size, + "depletion_scan": cs_res, "depl_scan": cs_res, } @@ -873,7 +888,7 @@ def get_genes_indices(self, genes): return inclgenes def get_gene_prior(self, i_gene, weights_transform): - gene_prior = self.X[:, i_gene] + gene_prior = self.W[:, i_gene] if weights_transform is not None: gene_prior = weights_transform(gene_prior) return gene_prior @@ -1157,7 +1172,7 @@ def gmm_local_scan(self, 'localization_pval': localization_pval, 'concentration_pval': concentration_pval, 'local_pvalue': localization_pval + concentration_pval - localization_pval*concentration_pval, - 'n_components': n_comp + 'K_components': n_comp } except Exception as e: logger.exception(e) # prints full traceback @@ -1378,12 +1393,12 @@ def cauchy_combine(pvals, weights=None): def summarize_rc_debug(cs_res, top=8): """ Convenience helper to inspect per-threshold diagnostics from - localization_pval_dep_scan(..., debug=True). + depletion_pval_scan(..., debug=True). """ - if "per_c" not in cs_res: + if "per_lambda" not in cs_res: print("No per-threshold diagnostics captured. Run with debug=True.") return - rows = cs_res["per_c"] + rows = cs_res["per_lambda"] if not rows: print("No thresholds scanned.") return @@ -1397,7 +1412,7 @@ def summarize_rc_debug(cs_res, top=8): print(f"\nTop {min(top, len(cand))} failing thresholds by n_eff_expected:") for r in cand[:top]: print( - f" c={r['c']:.3f} p0_abs={r['p0_abs']:.3f} " + f" lambda={r['lambda']:.3f} p0_abs={r['p0_abs']:.3f} " f"obs_prop={r['obs_prop']:.3f} n_eff_expected={r['n_eff_expected']:.1f} " f"reason={r['reason']}" ) diff --git a/locat/locat_condensed_git.py b/locat/locat_condensed_git.py index c4da29d..ed09282 100644 --- a/locat/locat_condensed_git.py +++ b/locat/locat_condensed_git.py @@ -121,7 +121,7 @@ def reg_covar(self, sample_size=None): # Data access / options # ------------------------------------------------------------------ @property - def X(self): + def W(self): if self._X is None: X = self._adata.X if not isinstance(X, np.ndarray): @@ -129,6 +129,11 @@ def X(self): self._X = X.astype(self._dtype, copy=False) return self._X + @property + def X(self): + # Backward-compatible alias for previous property name. + return self.W + def show_progress(self, show_progress=True): self._disable_progress_info = not show_progress @@ -149,7 +154,7 @@ def background_n_components_init(self, weights_transform=None, min_points=10, n_ bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 if weights_transform is not None: - Xdense = self.X.copy() + Xdense = self.W.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -383,7 +388,7 @@ def background_pdf( logger.info(f"Using {n_comp} components") if weights_transform is not None: - Xdense = self.X.copy() + Xdense = self.W.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -524,12 +529,12 @@ def estimate_null_parameters(self, fractions=None, n_reps=50): # ------------------------------------------------------------------ # Depletion-style localization scan # ------------------------------------------------------------------ - def localization_pval_dep_scan( + def depletion_pval_scan( self, gmm1, gene_prior, *, - c_values=None, + lambda_values=None, soft_bound=None, # default computed from n: max((n-1)/n, 0.99) min_p0_abs=0.10, min_expected=30, @@ -560,14 +565,14 @@ def localization_pval_dep_scan( else: raise ValueError("weight_mode must be 'amount' or 'binary'") - # Kish n_eff with tempering + cap + # Kish n_eff_g with tempering + cap sw = float(np.sum(w_obs)) - n_eff = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) - n_eff *= float(n_eff_scale) + n_eff_g = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) + n_eff_g *= float(n_eff_scale) if n_trials_cap is not None: - n_eff = min(n_eff, float(n_trials_cap)) - n_eff = max(1.0, n_eff) - n_trials = int(round(n_eff)) + n_eff_g = min(n_eff_g, float(n_trials_cap)) + n_eff_g = max(1.0, n_eff_g) + n_trials_eff = int(round(n_eff_g)) f0_x = np.clip(self._background_gmm.pdf(X), 1e-300, np.inf) f1_x = np.clip(gmm1.pdf(X), 1e-300, np.inf) @@ -576,25 +581,25 @@ def localization_pval_dep_scan( w_obs_alpha = w_obs / (sw if sw > 0 else 1.0) w0_alpha = w0 - if c_values is None: - c_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) + if lambda_values is None: + lambda_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) best_logp = None - best = {"c": None, "k_obs": None, "p0": None, "obs_prop": None} + best = {"lambda": None, "k_obs": None, "p0": None, "obs_prop": None} scanned = 0 tested = 0 - per_c = [] if debug else None + per_lambda = [] if debug else None - for c in c_values: - c = float(c) - in_R_mask = f0_x > c * f1_x + for lambda_ in lambda_values: + lambda_ = float(lambda_) + in_R_mask = f0_x > lambda_ * f1_x p0_abs = float(np.sum(w0_alpha * in_R_mask)) reason = None if p0_abs < min_p0_abs: reason = "fail:min_p0_abs" - if n_eff * p0_abs < min_expected: + if n_eff_g * p0_abs < min_expected: reason = "fail:min_expected" if p0_abs > soft_bound: reason = "fail:soft_bound" @@ -603,16 +608,16 @@ def localization_pval_dep_scan( if reason is None: if (p0_abs - obs_prop) < min_abs_deficit: reason = "fail:min_abs_deficit" - elif obs_prop > (p0_abs / c) * (1.0 - float(eps_rel)): + elif obs_prop > (p0_abs / lambda_) * (1.0 - float(eps_rel)): reason = "fail:c_bound" #this checks whether observed f1 density in the region is at an equal or lower proportion than the observed f0 density in the region * c (where c is the contrast). If the region is larger than expectation, reject the gene. if debug: rec = { - "c": c, + "lambda": lambda_, "p0_abs": p0_abs, "obs_prop": obs_prop, - "n_eff": float(n_eff), - "n_eff_expected": float(n_eff * p0_abs), + "n_eff_g": float(n_eff_g), + "n_eff_expected": float(n_eff_g * p0_abs), "reason": reason, } if debug_store_masks: @@ -621,7 +626,7 @@ def localization_pval_dep_scan( if weight_mode == "binary": expr_mask = gp > 0 rec["expr_in_R_count"] = int(np.sum(expr_mask & in_R_mask)) - per_c.append(rec) + per_lambda.append(rec) if reason is not None: continue @@ -629,24 +634,24 @@ def localization_pval_dep_scan( scanned += 1 tested += 1 - k_eff = int(np.rint(obs_prop * n_trials)) + k_eff = int(np.rint(obs_prop * n_trials_eff)) p0_clip = np.clip(p0_abs, 1e-12, 1 - 1e-12) if rho_bb and rho_bb > 0.0: ab_sum = max(1.0 / float(rho_bb) - 1.0, 2.0) alpha = float(p0_clip * ab_sum) beta = float((1.0 - p0_clip) * ab_sum) - p_raw = float(betabinom.cdf(k_eff, n_trials, alpha, beta)) + p_raw = float(betabinom.cdf(k_eff, n_trials_eff, alpha, beta)) logp_raw = np.log(max(p_raw, np.finfo(float).tiny)) else: - lFkm1 = binom.logcdf(k_eff - 1, n_trials, p0_clip) - lpk = binom.logpmf(k_eff, n_trials, p0_clip) + np.log(0.5) + lFkm1 = binom.logcdf(k_eff - 1, n_trials_eff, p0_clip) + lpk = binom.logpmf(k_eff, n_trials_eff, p0_clip) + np.log(0.5) logp_raw = logsumexp([lFkm1, lpk]) if (best_logp is None) or (logp_raw < best_logp): best_logp = float(logp_raw) best.update( - {"c": c, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} + {"lambda": lambda_, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} ) if tested == 0: @@ -657,7 +662,7 @@ def localization_pval_dep_scan( "log_p_sidak": 0.0, "neglog10_p_single": 0.0, "neglog10_p_sidak": 0.0, - "best_c": None, + "best_lambda": None, "k_obs_eff": None, "p0_abs": None, "obs_prop": None, @@ -665,7 +670,8 @@ def localization_pval_dep_scan( "tested": int(tested), "sidak_penalty": 1, "n": n, - "n_eff": float(n_eff), + "n_eff_g": float(n_eff_g), + "n_trials_eff": int(n_trials_eff), "guards": { "min_p0_abs": float(min_p0_abs), "min_expected": float(min_expected), @@ -677,7 +683,7 @@ def localization_pval_dep_scan( "rho_bb": float(rho_bb), } if debug: - out["per_c"] = per_c + out["per_lambda"] = per_lambda return out m_eff = tested @@ -695,7 +701,7 @@ def localization_pval_dep_scan( "log_p_sidak": float(sidak_logp), "neglog10_p_single": float(-best_logp / np.log(10)), "neglog10_p_sidak": float(-sidak_logp / np.log(10)), - "best_c": best["c"], + "best_lambda": best["lambda"], "k_obs_eff": best["k_obs"], "p0_abs": best["p0"], "obs_prop": best["obs_prop"], @@ -703,7 +709,8 @@ def localization_pval_dep_scan( "tested": int(tested), "sidak_penalty": 1, "n": n, - "n_eff": float(n_eff), + "n_eff_g": float(n_eff_g), + "n_trials_eff": int(n_trials_eff), "guards": { "min_p0_abs": float(min_p0_abs), "min_expected": float(min_expected), @@ -715,9 +722,13 @@ def localization_pval_dep_scan( "rho_bb": float(rho_bb), } if debug: - out["per_c"] = per_c + out["per_lambda"] = per_lambda return out + # Backward-compatible alias + def localization_pval_dep_scan(self, *args, **kwargs): + return self.depletion_pval_scan(*args, **kwargs) + # ------------------------------------------------------------------ # Main scan used in practice # ------------------------------------------------------------------ @@ -740,7 +751,7 @@ def gmm_scan_new( verbose=False, n_bootstrap_inits=None, # Depletion-scan defaults - rc_c_values=None, # default inside method + rc_lambda_values=None, # default inside method rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated rc_min_abs_deficit=0.02, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region @@ -753,7 +764,7 @@ def gmm_scan_new( rc_eps_rel=0.01, ): if verbose: - print("gmm_scan_new: using depletion scan for localization_pval (localization_pval_dep_scan)") + print("gmm_scan_new: using depletion scan for depletion_pval (depletion_pval_scan)") if n_bootstrap_inits is not None: self.n_bootstrap_inits = int(n_bootstrap_inits) @@ -805,13 +816,13 @@ def gmm_scan_new( zzeros.append(self._adata.var_names[i_gene]) continue - cs_res = self.localization_pval_dep_scan( + cs_res = self.depletion_pval_scan( gmm1, gene_prior, debug=True, - c_values=( - rc_c_values - if rc_c_values is not None + lambda_values=( + rc_lambda_values + if rc_lambda_values is not None else np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) ), soft_bound=rc_soft_bound, @@ -825,28 +836,32 @@ def gmm_scan_new( rho_bb=rc_rho_bb, eps_rel=rc_eps_rel, ) - localization_pval = _safe_p(cs_res["p_value"]) + depletion_pval = _safe_p(cs_res["p_value"]) concentration_pval = _safe_p(float(normal_sf(zscore, 0.0, 1.0))) - p_cauchy = cauchy_combine([localization_pval, concentration_pval]) - p_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) + p_cauchy = cauchy_combine([depletion_pval, concentration_pval]) + h_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) # match notebook behavior: clip sens then use (1 - sens) (sens_power==1.0) sens = np.clip(sens_score, 1e-6, 1.0) - p_sens = _safe_p(1.0 - sens) + h_sens = _safe_p(1.0 - sens) # use notebook weights (w_size=0.05, w_sens=0.12) - p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * p_size) * (1.0 - 0.12 * p_sens) + p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * h_size) * (1.0 - 0.12 * h_sens) p_final = float(smooth_qvals(np.array([_safe_p(p_final)]))[0]) locally_enriched[self._adata.var_names[i_gene]] = { "bic": self.bic_score(gmm1, gene_prior), "zscore": zscore, "sens_score": sens_score, - "localization_pval": localization_pval, + "depletion_pval": depletion_pval, + "localization_pval": depletion_pval, # backward-compatible alias "concentration_pval": concentration_pval, + "h_size": h_size, + "h_sens": h_sens, "pval": p_final, - "n_components": n_comp, + "K_components": n_comp, "sample_size": sample_size, + "depletion_scan": cs_res, "depl_scan": cs_res, } @@ -869,7 +884,7 @@ def get_genes_indices(self, genes): return inclgenes def get_gene_prior(self, i_gene, weights_transform): - gene_prior = self.X[:, i_gene] + gene_prior = self.W[:, i_gene] if weights_transform is not None: gene_prior = weights_transform(gene_prior) return gene_prior @@ -1153,7 +1168,7 @@ def gmm_local_scan(self, 'localization_pval': localization_pval, 'concentration_pval': concentration_pval, 'local_pvalue': localization_pval + concentration_pval - localization_pval*concentration_pval, - 'n_components': n_comp + 'K_components': n_comp } except Exception as e: logger.exception(e) # prints full traceback @@ -1374,12 +1389,12 @@ def cauchy_combine(pvals, weights=None): def summarize_rc_debug(cs_res, top=8): """ Convenience helper to inspect per-threshold diagnostics from - localization_pval_dep_scan(..., debug=True). + depletion_pval_scan(..., debug=True). """ - if "per_c" not in cs_res: + if "per_lambda" not in cs_res: print("No per-threshold diagnostics captured. Run with debug=True.") return - rows = cs_res["per_c"] + rows = cs_res["per_lambda"] if not rows: print("No thresholds scanned.") return @@ -1393,7 +1408,7 @@ def summarize_rc_debug(cs_res, top=8): print(f"\nTop {min(top, len(cand))} failing thresholds by n_eff_expected:") for r in cand[:top]: print( - f" c={r['c']:.3f} p0_abs={r['p0_abs']:.3f} " + f" lambda={r['lambda']:.3f} p0_abs={r['p0_abs']:.3f} " f"obs_prop={r['obs_prop']:.3f} n_eff_expected={r['n_eff_expected']:.1f} " f"reason={r['reason']}" ) diff --git a/locat/plotting_and_other_methods.py b/locat/plotting_and_other_methods.py index 35e4ff6..e92560b 100755 --- a/locat/plotting_and_other_methods.py +++ b/locat/plotting_and_other_methods.py @@ -267,7 +267,7 @@ def plot_gene_localization_summary( f"{gene}\nLocalized: {loc_mask.sum()} | " f"Unloc: {unloc_mask.sum()} | " f"ExpUnloc: {rec.get('expected_unlocalized', np.nan):.1f}\n" - f"LocPval: {rec['localization_pval']:.2e}" + f"DepPval: {rec['depletion_pval']:.2e}" ), fontsize=11 ) @@ -347,7 +347,7 @@ def plotgenes( f"{gene}\n" f"pval: {d0.loc[gene]['pval']:.2e}\n" f"conc_pval: {d0.loc[gene]['concentration_pval']:.2e}\n" - f"loca_pval: {d0.loc[gene]['localization_pval']:.2e}", + f"dep_pval: {d0.loc[gene]['depletion_pval']:.2e}", fontsize=text_size, pad=10 ) @@ -380,4 +380,3 @@ def plotgenes( plt.tight_layout() fig.subplots_adjust(hspace=0.4) plt.show() - From 41b1e83a587199afb6849374c651a368b7ae019b Mon Sep 17 00:00:00 2001 From: weslewis Date: Mon, 2 Mar 2026 11:47:59 -0500 Subject: [PATCH 5/7] Drop locat_condensed_git.py from PR; keep upstream file set --- locat/locat_condensed_git.py | 1414 ---------------------------------- 1 file changed, 1414 deletions(-) delete mode 100644 locat/locat_condensed_git.py diff --git a/locat/locat_condensed_git.py b/locat/locat_condensed_git.py deleted file mode 100644 index ed09282..0000000 --- a/locat/locat_condensed_git.py +++ /dev/null @@ -1,1414 +0,0 @@ -import math - -import numba -import numpy as np -from loguru import logger -from scanpy import AnnData -from scipy.interpolate import PchipInterpolator -from scipy.special import logsumexp -from scipy.stats import binom, betabinom, chi2 -from sklearn.metrics.pairwise import euclidean_distances as edist -from tqdm import tqdm -from sklearn.neighbors import NearestNeighbors -from scipy.stats import norm -from locat.wgmm import WGMM -from locat.wgmms import wgmm -from locat.rgmm import softbootstrap_gmm - -class LOCATNullDistribution: - mean = None - std = None - - def __init__(self, mean_func, std_func): - self.mean = mean_func - self.std = std_func - - def to_zscore(self, raw_score, p): - return (raw_score - self.mean(p)) / self.std(p) - - @classmethod - def from_estimates(cls, p, means, stds): - return cls( - mean_func=PchipInterpolator(p, means), - std_func=PchipInterpolator(p, stds), - ) - - -class LOCAT: - _cell_dist = None - _min_dist = None - _knn = None - _background_gmm: WGMM = None - _background_pdf: np.ndarray = None - _background_logpdf: np.ndarray = None - _null_distribution: LOCATNullDistribution = None - _X = None - _n_components_waypoints = None - _disable_progress_info = True - - def __init__( - self, - adata: AnnData, - cell_embedding: np.ndarray, - k: int, - n_bootstrap_inits: int, - show_progress=False, - wgmm_dtype: str = "same", # "same" | "float32" | "float64" - knn=None, # <-- NEW: precomputed adjacency/connectivities - knn_k: int | None = None, # <-- NEW: if computing, how many neighbors (defaults to k) - knn_mode: str = "binary", # <-- NEW: "binary" or "connectivity" - ): - self._disable_progress_info = not show_progress - self._adata = adata - - emb = np.asarray(cell_embedding) - emb = (emb - emb.mean(0)) / (emb.std(0) + emb.dtype.type(1e-6)) - self._embedding = emb - self._dtype = self._embedding.dtype - self._wgmm_dtype = wgmm_dtype - - self._k = k - self.n_cells, self.n_genes = self._adata.shape - self.n_dims = self._embedding.shape[1] - self.n_bootstrap_inits = n_bootstrap_inits - self._knn = None - self._knn_k = int(knn_k) if knn_k is not None else int(k) - self._knn_mode = str(knn_mode) - - self._reg_covar = None - if knn is not None: - self.set_knn(knn) - # ------------------------------------------------------------------ - # Basic geometry / regularization - # ------------------------------------------------------------------ - @property - def cell_dist(self) -> np.ndarray: - if self._cell_dist is None: - if not self._disable_progress_info: - logger.info("recomputing cell-cell distance") - self._cell_dist = edist(self._embedding) - return self._cell_dist - - @property - def min_dist(self) -> float: - if self._min_dist is None: - if not self._disable_progress_info: - logger.info("recomputing min cell-cell distance") - self._min_dist = np.mean( - np.nanmin( - np.where(self.cell_dist > 0, self.cell_dist, np.nan), - axis=1, - ) - ) - return self._min_dist - - def reg_covar(self, sample_size=None): - if self._reg_covar is not None: - return self._dtype.type(self._reg_covar) - - base = (self.min_dist ** (2 / self.n_dims)) / 6 - if sample_size is None: - rc = base - else: - c = 1 - 2 / self.n_dims - adj = np.sqrt((self.n_cells + c) / (sample_size + c)) - rc = base * adj - - rc = max(rc, 1e-4) - return self._dtype.type(rc) - - # ------------------------------------------------------------------ - # Data access / options - # ------------------------------------------------------------------ - @property - def W(self): - if self._X is None: - X = self._adata.X - if not isinstance(X, np.ndarray): - X = X.toarray() - self._X = X.astype(self._dtype, copy=False) - return self._X - - @property - def X(self): - # Backward-compatible alias for previous property name. - return self.W - - def show_progress(self, show_progress=True): - self._disable_progress_info = not show_progress - - # ------------------------------------------------------------------ - # Background GMM and LTST null - # ------------------------------------------------------------------ - def background_n_components_init(self, weights_transform=None, min_points=10, n_reps=30): - if not self._disable_progress_info: - logger.info("Estimating number of GMM components") - - self._n_components_waypoints = np.zeros(shape=(3, 2)) - self._n_components_waypoints[0, 0] = min_points - self._n_components_waypoints[1, 0] = int(np.sqrt(self.n_cells)) - self._n_components_waypoints[2, 0] = self.n_cells - - self._n_components_waypoints[0, 1] = 1 - - bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 - - if weights_transform is not None: - Xdense = self.W.copy() - for i in range(self.n_genes): - Xdense[:, i] = weights_transform(Xdense[:, i]) - weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) - else: - weights = np.ones((self.n_cells,), dtype=self._dtype) - - weights = np.clip(weights, self._dtype.type(1e-6), np.inf).astype( - self._dtype, copy=False - ) - s = float(weights.sum()) - weights = (weights / s) if s > 0 else np.full_like(weights, 1.0 / len(weights)) - weights[~np.isfinite(weights)] = self._dtype.type(1.0 / len(weights)) - - # Full-data BIC search for big-N waypoint - min_search = 1 - max_search = 10 - keep_searching = True - bic = [] - while keep_searching: - for i in tqdm( - range(min_search, max_search), - desc=f"estimating BIC for {self._n_components_waypoints[2, 0]:.0f} cells", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - p = self.fit_wgmm(n_comp=i, weights=weights).pdf(self._embedding) - bic.append( - bic_component_cost * i * np.log(self.n_cells) - - np.sum(2 * np.log(p[p > 0])) - ) - self._n_components_waypoints[2, 1] = np.argmin(bic) + 1 - if self._n_components_waypoints[2, 1] >= len(bic) - 2: - min_search = max_search - max_search = max_search + 5 - else: - keep_searching = False - - # sqrt(n_cells) waypoint - n = np.max([min_points + 1, int(np.ceil(np.sqrt(self.n_cells)))]) - bic = np.zeros(shape=(int(self._n_components_waypoints[2, 1]), n_reps)) - - o = np.arange(self.n_cells) - wgs = np.zeros(shape=(self.n_cells, n_reps)) - for i_rep in range(n_reps): - np.random.shuffle(o) - wgs[o < n, i_rep] = weights[o < n] - wgs = wgs / np.sum(wgs, axis=0) - - for i_rep in tqdm( - range(n_reps), - desc=f"estimating BIC for {self._n_components_waypoints[1, 0]:.0f} cells", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - for i in range(1, bic.shape[0] + 1): - p = self.fit_wgmm(n_comp=i, weights=wgs[:, i_rep]).pdf(self._embedding) - bic[i - 1, i_rep] = ( - bic_component_cost * i * np.log(n) - np.sum(2 * np.log(p[p > 0])) - ) - self._n_components_waypoints[1, 1] = np.argmin(np.median(bic, axis=1)) + 1 - - def auto_bkg_components(self, n_points, weights_transform=None): - if self._n_components_waypoints is None: - self.background_n_components_init(weights_transform=weights_transform) - - n_components = np.interp( - np.log(n_points), - xp=np.log(self._n_components_waypoints[:, 0]), - fp=np.log(self._n_components_waypoints[:, 1]), - ) - return int(np.round(np.exp(n_components))) - - def _auto_n_effective_weights(self, weights=None, min_dist_cutoff=None): - if min_dist_cutoff is None: - min_dist_cutoff = self.min_dist * 3 - - rweights = np.zeros(shape=(self.n_cells,)) - if weights is None: - wg0 = rweights == 0 - else: - wg0 = weights > 0 - - if np.sum(wg0) > 5: - # only compute the components to describe the cells that are close enough - cell_include = ( - np.min(self.cell_dist[wg0, :][:, wg0] + np.eye(np.sum(wg0)) * 10, axis=1) - < min_dist_cutoff - ) - rweights[wg0] = cell_include.astype(float) - return rweights - return None - - def auto_n_components(self, coords, weights=None, indices=None, min_points_fraction=0.95): - if weights is None: - weights = np.ones(shape=(coords.shape[0],)) - - if indices is None: - if coords.shape[0] != self.n_cells: - raise ValueError("A selection was made but the indices were not passed") - indices = np.full_like(weights, fill_value=True) - - logpdf = self._background_logpdf[indices, :] * weights[:, None] - id_component = np.argmax(logpdf, axis=1) - - component_counts = np.bincount( - id_component[weights > 0], minlength=self._background_gmm.n_comp - ) - n_counts = np.sum(component_counts) - component_counts[component_counts < 5] = 0 - if np.sum(component_counts) < 10: - return 1 - else: - component_counts = component_counts / np.sum(component_counts) - estimated_n_components = ( - np.min( - np.flatnonzero( - np.cumsum(np.sort(component_counts)[::-1]) > min_points_fraction - ) - ) - + 1 - ) - estimated_n_components = int(np.floor(np.sqrt(estimated_n_components + 1)) + 1) - return int( - np.max( - [ - 1, - np.min( - [ - self._background_gmm.n_comp, - n_counts / 5, - estimated_n_components, - ] - ), - ] - ) - ) - def set_knn(self, knn): - """ - Store a precomputed KNN graph. - - Accepts: - - scipy sparse (csr/csc/coo) adjacency or connectivities - - dense numpy array adjacency/connectivities - - Expected shape: (n_cells, n_cells) - """ - if sp.issparse(knn): - K = knn.tocsr() - else: - K = np.asarray(knn) - - if K.shape != (self.n_cells, self.n_cells): - raise ValueError(f"knn must have shape {(self.n_cells, self.n_cells)}, got {K.shape}") - - # zero diagonal to avoid self-neighbor artifacts - if sp.issparse(K): - K = K.tolil() - K.setdiag(0.0) - K = K.tocsr() - K.eliminate_zeros() - else: - np.fill_diagonal(K, 0.0) - - self._knn = K - - - def knn(self): - """ - Return a KNN adjacency/connectivity matrix. - - If a KNN was provided at init (or via set_knn), returns it. - Otherwise computes one from the embedding. - """ - if self._knn is not None: - return self._knn - - k = int(self._knn_k) - if k <= 0: - raise ValueError("knn_k must be >= 1") - - # Build kNN from embedding (exclude self by taking k+1 then dropping self) - nbrs = NearestNeighbors(n_neighbors=min(k + 1, self.n_cells), metric="euclidean") - nbrs.fit(self._embedding) - dists, inds = nbrs.kneighbors(self._embedding, return_distance=True) - - # Drop self neighbor (usually first column) - inds = inds[:, 1: k + 1] - - rows = np.repeat(np.arange(self.n_cells), inds.shape[1]) - cols = inds.reshape(-1) - - if self._knn_mode == "binary": - data = np.ones_like(cols, dtype=self._dtype) - elif self._knn_mode == "connectivity": - # simple distance-based weights; you can swap for something else - dd = dists[:, 1: k + 1].reshape(-1) - data = (1.0 / (dd + 1e-6)).astype(self._dtype, copy=False) - else: - raise ValueError("knn_mode must be 'binary' or 'connectivity'") - - K = sp.csr_matrix((data, (rows, cols)), shape=(self.n_cells, self.n_cells)) - K.eliminate_zeros() - self._knn = K - return self._knn - - - def background_pdf( - self, - n_comp=None, - reps=10, - total_counts_weight=True, - weights_transform=None, - force_refresh=False, - ): - if (self._background_pdf is None) or force_refresh: - if not self._disable_progress_info: - logger.info("fitting background PDF") - - if n_comp is None: - if self._n_components_waypoints is None: - self.background_n_components_init(weights_transform=weights_transform) - n_comp = np.interp( - np.log(self.n_cells), - xp=np.log(self._n_components_waypoints[:, 0]), - fp=np.log(self._n_components_waypoints[:, 1]), - ) - n_comp = int(np.round(np.exp(n_comp))) - if not self._disable_progress_info: - logger.info(f"Using {n_comp} components") - - if weights_transform is not None: - Xdense = self.W.copy() - for i in range(self.n_genes): - Xdense[:, i] = weights_transform(Xdense[:, i]) - weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) - else: - weights = np.ones((self.n_cells,), dtype=self._dtype) - - weights = np.clip(weights, self._dtype.type(1e-6), np.inf).astype( - self._dtype, copy=False - ) - s = float(weights.sum()) - weights = (weights / s) if s > 0 else np.full_like(weights, 1.0 / len(weights)) - weights[~np.isfinite(weights)] = self._dtype.type(1.0 / len(weights)) - - self._background_pdf = np.zeros(shape=(self.n_cells,)) - self._background_logpdf = np.zeros(shape=(self.n_cells, n_comp)) - - background_gmm = None - for _ in tqdm( - range(reps), - desc="fitting background", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - background_gmm = self.fit_wgmm(n_comp, weights=weights) - self._background_pdf += background_gmm.pdf(self._embedding) - self._background_logpdf += background_gmm.loglikelihood_by_component( - self._embedding, np.ones(shape=self.n_cells) - ) - self._background_gmm = background_gmm - self._background_pdf /= reps - self._background_logpdf /= reps - - self.estimate_null_parameters() - return self._background_pdf - - def signal_gmm(self, weights, n_comp=None): - if n_comp is None: - comp_weights = self._auto_n_effective_weights(weights) - if comp_weights is None: - n_comp = 1 - else: - loc_indices = comp_weights > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - weights[loc_indices], - indices=loc_indices, - ) - - return self.fit_wgmm(n_comp, weights=weights) - - def fit_wgmm(self, n_comp, weights=None) -> WGMM: - if weights is None: - weights = np.ones(shape=(self.n_cells,)) - - pis, mus, sigmas, _ = wgmm( - self._embedding, - raw_weights=weights, - n_components=n_comp, - n_inits=1, - reg_covar=self.reg_covar(), - ) - pis = np.array(pis) - mus = np.array(mus) - sigmas = np.array(sigmas) - return WGMM(pis, mus, sigmas) - - def estimate_null_parameters(self, fractions=None, n_reps=50): - """ - Estimate LTST null mean/std as a function of expression fraction p - using random pseudo-genes: - - pick random expressing cells at frequency p - - fit signal GMM with the *same* pipeline as real genes - - compute LTST exactly as in gmm_scan_new - """ - if fractions is None: - fractions = 10 ** np.linspace(np.log10(10 / self.n_cells), 0, 7) - - f0 = self.background_pdf() - scores = [] - - for frac in tqdm( - fractions, - desc="null distribution parameters (perm. pseudo-genes)", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - n_pos = max(5, int(round(frac * self.n_cells))) - if n_pos >= self.n_cells: - n_pos = self.n_cells - 1 - - ltst_vals = [] - - for _ in range(n_reps): - mask = np.zeros(self.n_cells, dtype=bool) - mask[np.random.choice(self.n_cells, n_pos, replace=False)] = True - gene_prior = mask.astype(self._dtype) - - comp_gene_prior = self._auto_n_effective_weights(gene_prior) - if comp_gene_prior is None: - n_comp = 1 - else: - loc_indices = comp_gene_prior > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - gene_prior[loc_indices], - loc_indices, - ) - gmm1 = self.signal_gmm(weights=gene_prior, n_comp=n_comp) - - i1 = gene_prior > 0 - loc_f1 = gmm1.pdf(self._embedding[i1, :]) - p1 = float(np.mean(i1)) - - ltst_score = ltst_score_func(f0[i1], loc_f1, p1) - - w_expr = gene_prior[i1] - w_expr = w_expr / (w_expr.sum() if w_expr.sum() > 0 else 1.0) - ltst_vals.append(float(np.dot(w_expr, ltst_score))) - - ltst_vals = np.asarray(ltst_vals, dtype=float) - scores.append( - [ - frac, - float(np.mean(ltst_vals)), - float(np.std(ltst_vals) + 1e-9), - ] - ) - - scores = np.asarray(scores) - self._null_distribution = LOCATNullDistribution.from_estimates( - p=scores[:, 0], - means=scores[:, 1], - stds=scores[:, 2], - ) - - # ------------------------------------------------------------------ - # Depletion-style localization scan - # ------------------------------------------------------------------ - def depletion_pval_scan( - self, - gmm1, - gene_prior, - *, - lambda_values=None, - soft_bound=None, # default computed from n: max((n-1)/n, 0.99) - min_p0_abs=0.10, - min_expected=30, - min_abs_deficit=0.02, - n_trials_cap=500, - weight_mode="amount", - p_floor=1e-12, - n_eff_scale=1.0, - rho_bb=0.2, # >0 enables Beta–Binomial tail - eps_rel=0.01, - debug=False, - debug_store_masks=False, - debug_max_cells=5000, - ): - if self._background_gmm is None: - _ = self.background_pdf() - - X = self._embedding - n = int(X.shape[0]) - if soft_bound is None: - soft_bound = max((n - 1) / max(n, 1), 0.99) - - gp = np.asarray(gene_prior, float) - if weight_mode == "binary": - w_obs = (gp > 0).astype(float) - elif weight_mode == "amount": - w_obs = np.clip(gp, 0.0, np.inf) - else: - raise ValueError("weight_mode must be 'amount' or 'binary'") - - # Kish n_eff_g with tempering + cap - sw = float(np.sum(w_obs)) - n_eff_g = 0.0 if sw <= 0 else (sw * sw) / max(float(np.sum(w_obs * w_obs)), 1e-12) - n_eff_g *= float(n_eff_scale) - if n_trials_cap is not None: - n_eff_g = min(n_eff_g, float(n_trials_cap)) - n_eff_g = max(1.0, n_eff_g) - n_trials_eff = int(round(n_eff_g)) - - f0_x = np.clip(self._background_gmm.pdf(X), 1e-300, np.inf) - f1_x = np.clip(gmm1.pdf(X), 1e-300, np.inf) - w0 = f0_x / float(np.sum(f0_x)) - - w_obs_alpha = w_obs / (sw if sw > 0 else 1.0) - w0_alpha = w0 - - if lambda_values is None: - lambda_values = np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) - - best_logp = None - best = {"lambda": None, "k_obs": None, "p0": None, "obs_prop": None} - scanned = 0 - tested = 0 - - per_lambda = [] if debug else None - - for lambda_ in lambda_values: - lambda_ = float(lambda_) - in_R_mask = f0_x > lambda_ * f1_x - - p0_abs = float(np.sum(w0_alpha * in_R_mask)) - reason = None - if p0_abs < min_p0_abs: - reason = "fail:min_p0_abs" - if n_eff_g * p0_abs < min_expected: - reason = "fail:min_expected" - if p0_abs > soft_bound: - reason = "fail:soft_bound" - - obs_prop = float(np.sum(w_obs_alpha * in_R_mask)) - if reason is None: - if (p0_abs - obs_prop) < min_abs_deficit: - reason = "fail:min_abs_deficit" - elif obs_prop > (p0_abs / lambda_) * (1.0 - float(eps_rel)): - reason = "fail:c_bound" #this checks whether observed f1 density in the region is at an equal or lower proportion than the observed f0 density in the region * c (where c is the contrast). If the region is larger than expectation, reject the gene. - - if debug: - rec = { - "lambda": lambda_, - "p0_abs": p0_abs, - "obs_prop": obs_prop, - "n_eff_g": float(n_eff_g), - "n_eff_expected": float(n_eff_g * p0_abs), - "reason": reason, - } - if debug_store_masks: - ncap = min(int(debug_max_cells), n) - rec["in_R_idx"] = np.flatnonzero(in_R_mask[:ncap]).astype(int) - if weight_mode == "binary": - expr_mask = gp > 0 - rec["expr_in_R_count"] = int(np.sum(expr_mask & in_R_mask)) - per_lambda.append(rec) - - if reason is not None: - continue - - scanned += 1 - tested += 1 - - k_eff = int(np.rint(obs_prop * n_trials_eff)) - p0_clip = np.clip(p0_abs, 1e-12, 1 - 1e-12) - - if rho_bb and rho_bb > 0.0: - ab_sum = max(1.0 / float(rho_bb) - 1.0, 2.0) - alpha = float(p0_clip * ab_sum) - beta = float((1.0 - p0_clip) * ab_sum) - p_raw = float(betabinom.cdf(k_eff, n_trials_eff, alpha, beta)) - logp_raw = np.log(max(p_raw, np.finfo(float).tiny)) - else: - lFkm1 = binom.logcdf(k_eff - 1, n_trials_eff, p0_clip) - lpk = binom.logpmf(k_eff, n_trials_eff, p0_clip) + np.log(0.5) - logp_raw = logsumexp([lFkm1, lpk]) - - if (best_logp is None) or (logp_raw < best_logp): - best_logp = float(logp_raw) - best.update( - {"lambda": lambda_, "k_obs": k_eff, "p0": p0_abs, "obs_prop": obs_prop} - ) - - if tested == 0: - out = { - "p_value": 1.0, - "raw_min_p": 1.0, - "log_p_single": 0.0, - "log_p_sidak": 0.0, - "neglog10_p_single": 0.0, - "neglog10_p_sidak": 0.0, - "best_lambda": None, - "k_obs_eff": None, - "p0_abs": None, - "obs_prop": None, - "scanned": int(scanned), - "tested": int(tested), - "sidak_penalty": 1, - "n": n, - "n_eff_g": float(n_eff_g), - "n_trials_eff": int(n_trials_eff), - "guards": { - "min_p0_abs": float(min_p0_abs), - "min_expected": float(min_expected), - "min_abs_deficit": float(min_abs_deficit), - "soft_bound": float(soft_bound), - "eps_rel": float(eps_rel), - }, - "n_eff_scale": float(n_eff_scale), - "rho_bb": float(rho_bb), - } - if debug: - out["per_lambda"] = per_lambda - return out - - m_eff = tested - sidak_logp = _logsidak_from_logp(best_logp, m_eff) - - p_value = float(np.exp(sidak_logp)) - raw_min_p = float(np.exp(best_logp)) - p_value = _safe_p(max(p_value, p_floor)) - raw_min_p = _safe_p(max(raw_min_p, p_floor)) - - out = { - "p_value": p_value, - "raw_min_p": raw_min_p, - "log_p_single": float(best_logp), - "log_p_sidak": float(sidak_logp), - "neglog10_p_single": float(-best_logp / np.log(10)), - "neglog10_p_sidak": float(-sidak_logp / np.log(10)), - "best_lambda": best["lambda"], - "k_obs_eff": best["k_obs"], - "p0_abs": best["p0"], - "obs_prop": best["obs_prop"], - "scanned": int(scanned), - "tested": int(tested), - "sidak_penalty": 1, - "n": n, - "n_eff_g": float(n_eff_g), - "n_trials_eff": int(n_trials_eff), - "guards": { - "min_p0_abs": float(min_p0_abs), - "min_expected": float(min_expected), - "min_abs_deficit": float(min_abs_deficit), - "soft_bound": float(soft_bound), - "eps_rel": float(eps_rel), - }, - "n_eff_scale": float(n_eff_scale), - "rho_bb": float(rho_bb), - } - if debug: - out["per_lambda"] = per_lambda - return out - - # Backward-compatible alias - def localization_pval_dep_scan(self, *args, **kwargs): - return self.depletion_pval_scan(*args, **kwargs) - - # ------------------------------------------------------------------ - # Main scan used in practice - # ------------------------------------------------------------------ - def bic_score(self, gmm1, gene_prior): - bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 - p = gmm1.pdf(self._embedding[gene_prior > 0, :]) - n_cells = np.sum(gene_prior > 0) - bic = ( - bic_component_cost * gmm1.n_comp * np.log(np.sum(gene_prior > 0)) - - 2 * np.sum(p[p > 0]) - ) - return bic / n_cells - - def gmm_scan_new( - self, - genes=None, - weights_transform=None, - zscore_thresh=None, - max_freq=0.5, - verbose=False, - n_bootstrap_inits=None, - # Depletion-scan defaults - rc_lambda_values=None, # default inside method - rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated - rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated - rc_min_abs_deficit=0.02, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region - rc_n_trials_cap=500, #maximum effective sample size - rc_soft_bound=1.0, #this is unused/can be removed - rc_n_eff_scale=0.5, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes - rc_p_floor=1e-12, # this is just model precision, can be ignored - rc_rho_bb=0.02, #this is the strength of the beta binomial (0.0 is standard binomial, set at 0.02-0.05 for wider tails) - rc_weight_mode="amount", - rc_eps_rel=0.01, - ): - if verbose: - print("gmm_scan_new: using depletion scan for depletion_pval (depletion_pval_scan)") - - if n_bootstrap_inits is not None: - self.n_bootstrap_inits = int(n_bootstrap_inits) - - locally_enriched = dict() - gzeros, freqzeros, zzeros = [], [], [] - inclgenes = self.get_genes_indices(genes) - - f0 = self.background_pdf(weights_transform=weights_transform) - for i_gene in tqdm( - inclgenes, - desc="scanning genes", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - gene_prior = self.background_pdf(i_gene, weights_transform) - try: - if np.sum(gene_prior) == 0: - gzeros.append(self._adata.var_names[i_gene]) - continue - if np.mean(gene_prior > 0) > max_freq: - freqzeros.append(self._adata.var_names[i_gene]) - continue - - comp_gene_prior = self._auto_n_effective_weights(gene_prior) - if comp_gene_prior is None: - n_comp = 1 - else: - loc_indices = comp_gene_prior > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - gene_prior[loc_indices], - loc_indices, - ) - gmm1 = self.signal_gmm(weights=gene_prior, n_comp=n_comp) - - i1 = gene_prior > 0 - loc_f1 = gmm1.pdf(self._embedding[i1, :]) - p1 = np.mean(i1) - sample_size = p1 * self.n_cells - - ltst_score = ltst_score_func(f0[i1], loc_f1, p1) - sens_score = sens_score_func(f0[i1], loc_f1, i1[i1]) - - zscore = np.dot(gene_prior[i1], ltst_score) / np.sum(gene_prior[i1]) - zscore = self._null_distribution.to_zscore(zscore, p1) - if (zscore_thresh is not None) and (zscore < zscore_thresh): - zzeros.append(self._adata.var_names[i_gene]) - continue - - cs_res = self.depletion_pval_scan( - gmm1, - gene_prior, - debug=True, - lambda_values=( - rc_lambda_values - if rc_lambda_values is not None - else np.concatenate([[1.0], np.geomspace(1.05, 3.0, 12)]) - ), - soft_bound=rc_soft_bound, - min_p0_abs=rc_min_p0_abs, - min_expected=rc_min_expected, - min_abs_deficit=rc_min_abs_deficit, - n_trials_cap=rc_n_trials_cap, - weight_mode=rc_weight_mode, - p_floor=rc_p_floor, - n_eff_scale=rc_n_eff_scale, - rho_bb=rc_rho_bb, - eps_rel=rc_eps_rel, - ) - depletion_pval = _safe_p(cs_res["p_value"]) - - concentration_pval = _safe_p(float(normal_sf(zscore, 0.0, 1.0))) - - p_cauchy = cauchy_combine([depletion_pval, concentration_pval]) - h_size = _safe_p(1.0 - np.exp(-1.0 / (sample_size + 1.0))) - # match notebook behavior: clip sens then use (1 - sens) (sens_power==1.0) - sens = np.clip(sens_score, 1e-6, 1.0) - h_sens = _safe_p(1.0 - sens) - # use notebook weights (w_size=0.05, w_sens=0.12) - p_final = 1.0 - (1.0 - p_cauchy) * (1.0 - 0.05 * h_size) * (1.0 - 0.12 * h_sens) - p_final = float(smooth_qvals(np.array([_safe_p(p_final)]))[0]) - - locally_enriched[self._adata.var_names[i_gene]] = { - "bic": self.bic_score(gmm1, gene_prior), - "zscore": zscore, - "sens_score": sens_score, - "depletion_pval": depletion_pval, - "localization_pval": depletion_pval, # backward-compatible alias - "concentration_pval": concentration_pval, - "h_size": h_size, - "h_sens": h_sens, - "pval": p_final, - "K_components": n_comp, - "sample_size": sample_size, - "depletion_scan": cs_res, - "depl_scan": cs_res, - } - - except ValueError as e: - if verbose: - logger.info(e) - - if verbose: - print("gzeros:", len(gzeros), "freqzeros:", len(freqzeros), "zzeros:", len(zzeros)) - return locally_enriched - - # ------------------------------------------------------------------ - # Small helpers still used by gmm_scan_new - # ------------------------------------------------------------------ - def get_genes_indices(self, genes): - inclgenes = range(self.n_genes) - list_genes = self._adata.var_names.tolist() - if genes is not None: - inclgenes = [list_genes.index(i) for i in genes] - return inclgenes - - def get_gene_prior(self, i_gene, weights_transform): - gene_prior = self.W[:, i_gene] - if weights_transform is not None: - gene_prior = weights_transform(gene_prior) - return gene_prior - - def signal_pdf(self, weights, n_comp=None): - gmm = self.signal_gmm(weights=weights, n_comp=n_comp, ) - return gmm.pdf(self._embedding) - - def gmm_loglikelihoodtest(self, genes=None, weights_transform=None, max_freq=0.5): - res = dict() - bkg_df = self.auto_bkg_components(self.n_cells) - log_bkg_pdf = self.background_pdf(weights_transform=weights_transform) - bkg_pdf_gt0 = log_bkg_pdf > 0 - log_bkg_pdf = np.where(bkg_pdf_gt0, np.log(log_bkg_pdf), 0) - - inclgenes = self.get_genes_indices(genes) - - for i_gene in tqdm(inclgenes, desc=f'scanning genes', - position=0, leave=True, disable=self._disable_progress_info): - gene_prior = self.get_gene_prior(i_gene, weights_transform) - gene_prior_gt0 = gene_prior > 0 - n_gene_prior = np.sum(gene_prior_gt0) - pv = 1.0 - df = None - lr = None - n_comp = 0 - sample_size = None - - if (n_gene_prior > 5) & ((n_gene_prior / len(gene_prior)) < max_freq): - comp_gene_prior = self._auto_n_effective_weights(gene_prior) - if comp_gene_prior is None: - n_comp = 1 - else: - loc_indices = comp_gene_prior > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - gene_prior[loc_indices], - indices=loc_indices, # <-- required - ) - - - f1 = self.signal_pdf(weights=gene_prior, n_comp=n_comp) - - df = (bkg_df - n_comp) + 1 - ix = gene_prior_gt0 & (f1 > 0) - if np.sum(ix) > 5: - ix = ix & bkg_pdf_gt0 - sample_size = np.sum(ix) - - if sample_size == 0: - pv = 0 - - elif sample_size > 5: - lr = self.calc_lratio(f1, ix, log_bkg_pdf, sample_size) - pv = 1 - chi2.cdf(lr, df) - - res[self._adata.var_names[i_gene]] = { - 'llratio_pvalue': pv, - 'llratio_sample_size': sample_size, - 'llratio_stat': lr, - 'llratio_df': df, - 'n_comp': n_comp, - } - - return res - - def gmm_local_pvalue( - self, - genes=None, - n_comp=None, - weights_transform=None, - alpha=0.05, - n_inits=100, - normalize_knn=True, - eps=1e-12, - ): - from scipy.stats import rankdata - from sklearn.metrics import roc_curve - import scipy.sparse as sp - import numpy as np - - f0 = np.asarray(self.background_pdf(weights_transform=weights_transform), dtype=self._dtype).ravel() - K = self.knn() - - # Optional: row-normalize K so every cell has comparable neighborhood "mass" - if normalize_knn: - if sp.issparse(K): - deg = np.asarray(K.sum(axis=1)).ravel() - inv = 1.0 / np.maximum(deg, eps) - Kuse = sp.diags(inv).dot(K) - else: - deg = K.sum(axis=1) - Kuse = K / np.maximum(deg[:, None], eps) - else: - Kuse = K - - locally_enriched = {} - inclgenes = self.get_genes_indices(genes) - - for i_gene in tqdm( - inclgenes, - desc="scanning genes", - position=0, - leave=True, - disable=self._disable_progress_info, - ): - gene_prior = self.get_gene_prior(i_gene, weights_transform) - gp = np.asarray(gene_prior, dtype=self._dtype).ravel() - - sw = float(gp.sum()) - if sw <= 0: - continue # or store null result if you prefer - - # signal pdf + LTST - f1 = np.asarray(self.signal_pdf(weights=gp, n_comp=n_comp), dtype=self._dtype).ravel() - p = float(np.mean(gp > 0)) - ltst_score = ltst_score_func(f0, f1, p) # (n_cells,) - - zscore = float(np.dot(gp, ltst_score) / sw) - - # random pseudo-genes (must return f2: (n_cells, n_inits), rweights: (n_cells, n_inits)) - f2, rweights = self.random_pdf(weights=gp, n_comp=n_comp, n_inits=n_inits) - f2 = np.asarray(f2, dtype=self._dtype) - rweights = np.asarray(rweights, dtype=self._dtype) - - # p2 is per-init expression fraction - p2 = np.mean(rweights > 0, axis=0).astype(self._dtype, copy=False) # (n_inits,) - - ltst_score2 = ltst_score_func(f0[:, None], f2, p2) # (n_cells, n_inits) - - # empirical zscore p-value (avoid 0/1 extremes) - z2 = (np.sum(rweights * ltst_score2, axis=0) / sw).astype(np.float64) # (n_inits,) - z_p = (1.0 + np.sum(z2 >= zscore)) / (n_inits + 1.0) - - # neighborhood-smoothed statistics - if sp.issparse(Kuse): - wstat1 = np.asarray(Kuse.dot(ltst_score)).ravel() - wstat2 = np.asarray(Kuse.dot(ltst_score2)) - else: - wstat1 = (Kuse @ ltst_score).ravel() - wstat2 = (Kuse @ ltst_score2) - - # empirical p-values per cell: rank among [wstat1, wstat2...] - # shape: (n_cells, n_inits+1) - M = np.concatenate([wstat1[:, None], wstat2], axis=1) - ranks = rankdata(-M, axis=1, method="average")[:, 0] # rank of observed in each row - emp_p = ranks / (n_inits + 1.0) # in (0,1] - - # pick a cutoff from ROC if there is signal - empirical_h1 = emp_p < (alpha / self.n_cells) - - wstat1_cutoff = 0.0 - if np.any(empirical_h1) and np.any(~empirical_h1): - fpr, tpr, cuts = roc_curve(empirical_h1.astype(int), wstat1) - # maximize balanced accuracy ( (tpr + (1-fpr)) / 2 ) - j = np.argmax(((1 - fpr) + tpr) / 2.0) - wstat1_cutoff = float(cuts[j]) - - local_res = { - "wstat_cutoff": wstat1_cutoff, - "wstat_alpha": float(alpha), - "wstat_pvalues": emp_p.astype(np.float32, copy=False), - "zscore": float(zscore), - "zscore_pvalue": float(z_p), - "wstat_repetitions": int(n_inits), - } - - sig_idx = np.flatnonzero(wstat1 > wstat1_cutoff) - local_res["wstat_significant"] = sig_idx if sig_idx.size else None - local_res["wstat_significant_clusters"] = None # placeholder - - locally_enriched[self._adata.var_names[i_gene]] = local_res - - return locally_enriched - - - def gmm_local_scan(self, - genes=None, - weights_transform=None, - zscore_thresh=None, - max_freq=0.5): - - locally_enriched = dict() - if zscore_thresh is None: - zscore_thresh = 1.0 - K = self.knn() - - if sp.issparse(K): - knn_neis = np.asarray(K.sum(axis=1)).ravel() - inv = 1.0 / np.maximum(knn_neis, 1e-12) - # row-normalize: D^{-1} K - knn_norm = sp.diags(inv).dot(K) - else: - knn_neis = K.sum(axis=1) - knn_norm = K / np.maximum(knn_neis[:, None], 1e-12) - - inclgenes = self.get_genes_indices(genes) - - f0 = self.background_pdf(weights_transform=weights_transform) - f0 = np.asarray(f0, dtype=self._dtype) - for i_gene in tqdm(inclgenes, desc=f'scanning genes', - position=0, leave=True, disable=self._disable_progress_info): - gene_prior = self.get_gene_prior(i_gene, weights_transform) - try: - if np.sum(gene_prior) == 0: - continue - - if np.mean(gene_prior > 0) > max_freq: - continue - - comp_gene_prior = self._auto_n_effective_weights(gene_prior) - if comp_gene_prior is None: - n_comp = 1 - else: - loc_indices = comp_gene_prior > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - gene_prior[loc_indices], - indices=loc_indices, # <-- THIS is the missing piece - ) - - - f1 = self.signal_pdf(weights=gene_prior, n_comp=n_comp) - expr = (gene_prior > 0).astype(self._dtype, copy=False) - - # p1 = local expression fraction around each cell - if sp.issparse(knn_norm): - p1 = knn_norm.dot(expr) - else: - p1 = knn_norm @ expr - p1 = np.asarray(p1, dtype=self._dtype).ravel() - f1 = np.asarray(f1, dtype=self._dtype).ravel() - # compute ltst per-cell then smooth by knn - ltst = ltst_score_func( - np.asarray(f0, dtype=self._dtype), - np.asarray(f1, dtype=self._dtype), - p1, - ) - - if sp.issparse(knn_norm): - local_zscore = knn_norm.dot(ltst) - else: - local_zscore = knn_norm @ ltst - local_zscore = np.asarray(local_zscore, dtype=self._dtype).ravel() - - local_zscore = self._null_distribution.to_zscore(local_zscore, p1) - - K = self.knn() - expr_mask = (gene_prior > 0).astype(np.float32) # (n,) - sig_mask = (f1 > f0).astype(np.float32) # (n,) - - if sp.issparse(K): - expr_neighbors = K.dot(expr_mask) # how many expressing neighbors (weighted) - sig_expr_neighbors = K.dot(expr_mask * sig_mask) # how many expressing-and-signal neighbors - else: - expr_neighbors = K @ expr_mask - sig_expr_neighbors = K @ (expr_mask * sig_mask) - - local_lscore = sig_expr_neighbors / np.maximum(expr_neighbors, 1e-12) - local_lscore = np.asarray(local_lscore).ravel() - - - # rate - # local_zscore = local_zscore * knn_neis / self.n_cells - assert K.shape == (self.n_cells, self.n_cells), (K.shape, self.n_cells) - assert f0.shape == (self.n_cells,), (f0.shape, self.n_cells) - assert f1.shape == (self.n_cells,), (f1.shape, self.n_cells) - - - localization_pval = localization_pvalue_nn_func(gene_prior, f1, f0, K) - concentration_pval = norm.sf(local_zscore) - #localization_pval = 1.0 - #concentration_pval = norm.sf(local_zscore) - - - if np.any(local_zscore > zscore_thresh): - locally_enriched[self._adata.var_names[i_gene]] = { - 'enriched': np.flatnonzero(local_zscore > zscore_thresh), - 'local_zscore': local_zscore, - 'local_lscore': local_lscore, - 'localization_pval': localization_pval, - 'concentration_pval': concentration_pval, - 'local_pvalue': localization_pval + concentration_pval - localization_pval*concentration_pval, - 'K_components': n_comp - } - except Exception as e: - logger.exception(e) # prints full traceback - raise # optional: stop on first failure - - return locally_enriched - - @staticmethod - def calc_lratio(f1, ix, log_bkg_pdf, sample_size, eps=1e-300): - """ - Per-cell LRT contribution on expressing cells: - -2 * sum_{i in ix} ( log f0(i) - log f1(i) ) / sample_size - """ - # ensure 1D arrays - f1 = np.asarray(f1).ravel() - ix = np.asarray(ix, dtype=bool).ravel() - log_bkg_pdf = np.asarray(log_bkg_pdf).ravel() - - # log f1 safely (no mutation) - log_f1 = np.log(np.clip(f1, eps, np.inf)) - - denom = float(sample_size) if sample_size > 0 else 1.0 - return float((-2.0 * np.sum((log_bkg_pdf[ix] - log_f1[ix])) ) / denom) - - def random_pdf( - self, - weights, - n_comp=None, - n_inits=300, - buckets=None, - ): - import numpy as np - from scipy.stats import multivariate_normal as mnorm # <-- add back - - weights = np.asarray(weights, dtype=self._dtype).ravel() - - if n_comp is None: - comp_weights = self._auto_n_effective_weights(weights) - if comp_weights is None: - n_comp = 1 - else: - loc_indices = comp_weights > 0 - n_comp = self.auto_n_components( - self._embedding[loc_indices, :], - weights[loc_indices], - indices=loc_indices, # <-- IMPORTANT - ) - - pis, mus, sigmas, rweights = softbootstrap_gmm( - self._embedding, - raw_weights=weights, - n_components=n_comp, - reg_covar=self.reg_covar(), - n_inits=n_inits, - buckets=buckets, - ) - - pis = np.asarray(pis) - mus = np.asarray(mus) - sigmas = np.asarray(sigmas) - rweights = np.asarray(rweights) - - dtot = np.zeros((self.n_cells, n_inits), dtype=np.float64) - - # NOTE: this loop is slow but correct; optimize later if needed - for j in range(n_inits): - for i in range(n_comp): - w = pis[j, i] - if w == 0: - continue - c0 = mnorm(mean=mus[j, i], cov=sigmas[j, i], allow_singular=True) - dtot[:, j] += w * c0.pdf(self._embedding) - - return dtot.astype(self._dtype, copy=False), rweights.astype(self._dtype, copy=False) - - -# ---------------------------------------------------------------------- -# JIT-accelerated scoring helpers -# ---------------------------------------------------------------------- -@numba.jit(nopython=True) -def ltst_score_func(f0, f1, p): - q = np.sqrt(p) - return (q * (1 - q)) * (f1 - f0) / (f1 + f0) - - -@numba.jit(nopython=True) -def sens_score_func(f0, f1, i): - return np.mean(f1[i > 0] > f0[i > 0]) - -import numpy as np -import scipy.sparse as sp - -def localization_pvalue_nn_func(x1, f1, f0, nn): - """ - Sparse-safe rewrite of the original localization_pvalue_nn_func. - - Preserves: - - i1/obs1/obs2 definitions - - n and o as neighborhood (weighted) counts / signed balance - - f2 weighting for global mu_hat, p1, p0 - - effective_n, sd_hat - - per-cell p via normal_sf(o[i], mu_hat, sd_hat[i]) - - Works for: - - nn sparse CSR/CSC/COO - - nn dense numpy array - """ - x1 = np.asarray(x1) - f1 = np.asarray(f1) - f0 = np.asarray(f0) - - # boolean masks - i1 = (x1 > 0) - obs1 = ((f1 > f0) & i1) - obs2 = ((f1 < f0) & i1) - - # convert to float vectors for dot products - i1_f = i1.astype(np.float32) - obs1_f = obs1.astype(np.float32) - obs2_f = obs2.astype(np.float32) - - # neighborhood weighted counts - if sp.issparse(nn): - # ensure CSR for fast dot - nn = nn.tocsr() - n = nn.dot(i1_f) - o = nn.dot(obs1_f) - nn.dot(obs2_f) - else: - n = nn @ i1_f - o = (nn @ obs1_f) - (nn @ obs2_f) - - n = np.asarray(n).ravel() - o = np.asarray(o).ravel() - - # normalize signed balance - o = o / np.clip(n, 1.0, np.inf) - - # same f2 + global mu_hat logic as original - f2 = 2.0 * (f1 * f0) / np.clip(f1 + f0, 1e-12, np.inf) - - denom = float(np.sum(f2)) - if denom <= 0 or not np.isfinite(denom): - # degenerate fallback: no information => p=1 everywhere - return np.ones(shape=(nn.shape[0],), dtype=np.float64) - - p1 = float(np.sum((f1 > f0) * f2) / denom) - p0 = float(np.sum((f1 < f0) * f2) / denom) - mu_hat = p1 - p0 - - effective_n = np.clip(n * (1.0 + 2.0 * np.abs(o / 2.0)), 1.0, np.inf) - sd_hat = np.sqrt(((p1 + p0) - ((p1 - p0) ** 2)) / effective_n) - - # compute p only where n>0 (same behavior as your loop) - p = np.ones(shape=(nn.shape[0],), dtype=np.float64) - idx = np.flatnonzero(n > 0) - - # vectorized normal_sf (your normal_sf is scalar-numba; keep loop or vectorize in numpy) - # We'll keep the loop for correctness with your numba normal_sf signature. - for i in idx: - p[i] = normal_sf(float(o[i]), mu=mu_hat, sigma=float(sd_hat[i])) - - return p - - -@numba.njit -def normal_sf(x, mu, sigma): - """ - Normal survival function SF = 1 - CDF, numba-jitted. - """ - z = (x - mu) / sigma - return 0.5 * math.erfc(z * 0.7071067811865476) # = 1 - Phi(z) - - -def _logsidak_from_logp(logp_min: float, m_eff: int) -> float: - """ - Sidák combine in log-space: - log p_sidák = log(1 - (1 - p_min)^m_eff) - = log(1 - exp(m_eff * log(1 - p_min))) - with log(1 - p_min) = log1p(-exp(logp_min)). - """ - if m_eff <= 1: - return float(logp_min) - l1mp = np.log1p(-np.exp(logp_min)) - return float(np.log1p(-np.exp(m_eff * l1mp))) - - -@numba.njit -def smooth_qvals(x): - return np.where( - x <= 0.99, - x, - 1 - 0.01 / (1 + 100 * (x - 0.99) + 10000 * (x - 0.99) ** 2), - ) - - -def _safe_p(p, eps=1e-15): - if not np.isfinite(p): - return 1.0 - eps - return float(min(max(p, eps), 1.0 - eps)) - - -def cauchy_combine(pvals, weights=None): - """ - Robust p-value combiner for dependent tests (Cauchy combination). - Liu & Xie (2020). - """ - ps = np.array([_safe_p(p) for p in pvals], dtype=np.float64) - if weights is None: - weights = np.ones_like(ps) - w = np.asarray(weights, dtype=np.float64) - w = w / (w.sum() if w.sum() > 0 else 1.0) - - t = np.sum(w * np.tan((0.5 - ps) * np.pi)) - pc = 0.5 - np.arctan(t) / np.pi - return _safe_p(pc) - - -def summarize_rc_debug(cs_res, top=8): - """ - Convenience helper to inspect per-threshold diagnostics from - depletion_pval_scan(..., debug=True). - """ - if "per_lambda" not in cs_res: - print("No per-threshold diagnostics captured. Run with debug=True.") - return - rows = cs_res["per_lambda"] - if not rows: - print("No thresholds scanned.") - return - from collections import Counter - - reasons = Counter(r.get("reason") for r in rows) - print("Reason counts:", dict(reasons)) - - cand = [r for r in rows if r.get("reason") is not None] - cand.sort(key=lambda r: r["n_eff_expected"], reverse=True) - print(f"\nTop {min(top, len(cand))} failing thresholds by n_eff_expected:") - for r in cand[:top]: - print( - f" lambda={r['lambda']:.3f} p0_abs={r['p0_abs']:.3f} " - f"obs_prop={r['obs_prop']:.3f} n_eff_expected={r['n_eff_expected']:.1f} " - f"reason={r['reason']}" - ) From ce53fb7dfccd6fdd04a8bf46b727ae2226774f94 Mon Sep 17 00:00:00 2001 From: weslewis Date: Mon, 2 Mar 2026 12:18:55 -0500 Subject: [PATCH 6/7] Update LOCAT/gmm_scan_new default parameters --- locat/locat_condensed.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/locat/locat_condensed.py b/locat/locat_condensed.py index 82341ea..cb89cce 100755 --- a/locat/locat_condensed.py +++ b/locat/locat_condensed.py @@ -56,7 +56,7 @@ def __init__( adata: AnnData, cell_embedding: np.ndarray, k: int, - n_bootstrap_inits: int, + n_bootstrap_inits: int = 50, show_progress=False, wgmm_dtype: str = "same", # "same" | "float32" | "float64" knn=None, # <-- NEW: precomputed adjacency/connectivities @@ -752,17 +752,17 @@ def gmm_scan_new( genes=None, weights_transform=None, zscore_thresh=None, - max_freq=0.5, + max_freq=0.9, verbose=False, n_bootstrap_inits=None, # Depletion-scan defaults rc_lambda_values=None, # default inside method rc_min_p0_abs=0.10, #minimum proportion of f0 density in depleted region required for the region pval to be estimated - rc_min_expected=30, #minimum expected cells in depleted region required for the region pval to be estimated + rc_min_expected=3, #minimum expected cells in depleted region required for the region pval to be estimated rc_min_abs_deficit=0.04, #minimum absolute difference in f1(x) - f0(x) for all x in depleted region - rc_n_trials_cap=500, #maximum effective sample size + rc_n_trials_cap=None, #if None, defaults to sqrt(n_cells) rc_soft_bound=1.0, #this is unused/can be removed - rc_n_eff_scale=0.5, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes + rc_n_eff_scale=0.6, #scaling factor for effective sample sizes -- can be tweaked to stabilize pvalues across various gene sample sizes rc_p_floor=1e-12, # this is just model precision, can be ignored rc_rho_bb=0.02, #this is the strength of the beta binomial (0.0 is standard binomial, set at 0.02-0.05 for wider tails) rc_weight_mode="binary", @@ -773,6 +773,11 @@ def gmm_scan_new( if n_bootstrap_inits is not None: self.n_bootstrap_inits = int(n_bootstrap_inits) + rc_n_trials_cap_eff = ( + int(max(1, np.sqrt(self.n_cells))) + if rc_n_trials_cap is None + else int(rc_n_trials_cap) + ) locally_enriched = dict() gzeros, freqzeros, zzeros = [], [], [] @@ -834,7 +839,7 @@ def gmm_scan_new( min_p0_abs=rc_min_p0_abs, min_expected=rc_min_expected, min_abs_deficit=rc_min_abs_deficit, - n_trials_cap=rc_n_trials_cap, + n_trials_cap=rc_n_trials_cap_eff, weight_mode=rc_weight_mode, p_floor=rc_p_floor, n_eff_scale=rc_n_eff_scale, From 50198438afc3caceb45d513c2e181ff0509bacd8 Mon Sep 17 00:00:00 2001 From: weslewis Date: Mon, 2 Mar 2026 15:17:54 -0500 Subject: [PATCH 7/7] Align LOCAT defaults and matrix accessors; harden LRT/depletion outputs --- locat/locat_condensed.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/locat/locat_condensed.py b/locat/locat_condensed.py index cb89cce..ee881b4 100755 --- a/locat/locat_condensed.py +++ b/locat/locat_condensed.py @@ -126,7 +126,13 @@ def reg_covar(self, sample_size=None): # Data access / options # ------------------------------------------------------------------ @property - def W(self): + def W_t(self): + """ + Implementation-oriented expression matrix (cells x genes). + + This is the transpose of the manuscript's W object, which is + described in gene x cell orientation. + """ if self._X is None: X = self._adata.X if not isinstance(X, np.ndarray): @@ -134,11 +140,6 @@ def W(self): self._X = X.astype(self._dtype, copy=False) return self._X - @property - def X(self): - # Backward-compatible alias for previous property name. - return self.W - def show_progress(self, show_progress=True): self._disable_progress_info = not show_progress @@ -159,7 +160,7 @@ def background_n_components_init(self, weights_transform=None, min_points=10, n_ bic_component_cost = self.n_dims * (self.n_dims + 3) / 2 if weights_transform is not None: - Xdense = self.W.copy() + Xdense = self.W_t.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -372,7 +373,6 @@ def background_pdf( self, n_comp=None, reps=10, - total_counts_weight=True, weights_transform=None, force_refresh=False, ): @@ -393,7 +393,7 @@ def background_pdf( logger.info(f"Using {n_comp} components") if weights_transform is not None: - Xdense = self.W.copy() + Xdense = self.W_t.copy() for i in range(self.n_genes): Xdense[:, i] = weights_transform(Xdense[:, i]) weights = np.asarray(Xdense.sum(axis=1), dtype=self._dtype) @@ -545,10 +545,10 @@ def depletion_pval_scan( min_expected=30, min_abs_deficit=0.02, n_trials_cap=500, - weight_mode="amount", + weight_mode="binary", p_floor=1e-12, - n_eff_scale=1.0, - rho_bb=0.2, # >0 enables Beta–Binomial tail + n_eff_scale=0.6, + rho_bb=0.02, # >0 enables Beta–Binomial tail eps_rel=0.01, debug=False, debug_store_masks=False, @@ -673,7 +673,7 @@ def depletion_pval_scan( "obs_prop": None, "scanned": int(scanned), "tested": int(tested), - "sidak_penalty": 1, + "sidak_penalty": int(tested), "n": n, "n_eff_g": float(n_eff_g), "n_trials_eff": int(n_trials_eff), @@ -712,7 +712,7 @@ def depletion_pval_scan( "obs_prop": best["obs_prop"], "scanned": int(scanned), "tested": int(tested), - "sidak_penalty": 1, + "sidak_penalty": int(m_eff), "n": n, "n_eff_g": float(n_eff_g), "n_trials_eff": int(n_trials_eff), @@ -863,7 +863,6 @@ def gmm_scan_new( "zscore": zscore, "sens_score": sens_score, "depletion_pval": depletion_pval, - "localization_pval": depletion_pval, # backward-compatible alias "concentration_pval": concentration_pval, "h_size": h_size, "h_sens": h_sens, @@ -893,7 +892,7 @@ def get_genes_indices(self, genes): return inclgenes def get_gene_prior(self, i_gene, weights_transform): - gene_prior = self.W[:, i_gene] + gene_prior = self.W_t[:, i_gene] if weights_transform is not None: gene_prior = weights_transform(gene_prior) return gene_prior @@ -937,7 +936,8 @@ def gmm_loglikelihoodtest(self, genes=None, weights_transform=None, max_freq=0.5 f1 = self.signal_pdf(weights=gene_prior, n_comp=n_comp) - df = (bkg_df - n_comp) + 1 + # Keep df in the valid chi-square domain in edge cases. + df = max(1, int((bkg_df - n_comp) + 1)) ix = gene_prior_gt0 & (f1 > 0) if np.sum(ix) > 5: ix = ix & bkg_pdf_gt0