From 13b1b21b902c19d8c9d5047c36d98709a191242d Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 9 Sep 2025 22:37:30 +0200 Subject: [PATCH 01/25] init internal functions for permutation testing --- .../permutation_test/internal_functions.py | 67 +++++++++ src/acore/permutation_test/jaccard.py | 141 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 src/acore/permutation_test/internal_functions.py create mode 100644 src/acore/permutation_test/jaccard.py diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py new file mode 100644 index 0000000..a48937f --- /dev/null +++ b/src/acore/permutation_test/internal_functions.py @@ -0,0 +1,67 @@ +import numpy as np +import pandas as pd + + +def _permute(*groups, rng=np.random.default_rng(seed=12345)): + """ + Perform a single permutation of the groups. + + Parameters + ---------- + *groups : iterable + The groups to permute. + rng : np.random.Generator, optional + The random number generator to use for shuffling. + Defaults to a new random number generator. + + Returns + ------- + list + The permuted groups. + """ + # shuffle the combined array + combined = np.concatenate(groups) + rng.shuffle(combined) + # split the shuffled array into new groups of og size + new_groups = [] + start = 0 + for group in groups: + end = start + len(group) + new_groups.append(combined[start:end]) + start = end + return new_groups + + +def _contingency_table(*groups, to_np:bool=True)-> np.array: + """ + Create a contingency table from the provided groups. + + Parameters + ---------- + *groups : iterable + The groups to include in the contingency table. + to_np : bool, optional + Whether to return the table as a NumPy array (default is True). + + Returns + ------- + np.array + The contingency table as a NumPy array or a pandas DataFrame. + """ + # PRECONDITIONS + for group in groups: + if not isinstance(group, (list, pd.Series, np.ndarray)): + raise TypeError("Each group must be a list, pandas Series, or numpy array.") + + + # MAIN FUNCTION + # Create contingency table + table = pd.concat( + [pd.Series(group).value_counts() for group in groups], axis=1 + ).fillna(0).T + + if to_np: + return table.to_numpy() + else: + return table + diff --git a/src/acore/permutation_test/jaccard.py b/src/acore/permutation_test/jaccard.py new file mode 100644 index 0000000..08e52ff --- /dev/null +++ b/src/acore/permutation_test/jaccard.py @@ -0,0 +1,141 @@ +import numpy as np +import itertools +from collections.abc import Iterable + +def jaccard_similarity(set1:set, set2:set) -> float: + """ + Compute the Jaccard similarity between two sets. + + Parameters + ---------- + set1 : set + First set of nodes. + set2 : set + Second set of nodes. + Returns + ------- + float + Jaccard similarity coefficient, + which is the size of the intersection divided by + the size of the union of the two sets. + + Example + ------- + >>> set1 = {1, 2, 3} + >>> set2 = {2, 3, 4} + >>> jaccard_similarity(set1, set2) + 0.5 + """ + intersection = len(set1 & set2) + union = len(set1 | set2) + + if union !=0: + return intersection / union + else: + return 0 + + +def avg_jaccard(group: Iterable)-> tuple: + """ + Compute the average pairwise Jaccard similarity within a group of graphs. + + Parameters + ---------- + group : Iterable + An iterator of iterable objects, such as a list of sets. + + Returns + ------- + tuple + A tuple containing the average Jaccard similarity and its standard deviation. + + Example + ------- + >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}] + >>> avg_jaccard(group) + (0.3333333333333333, 0.0) + """ + + ## PRECONDITIONS + if not isinstance(group, Iterable): + raise TypeError("Input must be an iterable of iterables.") + if len(group) > 0: + for g in group: + if not isinstance(g, Iterable): + raise TypeError("Each element in the group must be an iterable (set, list, or tuple).") + elif len(group) == 0: + raise ValueError("Input group is empty. Cannot compute Jaccard similarity.") + + + # init the list + scores = [] + + for g1, g2 in itertools.combinations(group, 2): + # get sets + set1 = set(g1) + set2 = set(g2) + # compute Jaccard similarity + scores.append(jaccard_similarity(set1, set2)) + + if scores: + # if scores is not empty, return the average + return np.mean(scores), np.std(scores) + # if scores is empty (less than 2 in group), return 0 + else: + return 0, 0 + + +def btwn_jaccard(group1: Iterable, group2: Iterable) -> tuple: + """ + Compute the average Jaccard similarity between two groups of graphs. + + Parameters + ---------- + group1 : Iterable + An iterator of iterable objects, such as a list of sets. + group2 : Iterable + Another iterator of iterable objects. + + Returns + ------- + tuple + A tuple containing the average Jaccard similarity and its standard deviation. + + Example + ------- + >>> group1 = [{1, 2, 3}, {2, 3, 4}] + >>> group2 = [{3, 4, 5}, {4, 5, 6}] + >>> btwn_jaccard(group1, group2) + (0.3333333333333333, 0.0) + + """ + + if not isinstance(group1, Iterable) or not isinstance(group2, Iterable): + raise TypeError("Both inputs must be iterables of iterables.") + if len(group1) == 0 or len(group2) == 0: + raise ValueError("Both input groups must be non-empty.") + + # init the list + scores = [] + + for g1, g2 in itertools.product(group1, group2): + + # check + if not isinstance(g1, Iterable) or not isinstance(g2, Iterable): + raise TypeError("Each element in the groups must be an iterable (set, list, or tuple).") + + # get sets + set1 = set(g1) + set2 = set(g2) + + # compute Jaccard similarity + scores.append(jaccard_similarity(set1, set2)) + + if scores: + # if scores is not empty, return the average + return np.mean(scores), np.std(scores) + # if scores is empty (less than 2 in group), return 0 + else: + return 0, 0 + + From cdb1705a4f1158d3cefda3e94340d0a902f9bbbd Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 9 Sep 2025 22:37:41 +0200 Subject: [PATCH 02/25] user facing in init file --- src/acore/permutation_test/__init__.py | 185 +++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 src/acore/permutation_test/__init__.py diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py new file mode 100644 index 0000000..c0e6b8e --- /dev/null +++ b/src/acore/permutation_test/__init__.py @@ -0,0 +1,185 @@ +import numpy as np +import pandas as pd +from scipy.stats import ( + chi2_contingency, + ttest_rel, + ttest_ind, + f_oneway, +) + +from .internal_functions import _permute, _contingency_table + + +def paired_permutation( + cond1: np.ndarray, + cond2: np.ndarray, + metric: str = "t-statistic", + n_permutations: int = 10000, + rng: np.random.Generator = np.random.default_rng(seed=12345), + **kwargs +) -> dict: + """Perform a permutation test for paired samples.""" + # Validate input + if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray): + raise TypeError("Input must be numpy arrays.") + if cond1.shape != cond2.shape: + raise ValueError("Input arrays must have the same shape.") + + # paired differences + diff = cond1-cond2 + args = [diff] + + # compute observed metric + if metric == "t-statistic": + calculator = ttest_rel + args = [cond1, cond2] + elif metric == "mean": + calculator = np.mean + elif metric == "median": + calculator = np.median + elif callable(metric): + calculator = metric + else: + raise ValueError( + "Invalid metric specified. Acceptable metrics are: " + "'t-statistic', 'mean', 'median', or a custom function" + "that takes `cond1-cond2` as input." + ) + + observed_metric = calculator(*args, **kwargs) + if metric == "t-statistic": + abs_met = abs(observed_metric.statistic) + else: + abs_met = abs(observed_metric) + + # Perform permutations + permuted_f = [] + for _ in range(n_permutations): + # randomly flip direction of differences + permuted_diff = diff * rng.choice([-1, 1], size=diff.shape) + new_cond1 = np.where((permuted_diff==diff), cond1, cond2) + new_cond2 = np.where((permuted_diff==diff), cond2, cond1) + # postcondition check + if not (permuted_diff == (new_cond1 - new_cond2)).all(): + raise ArithmeticError("Postcondition failed: Issue with permuted differences") + # prep args + if metric == "t-statistic": + new_args = [new_cond1, new_cond2] + new_result = abs(calculator(*new_args, **kwargs).statistic) + else: + new_args = [permuted_diff] + new_result = abs(calculator(*new_args, **kwargs)) + # compute permuted metric + permuted_f.append(new_result) + + # Compute p-value + p_value = np.mean(permuted_f >= abs_met) + + return { + "metric": calculator, + "observed": observed_metric, + "p_value": p_value + } + + +def chi2_permutation( + *groups, + n_permutations: int = 10000, + rng: np.random.Generator = np.random.default_rng(seed=12345) +) -> dict: + """Perform a permutation test for chi-squared statistics.""" + + # generate contingency table + cont_table = _contingency_table(*groups, to_np=True) + + # Compute observed chi-squared statistic + observed_test = chi2_contingency(cont_table) + observed_chi2 = observed_test.statistic + + # Perform permutations + permuted_chi2 = [] + for _ in range(n_permutations): + # shuffle + perm_cont_table = _contingency_table(*_permute(*groups, rng=rng)) + + # calculate permuted chi-squared statistic + permuted_chi2.append(chi2_contingency(perm_cont_table).statistic) + + # Compute p-value + p_value = np.mean(permuted_chi2 >= observed_chi2) + + return { + "observed": observed_test, + "p_value": p_value + } + + +def indep_permutation( + group1: np.ndarray, + group2: np.ndarray, + metric: str = "t-statistic", + n_permutations: int = 10000, + rng: np.random.Generator = np.random.default_rng(seed=12345), + **kwargs +) -> dict: + """ + TODO + """ + + ## PRECONDITIONS + if not isinstance(group1, np.ndarray) or not isinstance(group2, np.ndarray): + raise TypeError("Input must be numpy arrays.") + + stat = False + # what metric to use + if metric == "t-statistic": + calculator = ttest_ind + stat = True + elif metric == "anova": + calculator = f_oneway + stat = True + elif metric == "mean": + calculator = np.mean + elif metric == "median": + calculator = np.median + elif callable(metric): + calculator = metric + else: + raise ValueError( + "Invalid metric specified. Acceptable metrics are: " + "'t-statistic', 'mean', 'median', or a custom function" + "that takes each group as input." + ) + + # compute observed metric + if stat: + observed_metric = calculator(group1, group2, **kwargs) + else: + observed_metric = abs(calculator(group1) - calculator(group2)) + + # Perform permutations + permuted_f = [] + for _ in range(n_permutations): + new_group1, new_group2 = _permute(group1, group2, rng=rng) + # prep args + if stat: + new_result = abs(calculator(new_group1, new_group2, **kwargs).statistic) + abs_met = abs(observed_metric.statistic) + else: + new_result = abs(calculator(new_group1) - calculator(new_group2)) + abs_met = abs(observed_metric) + # compute permuted metric + permuted_f.append(new_result) + + # Compute p-value + p_value = np.mean(permuted_f >= abs_met) + + return { + "metric": calculator, + "observed": observed_metric, + "p_value": p_value + } + + + + From a13be15c4e34f356c598e1676bc94f3faca45f38 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 9 Sep 2025 22:38:00 +0200 Subject: [PATCH 03/25] :construction: some tests --- tests/test_permutation_test.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/test_permutation_test.py diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py new file mode 100644 index 0000000..d7eb091 --- /dev/null +++ b/tests/test_permutation_test.py @@ -0,0 +1,34 @@ +from acore import permutation_test as pt +import pytest +import numpy as np + +@pytest.fixture +def data1(): + return np.arange(1, 10) + +@pytest.fixture +def data2(): + return np.arange(31, 40) + +def test_paired_permutation(data1): + cond1 = data1 + cond2 = data1 + expected = { + "metric": np.mean, + "observed": 0.0, + "p_value": 1.0, + } + result = pt.paired_permutation(cond1, cond2, metric=np.mean) + assert result["metric"] == expected["metric"] + assert result["observed"] == expected["observed"] + assert result["p_value"] == expected["p_value"] + +def test_paired_permutation_reject(data1, data2): + cond1 = data1 + cond2 = data2 + for stat in ["median"]: + result = pt.paired_permutation(cond1, cond2, metric=stat) + print(result) + assert result["p_value"] < 0.05 + + From 5a5a42a643f4cc53197b497d5d84dcf50b3df14c Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:03:46 +0200 Subject: [PATCH 04/25] :bug: degeneracy check for paired permutations --- .../permutation_test/internal_functions.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py index a48937f..58b8f46 100644 --- a/src/acore/permutation_test/internal_functions.py +++ b/src/acore/permutation_test/internal_functions.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd - def _permute(*groups, rng=np.random.default_rng(seed=12345)): """ Perform a single permutation of the groups. @@ -65,3 +64,20 @@ def _contingency_table(*groups, to_np:bool=True)-> np.array: else: return table + +def _check_degeneracy(array: np.ndarray) -> bool: + """ + Check for degeneracy in the differences of paired samples. + + Parameters + ---------- + array : np.ndarray + The differences array. e.g. cond1 - cond2 + + Returns + ------- + bool + True if the conditions are degenerate, False otherwise. + """ + return np.all(array == array[0]) + From 179ce05db36e77b99403bfdb2733217a5780b4f5 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:19:49 +0200 Subject: [PATCH 05/25] use internal functions --- src/acore/permutation_test/__init__.py | 42 ++++++++++++++++++++------ 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index c0e6b8e..516b64f 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -6,8 +6,14 @@ ttest_ind, f_oneway, ) +import warnings +warnings.simplefilter("always", UserWarning) -from .internal_functions import _permute, _contingency_table +from .internal_functions import ( + _permute, + _contingency_table, + _check_degeneracy +) def paired_permutation( @@ -72,14 +78,28 @@ def paired_permutation( # compute permuted metric permuted_f.append(new_result) - # Compute p-value - p_value = np.mean(permuted_f >= abs_met) + if _check_degeneracy(diff): + identical_warn = ( + "Degenerate conditions detected (the data are identical). " + "Consider using a different statistical test. " + "Results may be unreliable." + ) + warnings.warn(identical_warn) - return { - "metric": calculator, - "observed": observed_metric, - "p_value": p_value - } + return { + "metric": calculator, + "observed": observed_metric, + "p_value": np.nan, + } + else: + # Compute p-value + p_value = np.mean(permuted_f >= abs_met) + + return { + "metric": calculator, + "observed": observed_metric, + "p_value": p_value + } def chi2_permutation( @@ -87,8 +107,9 @@ def chi2_permutation( n_permutations: int = 10000, rng: np.random.Generator = np.random.default_rng(seed=12345) ) -> dict: - """Perform a permutation test for chi-squared statistics.""" - + """ + + """ # generate contingency table cont_table = _contingency_table(*groups, to_np=True) @@ -183,3 +204,4 @@ def indep_permutation( + From a144f7e727d5adbfc94e476875d3e33c95d97df7 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:22:05 +0200 Subject: [PATCH 06/25] :memo: copilot docstrings --- src/acore/permutation_test/__init__.py | 71 ++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index 516b64f..2a3cb9b 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -24,8 +24,32 @@ def paired_permutation( rng: np.random.Generator = np.random.default_rng(seed=12345), **kwargs ) -> dict: - """Perform a permutation test for paired samples.""" - # Validate input + """ + Perform a permutation test for paired samples. + + Parameters + ---------- + cond1 : np.ndarray + First condition (paired samples). + cond2 : np.ndarray + Second condition (paired samples). + metric : str or callable, optional + Metric to compute ('t-statistic', 'mean', 'median', or a custom function). + n_permutations : int, optional + Number of permutations to perform (default is 10000). + rng : np.random.Generator, optional + Random number generator (default is np.random.default_rng(seed=12345)). + **kwargs + Additional arguments passed to the metric function. + + Returns + ------- + dict + Dictionary with keys: + - 'metric': Metric function used. + - 'observed': Observed metric value. + - 'p_value': Permutation test p-value (np.nan if degenerate). + """ # Validate input if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray): raise TypeError("Input must be numpy arrays.") if cond1.shape != cond2.shape: @@ -108,7 +132,23 @@ def chi2_permutation( rng: np.random.Generator = np.random.default_rng(seed=12345) ) -> dict: """ - + Perform a permutation test for categorical data using the chi-squared statistic. + + Parameters + ---------- + *groups : array-like + Arrays representing categorical groups. + n_permutations : int, optional + Number of permutations to perform (default is 10000). + rng : np.random.Generator, optional + Random number generator (default is np.random.default_rng(seed=12345)). + + Returns + ------- + dict + Dictionary with keys: + - 'observed': Observed chi-squared test result. + - 'p_value': Permutation test p-value. """ # generate contingency table cont_table = _contingency_table(*groups, to_np=True) @@ -144,7 +184,30 @@ def indep_permutation( **kwargs ) -> dict: """ - TODO + Perform a permutation test for independent samples. + + Parameters + ---------- + group1 : np.ndarray + First group of samples. + group2 : np.ndarray + Second group of samples. + metric : str or callable, optional + Metric to compute ('t-statistic', 'anova', 'mean', 'median', or a custom function). + n_permutations : int, optional + Number of permutations to perform (default is 10000). + rng : np.random.Generator, optional + Random number generator (default is np.random.default_rng(seed=12345)). + **kwargs + Additional arguments passed to the metric function. + + Returns + ------- + dict + Dictionary with keys: + - 'metric': Metric function used. + - 'observed': Observed metric value. + - 'p_value': Permutation test p-value. """ ## PRECONDITIONS From e66b6b337e0c92500be2c4b8cdf9813bbdd0cda6 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:22:40 +0200 Subject: [PATCH 07/25] tests for accept reject null --- tests/test_permutation_test.py | 96 +++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py index d7eb091..2372e31 100644 --- a/tests/test_permutation_test.py +++ b/tests/test_permutation_test.py @@ -1,34 +1,90 @@ from acore import permutation_test as pt import pytest import numpy as np +from scipy.stats import ttest_rel, ttest_ind, mannwhitneyu, wilcoxon @pytest.fixture -def data1(): +def set_array(): return np.arange(1, 10) @pytest.fixture -def data2(): - return np.arange(31, 40) - -def test_paired_permutation(data1): - cond1 = data1 - cond2 = data1 - expected = { - "metric": np.mean, - "observed": 0.0, - "p_value": 1.0, - } +def random_low(): + return np.random.negative_binomial(10, 0.3, 100) + +@pytest.fixture +def random_high(): + return np.random.negative_binomial(20, 0.1, 100) + +@pytest.fixture +def random_choice_equal(): + return np.random.choice( + ["A", "B", "C", "D"], + size=100, + replace=True, + p=[0.25, 0.25, 0.25, 0.25] + ) + +@pytest.fixture +def random_choice_unequal(): + return np.random.choice( + ["A", "B", "C", "D"], + size=100, + replace=True, + p=[0.1, 0.25, 0.05, 0.6] + ) + +def test_paired_permutation_same(set_array): + cond1 = set_array + cond2 = set_array result = pt.paired_permutation(cond1, cond2, metric=np.mean) - assert result["metric"] == expected["metric"] - assert result["observed"] == expected["observed"] - assert result["p_value"] == expected["p_value"] - -def test_paired_permutation_reject(data1, data2): - cond1 = data1 - cond2 = data2 - for stat in ["median"]: + assert result["metric"] == np.mean + assert result["observed"] == 0.0 + assert result["p_value"] is np.nan + + +def test_paired_permutation_reject(random_low, random_high): + cond1 = random_low + cond2 = random_high + for stat in ["t-statistic", "mean", "median"]: result = pt.paired_permutation(cond1, cond2, metric=stat) print(result) + print(ttest_rel(cond1, cond2)) + assert result["p_value"] < 0.05 + + +def test_paired_permutation_degen(set_array): + for stat in ["t-statistic", "mean", "median"]: + assert pt.paired_permutation( + set_array, + set_array+30, + metric=stat + )["p_value"] is np.nan + + +def test_indep_permutation_reject(random_low, random_high): + group1 = random_low + group2 = random_high + for stat in ["t-statistic", "anova", "median", "mean"]: + result = pt.indep_permutation(group1, group2, metric=stat) assert result["p_value"] < 0.05 +def test_indep_permutation_accept(random_low): + for stat in ["t-statistic", "anova", "median", "mean"]: + assert pt.indep_permutation( + random_low, + random_low, + metric=stat + )["p_value"] > 0.05 + + +def test_chi2_permutation_accept(random_choice_equal): + result = pt.chi2_permutation(random_choice_equal, random_choice_equal) + assert result["p_value"] >= 0.05 + + +def test_chi2_permutation_reject(random_choice_unequal, random_choice_equal): + result = pt.chi2_permutation(random_choice_equal, random_choice_unequal) + assert result["p_value"] < 0.05 + + From 53d5f46805d0187e2bcc6df1939c9df1bf7ad587 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:44:14 +0200 Subject: [PATCH 08/25] init tutorial --- docs/api_examples/permutation_testing.ipynb | 375 ++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 docs/api_examples/permutation_testing.ipynb diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb new file mode 100644 index 0000000..eff2427 --- /dev/null +++ b/docs/api_examples/permutation_testing.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e9bdb7c9", + "metadata": {}, + "source": [ + "# Tutorial: Permutation Testing using `acore`\n", + "\n", + "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n", + "\n", + "The samples were collected from wastewaster treatment plant inffluent and effluent." + ] + }, + { + "cell_type": "markdown", + "id": "79d8b537", + "metadata": { + "vscode": { + "languageId": "markdown" + } + }, + "source": [ + "## Downloading the dataset\n", + "\n", + "The analysed samples can be downloaded via the Mgnify API." + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8506ee79", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GOdescriptioncategoryERR2985255ERR2985256ERR2985257ERR2985258ERR2985259ERR2985260ERR2985261...ERR2985269ERR2985270ERR2985271ERR2985272ERR2985273ERR2985274ERR2985275ERR2985276ERR2985277ERR2985278
0GO:0000001mitochondrion inheritancebiological process0000030...1011120022
1GO:0000002mitochondrial genome maintenancebiological process0970274...1510000504111
2GO:0000012single strand break repairbiological process0311000...0010021211
3GO:0000015phosphopyruvate hydratase complexcellular component477384500188226232301...203432152341319520290349270292
4GO:0000030mannosyltransferase activitymolecular function189200576739129181...7042587392159671795891
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " GO description category \\\n", + "0 GO:0000001 mitochondrion inheritance biological process \n", + "1 GO:0000002 mitochondrial genome maintenance biological process \n", + "2 GO:0000012 single strand break repair biological process \n", + "3 GO:0000015 phosphopyruvate hydratase complex cellular component \n", + "4 GO:0000030 mannosyltransferase activity molecular function \n", + "\n", + " ERR2985255 ERR2985256 ERR2985257 ERR2985258 ERR2985259 ERR2985260 \\\n", + "0 0 0 0 0 0 3 \n", + "1 0 9 7 0 2 7 \n", + "2 0 3 1 1 0 0 \n", + "3 477 384 500 188 226 232 \n", + "4 189 200 57 67 39 129 \n", + "\n", + " ERR2985261 ... ERR2985269 ERR2985270 ERR2985271 ERR2985272 \\\n", + "0 0 ... 1 0 1 1 \n", + "1 4 ... 15 10 0 0 \n", + "2 0 ... 0 0 1 0 \n", + "3 301 ... 203 432 152 341 \n", + "4 181 ... 70 42 58 73 \n", + "\n", + " ERR2985273 ERR2985274 ERR2985275 ERR2985276 ERR2985277 ERR2985278 \n", + "0 1 2 0 0 2 2 \n", + "1 0 5 0 4 11 1 \n", + "2 0 2 1 2 1 1 \n", + "3 319 520 290 349 270 292 \n", + "4 92 159 67 179 58 91 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests \n", + "from io import StringIO\n", + "import pandas as pd\n", + "\n", + "study_accession= \"MGYS00005058\"\n", + "\n", + "url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", + "# download\n", + "response = requests.get(url)\n", + "# treat as file\n", + "file_like = StringIO(response.content.decode('utf-8'))\n", + "# read into pd df\n", + "df = pd.read_csv(file_like, sep='\\t')\n", + "# sanity check\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a9fa25c4", + "metadata": {}, + "source": [ + "For this demo we will only look at [go term 0030655](https://gowiki.tamu.edu/wiki/index.php/Category:GO:0030655_!_beta-lactam_antibiotic_catabolic_process)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0259a24f", + "metadata": {}, + "outputs": [], + "source": [ + "go = df.query(\"GO == 'GO:0030655'\")" + ] + }, + { + "cell_type": "markdown", + "id": "239321d2", + "metadata": {}, + "source": [ + "We need the metadata for the samples. We can also get this using the MGnify API" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "e7ade2e0", + "metadata": {}, + "outputs": [], + "source": [ + "# sample metadata\n", + "sample_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/samples\")\n", + "# analysis metadata\n", + "analysis_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={study_accession}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "a9cb02a6", + "metadata": {}, + "outputs": [], + "source": [ + "df_sm = pd.DataFrame(sample_meta.json()['data'])\n", + "df_am = pd.DataFrame(analysis_meta.json()['data'])\n", + "\n", + "\n", + "#['relationships']['run']['data']['id']" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "f50b423a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 24 entries, 0 to 23\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 type 24 non-null object\n", + " 1 id_x 24 non-null object\n", + " 2 attributes 24 non-null object\n", + " 3 relationships 24 non-null object\n", + " 4 links 24 non-null object\n", + " 5 run_id 24 non-null object\n", + " 6 sample_id 24 non-null object\n", + " 7 id_y 24 non-null object\n", + " 8 desc 24 non-null object\n", + "dtypes: object(9)\n", + "memory usage: 1.8+ KB\n" + ] + } + ], + "source": [ + "# get out needed metadata\n", + "df_am['run_id'] = df_am['relationships'].apply(lambda x: x['run']['data']['id'])\n", + "df_am['sample_id'] = df_am['relationships'].apply(lambda x: x['sample']['data']['id'])\n", + "df_sm['desc'] = df_sm['attributes'].apply(lambda x: x['sample-desc'])\n", + "\n", + "# join\n", + "df_combined = df_am.merge(df_sm[['id', 'desc']], left_on='sample_id', right_on='id', how='left')\n", + "df_combined.info()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0a7e33a08d03a2bc22fbb8666e20a197bbe17fc9 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 16 Sep 2025 10:03:26 +0200 Subject: [PATCH 09/25] diff conditions were diff mgys studies --- docs/api_examples/permutation_testing.ipynb | 139 +++++++++++--------- 1 file changed, 75 insertions(+), 64 deletions(-) diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index eff2427..e72d3ef 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -28,10 +28,18 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 5, "id": "8506ee79", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv\n", + "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv\n" + ] + }, { "data": { "text/html": [ @@ -234,7 +242,7 @@ "[5 rows x 27 columns]" ] }, - "execution_count": 87, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -244,17 +252,28 @@ "from io import StringIO\n", "import pandas as pd\n", "\n", - "study_accession= \"MGYS00005058\"\n", + "enf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", + "inf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n", + "\n", + "# init dict to hold dataframes\n", + "data = {}\n", + "for url in [enf_url, inf_url]:\n", + " print(f\"Processing {url}\")\n", + " # download\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " # treat as file\n", + " file_like = StringIO(response.content.decode('utf-8'))\n", + " # read into pd df\n", + " df = pd.read_csv(file_like, sep='\\t')\n", + " # to the dict\n", + " data[url] = df\n", + " else: \n", + " print(response)\n", + " print(url, \" download skipped\")\n", "\n", - "url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", - "# download\n", - "response = requests.get(url)\n", - "# treat as file\n", - "file_like = StringIO(response.content.decode('utf-8'))\n", - "# read into pd df\n", - "df = pd.read_csv(file_like, sep='\\t')\n", "# sanity check\n", - "df.head()" + "data[enf_url].head()" ] }, { @@ -267,87 +286,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "0259a24f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 54, 101, 69, 52, 61, 68, 401, 72, 67, 80, 87, 49, 63,\n", + " 53, 345, 131, 67, 69, 42, 67, 107, 126, 63, 94]),\n", + " array([ 23, 87, 1026, 63, 311, 47, 71, 121, 109, 334, 40,\n", + " 111, 111, 31, 21, 919, 58, 95, 98, 120, 96, 84,\n", + " 26, 89]))" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "go = df.query(\"GO == 'GO:0030655'\")" + "term = 'GO:0030655'\n", + "\n", + "inf_go = data[inf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n", + "enf_go = data[enf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n", + "\n", + "inf_go, enf_go" ] }, { "cell_type": "markdown", - "id": "239321d2", + "id": "13469e23", "metadata": {}, "source": [ - "We need the metadata for the samples. We can also get this using the MGnify API" + "get metadata" ] }, { "cell_type": "code", - "execution_count": 92, - "id": "e7ade2e0", + "execution_count": 29, + "id": "5eefb720", "metadata": {}, "outputs": [], "source": [ - "# sample metadata\n", - "sample_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/samples\")\n", - "# analysis metadata\n", - "analysis_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={study_accession}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "id": "a9cb02a6", - "metadata": {}, - "outputs": [], - "source": [ - "df_sm = pd.DataFrame(sample_meta.json()['data'])\n", - "df_am = pd.DataFrame(analysis_meta.json()['data'])\n", + "from acore.permutation_test import paired_permutation\n", "\n", - "\n", - "#['relationships']['run']['data']['id']" + "# optional for reproducibility\n", + "import numpy as np\n", + "rng = np.random.default_rng(12345)" ] }, { "cell_type": "code", - "execution_count": 195, - "id": "f50b423a", + "execution_count": 31, + "id": "9eabb911", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 24 entries, 0 to 23\n", - "Data columns (total 9 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 type 24 non-null object\n", - " 1 id_x 24 non-null object\n", - " 2 attributes 24 non-null object\n", - " 3 relationships 24 non-null object\n", - " 4 links 24 non-null object\n", - " 5 run_id 24 non-null object\n", - " 6 sample_id 24 non-null object\n", - " 7 id_y 24 non-null object\n", - " 8 desc 24 non-null object\n", - "dtypes: object(9)\n", - "memory usage: 1.8+ KB\n" - ] + "data": { + "text/plain": [ + "{'metric': ,\n", + " 'observed': np.float64(-70.95833333333333),\n", + " 'p_value': np.float64(0.2664)}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# get out needed metadata\n", - "df_am['run_id'] = df_am['relationships'].apply(lambda x: x['run']['data']['id'])\n", - "df_am['sample_id'] = df_am['relationships'].apply(lambda x: x['sample']['data']['id'])\n", - "df_sm['desc'] = df_sm['attributes'].apply(lambda x: x['sample-desc'])\n", + "result = paired_permutation(inf_go, enf_go, metric='mean', n_permutations=10000, rng=rng)\n", "\n", - "# join\n", - "df_combined = df_am.merge(df_sm[['id', 'desc']], left_on='sample_id', right_on='id', how='left')\n", - "df_combined.info()" + "result" ] } ], From dc648f6647b300cba5f88104272d7c316449c022 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:40:08 +0200 Subject: [PATCH 10/25] other go term --- docs/api_examples/permutation_testing.ipynb | 500 ++++++++++++++++++-- 1 file changed, 466 insertions(+), 34 deletions(-) diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index e72d3ef..1b0d644 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "8506ee79", "metadata": {}, "outputs": [ @@ -252,8 +252,11 @@ "from io import StringIO\n", "import pandas as pd\n", "\n", - "enf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", - "inf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n", + "enf_accession = \"MGYS00005058\"\n", + "inf_accession = \"MGYS00005056\"\n", + "\n", + "enf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{enf_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", + "inf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{inf_accession}/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n", "\n", "# init dict to hold dataframes\n", "data = {}\n", @@ -276,55 +279,472 @@ "data[enf_url].head()" ] }, + { + "cell_type": "code", + "execution_count": 62, + "id": "84364bf5", + "metadata": {}, + "outputs": [], + "source": [ + "# preprocessing\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "import numpy as np\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "coda = FunctionTransformer(lambda x: x / x.sum(axis=0), feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n", + "\n", + "def calc_clr(x):\n", + " # replace zeros with small \n", + " new_x = np.where(x > 0, x, 1e-10)\n", + " return np.log(new_x) - np.mean(np.log(new_x), axis=0)\n", + "\n", + "clr = FunctionTransformer(calc_clr, feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n", + "\n", + "def coda_clr(df):\n", + " pipe = make_pipeline(coda, clr).set_output(transform=\"pandas\")\n", + " transformed = pipe.fit_transform(df.select_dtypes(include='number'))\n", + " return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)\n", + "\n", + "clr_enf = coda_clr(data[enf_url])\n", + "clr_inf = coda_clr(data[inf_url])" + ] + }, { "cell_type": "markdown", "id": "a9fa25c4", "metadata": {}, "source": [ - "For this demo we will only look at [go term 0030655](https://gowiki.tamu.edu/wiki/index.php/Category:GO:0030655_!_beta-lactam_antibiotic_catabolic_process)" + "For this demo we will only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n", + "\n", + "It's expected that antibiotic catabolic processes to be higher in influent vs efluent" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 130, "id": "0259a24f", "metadata": {}, + "outputs": [], + "source": [ + "term = 'GO:0017001'\n", + "\n", + "inf_go = clr_inf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n", + "enf_go = clr_enf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n", + "\n", + "inf_go.columns = ['sample', 'abundance']\n", + "enf_go.columns = ['sample', 'abundance']" + ] + }, + { + "cell_type": "markdown", + "id": "13469e23", + "metadata": {}, + "source": [ + "The samples are paired. We need the metadata for the samples which we can also download using the MGnify API." + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "a7eb5848", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing MGYS00005058\n", + "Processing MGYS00005056\n" + ] + } + ], + "source": [ + "# getting sampling and analysis metadata\n", + "metadata = {}\n", + "for accession in [enf_accession, inf_accession]:\n", + " # prep urls\n", + " url_sam = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{accession}/samples'\n", + " url_ana = f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={accession}\"\n", + "\n", + " print(f\"Processing {accession}\")\n", + " # download\n", + " sam_response = requests.get(url_sam)\n", + " if sam_response.status_code == 200:\n", + " # json\n", + " df_sam = pd.DataFrame(sam_response.json()['data'])\n", + " # extract needed metadata\n", + " df_sam['desc'] = df_sam['attributes'].apply(lambda x: x['sample-desc'])\n", + " df_sam['sam_source'] = df_sam['desc'].str.split('.', expand=True)[0]\n", + " df_sam['sam_read'] = df_sam['desc'].str.split('.', expand=True)[2]\n", + " else: \n", + " raise Exception(f\"Failed to download {url_sam}\")\n", + " print(sam_response)\n", + " ana_response = requests.get(url_ana)\n", + " if ana_response.status_code == 200:\n", + " # json\n", + " df_ana = pd.DataFrame(ana_response.json()['data'])\n", + " # extract needed metadata\n", + " df_ana['run_id'] = df_ana['relationships'].apply(lambda x: x['run']['data']['id'])\n", + " df_ana['sample_id'] = df_ana['relationships'].apply(lambda x: x['sample']['data']['id'])\n", + " else: \n", + " raise Exception(f\"Failed to download {url_ana}\")\n", + " print(ana_response)\n", + " # merge\n", + " metadata[accession] = df_ana.merge(df_sam[['id', 'desc', 'sam_source', 'sam_read']], left_on='sample_id', right_on='id', how='left')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "487cd4da", + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([ 54, 101, 69, 52, 61, 68, 401, 72, 67, 80, 87, 49, 63,\n", - " 53, 345, 131, 67, 69, 42, 67, 107, 126, 63, 94]),\n", - " array([ 23, 87, 1026, 63, 311, 47, 71, 121, 109, 334, 40,\n", - " 111, 111, 31, 21, 919, 58, 95, 98, 120, 96, 84,\n", - " 26, 89]))" + "((24, 13), (24, 13))" ] }, - "execution_count": 21, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "term = 'GO:0030655'\n", - "\n", - "inf_go = data[inf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n", - "enf_go = data[enf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n", - "\n", - "inf_go, enf_go" + "enf_all = metadata[enf_accession].merge(enf_go, left_on='run_id', right_on='sample', how='inner')\n", + "inf_all = metadata[inf_accession].merge(inf_go, left_on='run_id', right_on='sample', how='inner')\n", + "# sanity check\n", + "enf_all.shape, inf_all.shape" ] }, { - "cell_type": "markdown", - "id": "13469e23", + "cell_type": "code", + "execution_count": 136, + "id": "66056854", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_id_enfrun_id_infsam_sourcesam_readabundance_enfabundance_inf
0ERR2985255ERR2814663TGREAD2 Taxonomy ID:2563183.2574.227
1ERR2985256ERR2814664MNREAD2 Taxonomy ID:2563182.5733.847
2ERR2985257ERR2814651AHREAD1 Taxonomy ID:2563184.2994.087
3ERR2985258ERR2814667TEREAD1 Taxonomy ID:2563182.7593.437
4ERR2985259ERR2814660FDREAD1 Taxonomy ID:2563183.3653.487
5ERR2985260ERR2814652ADREAD1 Taxonomy ID:2563183.1233.196
6ERR2985261ERR2814655EMREAD2 Taxonomy ID:2563182.8113.680
7ERR2985262ERR2814647MNREAD1 Taxonomy ID:2563182.6483.854
8ERR2985263ERR2814649ZRREAD2 Taxonomy ID:2563183.5753.899
9ERR2985264ERR2814646FDREAD2 Taxonomy ID:2563183.1933.451
10ERR2985265ERR2814658ADREAD2 Taxonomy ID:2563182.9093.303
11ERR2985266ERR2814650ZRREAD1 Taxonomy ID:2563183.3893.849
12ERR2985267ERR2814648WTREAD2 Taxonomy ID:2563182.8073.532
13ERR2985268ERR2814645TGREAD1 Taxonomy ID:2563183.2414.053
14ERR2985269ERR2814656TUREAD2 Taxonomy ID:2563182.9303.616
15ERR2985270ERR2814659AHREAD2 Taxonomy ID:2563184.4014.149
16ERR2985271ERR2814657TEREAD2 Taxonomy ID:2563182.9013.758
17ERR2985272ERR2814666BEREAD1 Taxonomy ID:2563184.1254.398
18ERR2985273ERR2814665BEREAD2 Taxonomy ID:2563184.0774.395
19ERR2985274ERR2814662WTREAD1 Taxonomy ID:2563183.1093.513
20ERR2985275ERR2814653FRREAD2 Taxonomy ID:2563183.0003.595
21ERR2985276ERR2814668EMREAD1 Taxonomy ID:2563182.7533.547
22ERR2985277ERR2814661TUREAD1 Taxonomy ID:2563182.8773.513
23ERR2985278ERR2814654FRREAD1 Taxonomy ID:2563182.7343.310
\n", + "
" + ], + "text/plain": [ + " run_id_enf run_id_inf sam_source sam_read \\\n", + "0 ERR2985255 ERR2814663 TG READ2 Taxonomy ID:256318 \n", + "1 ERR2985256 ERR2814664 MN READ2 Taxonomy ID:256318 \n", + "2 ERR2985257 ERR2814651 AH READ1 Taxonomy ID:256318 \n", + "3 ERR2985258 ERR2814667 TE READ1 Taxonomy ID:256318 \n", + "4 ERR2985259 ERR2814660 FD READ1 Taxonomy ID:256318 \n", + "5 ERR2985260 ERR2814652 AD READ1 Taxonomy ID:256318 \n", + "6 ERR2985261 ERR2814655 EM READ2 Taxonomy ID:256318 \n", + "7 ERR2985262 ERR2814647 MN READ1 Taxonomy ID:256318 \n", + "8 ERR2985263 ERR2814649 ZR READ2 Taxonomy ID:256318 \n", + "9 ERR2985264 ERR2814646 FD READ2 Taxonomy ID:256318 \n", + "10 ERR2985265 ERR2814658 AD READ2 Taxonomy ID:256318 \n", + "11 ERR2985266 ERR2814650 ZR READ1 Taxonomy ID:256318 \n", + "12 ERR2985267 ERR2814648 WT READ2 Taxonomy ID:256318 \n", + "13 ERR2985268 ERR2814645 TG READ1 Taxonomy ID:256318 \n", + "14 ERR2985269 ERR2814656 TU READ2 Taxonomy ID:256318 \n", + "15 ERR2985270 ERR2814659 AH READ2 Taxonomy ID:256318 \n", + "16 ERR2985271 ERR2814657 TE READ2 Taxonomy ID:256318 \n", + "17 ERR2985272 ERR2814666 BE READ1 Taxonomy ID:256318 \n", + "18 ERR2985273 ERR2814665 BE READ2 Taxonomy ID:256318 \n", + "19 ERR2985274 ERR2814662 WT READ1 Taxonomy ID:256318 \n", + "20 ERR2985275 ERR2814653 FR READ2 Taxonomy ID:256318 \n", + "21 ERR2985276 ERR2814668 EM READ1 Taxonomy ID:256318 \n", + "22 ERR2985277 ERR2814661 TU READ1 Taxonomy ID:256318 \n", + "23 ERR2985278 ERR2814654 FR READ1 Taxonomy ID:256318 \n", + "\n", + " abundance_enf abundance_inf \n", + "0 3.257 4.227 \n", + "1 2.573 3.847 \n", + "2 4.299 4.087 \n", + "3 2.759 3.437 \n", + "4 3.365 3.487 \n", + "5 3.123 3.196 \n", + "6 2.811 3.680 \n", + "7 2.648 3.854 \n", + "8 3.575 3.899 \n", + "9 3.193 3.451 \n", + "10 2.909 3.303 \n", + "11 3.389 3.849 \n", + "12 2.807 3.532 \n", + "13 3.241 4.053 \n", + "14 2.930 3.616 \n", + "15 4.401 4.149 \n", + "16 2.901 3.758 \n", + "17 4.125 4.398 \n", + "18 4.077 4.395 \n", + "19 3.109 3.513 \n", + "20 3.000 3.595 \n", + "21 2.753 3.547 \n", + "22 2.877 3.513 \n", + "23 2.734 3.310 " + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "get metadata" + "# pairing \n", + "df_data = enf_all.merge(\n", + " inf_all,\n", + " on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n", + ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n", + "\n", + "df_data" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 134, "id": "5eefb720", "metadata": {}, "outputs": [], @@ -338,28 +758,40 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 135, "id": "9eabb911", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'metric': ,\n", - " 'observed': np.float64(-70.95833333333333),\n", - " 'p_value': np.float64(0.2664)}" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "{'metric': , 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n" + ] } ], "source": [ - "result = paired_permutation(inf_go, enf_go, metric='mean', n_permutations=10000, rng=rng)\n", + "for metric in ['t-statistic', 'mean', np.mean]:\n", + " result = paired_permutation(\n", + " df_data['abundance_inf'].to_numpy(),\n", + " df_data['abundance_enf'].to_numpy(), \n", + " metric=metric, \n", + " n_permutations=10000, \n", + " rng=rng\n", + " )\n", "\n", - "result" + " print(result)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da7d601", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 5492203774f5e18d74e0038270c48310e8d907ec Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:52:45 +0200 Subject: [PATCH 11/25] :construction: instead saving processed dataset to example_data/mgnify --- docs/api_examples/permutation_testing.ipynb | 331 +----------------- .../Ju2018_GO0017001_enf_inf_paired.csv | 25 ++ src/acore/microbiome/internal_functions.py | 54 +++ 3 files changed, 90 insertions(+), 320 deletions(-) create mode 100644 example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv create mode 100644 src/acore/microbiome/internal_functions.py diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index 1b0d644..03f82dd 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -9,7 +9,7 @@ "\n", "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n", "\n", - "The samples were collected from wastewaster treatment plant inffluent and effluent." + "The samples were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n" ] }, { @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "8506ee79", "metadata": {}, "outputs": [ @@ -242,7 +242,7 @@ "[5 rows x 27 columns]" ] }, - "execution_count": 5, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 4, "id": "84364bf5", "metadata": {}, "outputs": [], @@ -321,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 5, "id": "0259a24f", "metadata": {}, "outputs": [], @@ -345,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 6, "id": "a7eb5848", "metadata": {}, "outputs": [ @@ -395,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 7, "id": "487cd4da", "metadata": {}, "outputs": [ @@ -405,7 +405,7 @@ "((24, 13), (24, 13))" ] }, - "execution_count": 132, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -419,319 +419,10 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": null, "id": "66056854", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
run_id_enfrun_id_infsam_sourcesam_readabundance_enfabundance_inf
0ERR2985255ERR2814663TGREAD2 Taxonomy ID:2563183.2574.227
1ERR2985256ERR2814664MNREAD2 Taxonomy ID:2563182.5733.847
2ERR2985257ERR2814651AHREAD1 Taxonomy ID:2563184.2994.087
3ERR2985258ERR2814667TEREAD1 Taxonomy ID:2563182.7593.437
4ERR2985259ERR2814660FDREAD1 Taxonomy ID:2563183.3653.487
5ERR2985260ERR2814652ADREAD1 Taxonomy ID:2563183.1233.196
6ERR2985261ERR2814655EMREAD2 Taxonomy ID:2563182.8113.680
7ERR2985262ERR2814647MNREAD1 Taxonomy ID:2563182.6483.854
8ERR2985263ERR2814649ZRREAD2 Taxonomy ID:2563183.5753.899
9ERR2985264ERR2814646FDREAD2 Taxonomy ID:2563183.1933.451
10ERR2985265ERR2814658ADREAD2 Taxonomy ID:2563182.9093.303
11ERR2985266ERR2814650ZRREAD1 Taxonomy ID:2563183.3893.849
12ERR2985267ERR2814648WTREAD2 Taxonomy ID:2563182.8073.532
13ERR2985268ERR2814645TGREAD1 Taxonomy ID:2563183.2414.053
14ERR2985269ERR2814656TUREAD2 Taxonomy ID:2563182.9303.616
15ERR2985270ERR2814659AHREAD2 Taxonomy ID:2563184.4014.149
16ERR2985271ERR2814657TEREAD2 Taxonomy ID:2563182.9013.758
17ERR2985272ERR2814666BEREAD1 Taxonomy ID:2563184.1254.398
18ERR2985273ERR2814665BEREAD2 Taxonomy ID:2563184.0774.395
19ERR2985274ERR2814662WTREAD1 Taxonomy ID:2563183.1093.513
20ERR2985275ERR2814653FRREAD2 Taxonomy ID:2563183.0003.595
21ERR2985276ERR2814668EMREAD1 Taxonomy ID:2563182.7533.547
22ERR2985277ERR2814661TUREAD1 Taxonomy ID:2563182.8773.513
23ERR2985278ERR2814654FRREAD1 Taxonomy ID:2563182.7343.310
\n", - "
" - ], - "text/plain": [ - " run_id_enf run_id_inf sam_source sam_read \\\n", - "0 ERR2985255 ERR2814663 TG READ2 Taxonomy ID:256318 \n", - "1 ERR2985256 ERR2814664 MN READ2 Taxonomy ID:256318 \n", - "2 ERR2985257 ERR2814651 AH READ1 Taxonomy ID:256318 \n", - "3 ERR2985258 ERR2814667 TE READ1 Taxonomy ID:256318 \n", - "4 ERR2985259 ERR2814660 FD READ1 Taxonomy ID:256318 \n", - "5 ERR2985260 ERR2814652 AD READ1 Taxonomy ID:256318 \n", - "6 ERR2985261 ERR2814655 EM READ2 Taxonomy ID:256318 \n", - "7 ERR2985262 ERR2814647 MN READ1 Taxonomy ID:256318 \n", - "8 ERR2985263 ERR2814649 ZR READ2 Taxonomy ID:256318 \n", - "9 ERR2985264 ERR2814646 FD READ2 Taxonomy ID:256318 \n", - "10 ERR2985265 ERR2814658 AD READ2 Taxonomy ID:256318 \n", - "11 ERR2985266 ERR2814650 ZR READ1 Taxonomy ID:256318 \n", - "12 ERR2985267 ERR2814648 WT READ2 Taxonomy ID:256318 \n", - "13 ERR2985268 ERR2814645 TG READ1 Taxonomy ID:256318 \n", - "14 ERR2985269 ERR2814656 TU READ2 Taxonomy ID:256318 \n", - "15 ERR2985270 ERR2814659 AH READ2 Taxonomy ID:256318 \n", - "16 ERR2985271 ERR2814657 TE READ2 Taxonomy ID:256318 \n", - "17 ERR2985272 ERR2814666 BE READ1 Taxonomy ID:256318 \n", - "18 ERR2985273 ERR2814665 BE READ2 Taxonomy ID:256318 \n", - "19 ERR2985274 ERR2814662 WT READ1 Taxonomy ID:256318 \n", - "20 ERR2985275 ERR2814653 FR READ2 Taxonomy ID:256318 \n", - "21 ERR2985276 ERR2814668 EM READ1 Taxonomy ID:256318 \n", - "22 ERR2985277 ERR2814661 TU READ1 Taxonomy ID:256318 \n", - "23 ERR2985278 ERR2814654 FR READ1 Taxonomy ID:256318 \n", - "\n", - " abundance_enf abundance_inf \n", - "0 3.257 4.227 \n", - "1 2.573 3.847 \n", - "2 4.299 4.087 \n", - "3 2.759 3.437 \n", - "4 3.365 3.487 \n", - "5 3.123 3.196 \n", - "6 2.811 3.680 \n", - "7 2.648 3.854 \n", - "8 3.575 3.899 \n", - "9 3.193 3.451 \n", - "10 2.909 3.303 \n", - "11 3.389 3.849 \n", - "12 2.807 3.532 \n", - "13 3.241 4.053 \n", - "14 2.930 3.616 \n", - "15 4.401 4.149 \n", - "16 2.901 3.758 \n", - "17 4.125 4.398 \n", - "18 4.077 4.395 \n", - "19 3.109 3.513 \n", - "20 3.000 3.595 \n", - "21 2.753 3.547 \n", - "22 2.877 3.513 \n", - "23 2.734 3.310 " - ] - }, - "execution_count": 136, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# pairing \n", "df_data = enf_all.merge(\n", @@ -739,7 +430,7 @@ " on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n", ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n", "\n", - "df_data" + "df_data.to_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv', index=False)" ] }, { diff --git a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv new file mode 100644 index 0000000..07477c5 --- /dev/null +++ b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv @@ -0,0 +1,25 @@ +run_id_enf,run_id_inf,sam_source,sam_read,abundance_enf,abundance_inf +ERR2985255,ERR2814663,TG,READ2 Taxonomy ID:256318,3.2572833167019404,4.226818522502555 +ERR2985256,ERR2814664,MN,READ2 Taxonomy ID:256318,2.5728406558389647,3.8471905353288207 +ERR2985257,ERR2814651,AH,READ1 Taxonomy ID:256318,4.298777376040054,4.08684074171776 +ERR2985258,ERR2814667,TE,READ1 Taxonomy ID:256318,2.758981522176203,3.4367515193484355 +ERR2985259,ERR2814660,FD,READ1 Taxonomy ID:256318,3.3646753829076825,3.4866734935494126 +ERR2985260,ERR2814652,AD,READ1 Taxonomy ID:256318,3.122830873754367,3.1959892495165043 +ERR2985261,ERR2814655,EM,READ2 Taxonomy ID:256318,2.8111912227938394,3.6796595362263957 +ERR2985262,ERR2814647,MN,READ1 Taxonomy ID:256318,2.64787572226993,3.854349549040254 +ERR2985263,ERR2814649,ZR,READ2 Taxonomy ID:256318,3.5745130928438336,3.898617360802433 +ERR2985264,ERR2814646,FD,READ2 Taxonomy ID:256318,3.1932134470444886,3.4511721120494254 +ERR2985265,ERR2814658,AD,READ2 Taxonomy ID:256318,2.9090652423743357,3.3030032029809693 +ERR2985266,ERR2814650,ZR,READ1 Taxonomy ID:256318,3.3891206874721362,3.84895839035568 +ERR2985267,ERR2814648,WT,READ2 Taxonomy ID:256318,2.8069435917365073,3.5322169927895697 +ERR2985268,ERR2814645,TG,READ1 Taxonomy ID:256318,3.241378979031497,4.052892167654379 +ERR2985269,ERR2814656,TU,READ2 Taxonomy ID:256318,2.929680670135557,3.616255621958018 +ERR2985270,ERR2814659,AH,READ2 Taxonomy ID:256318,4.400504997765587,4.149306529774245 +ERR2985271,ERR2814657,TE,READ2 Taxonomy ID:256318,2.901002904935801,3.75827367299984 +ERR2985272,ERR2814666,BE,READ1 Taxonomy ID:256318,4.125262599471004,4.398085076927169 +ERR2985273,ERR2814665,BE,READ2 Taxonomy ID:256318,4.076901525292074,4.39475485971623 +ERR2985274,ERR2814662,WT,READ1 Taxonomy ID:256318,3.1091661937084183,3.5130581102452503 +ERR2985275,ERR2814653,FR,READ2 Taxonomy ID:256318,3.0000096382997192,3.594857935168328 +ERR2985276,ERR2814668,EM,READ1 Taxonomy ID:256318,2.752778722206287,3.547368740393445 +ERR2985277,ERR2814661,TU,READ1 Taxonomy ID:256318,2.8771702480026953,3.5132674150221757 +ERR2985278,ERR2814654,FR,READ1 Taxonomy ID:256318,2.7336315345563857,3.310422165293325 diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/microbiome/internal_functions.py new file mode 100644 index 0000000..1afe23c --- /dev/null +++ b/src/acore/microbiome/internal_functions.py @@ -0,0 +1,54 @@ +# preprocessing +from sklearn.preprocessing import FunctionTransformer +import numpy as np +from sklearn.pipeline import make_pipeline +import pandas as pd + +def calc_clr(x): + """ + Calculate the centered log-ratio (CLR) transformation. + + Parameters + ---------- + x : array-like + Input data to transform. + + Returns + ------- + array-like + CLR transformed data. + """ + # replace zeros with small + new_x = np.where(x > 0, x, 1e-10) + return np.log(new_x) - np.mean(np.log(new_x), axis=0) + + +def coda_clr(df: pd.DataFrame) -> pd.DataFrame: + """ + Apply CoDA and CLR transformations to the numeric columns of a DataFrame. + Non-numeric columns are retained without transformation. + + Parameters + ---------- + df : pd.DataFrame + Input DataFrame with numeric and non-numeric columns. + + Returns + ------- + pd.DataFrame + DataFrame with transformed numeric columns and original non-numeric columns. + """ + # Define the CoDA and CLR transformers + coda = FunctionTransformer( + lambda x: x / x.sum(axis=0), + feature_names_out="one-to-one" + ).set_output(transform="pandas") + clr = FunctionTransformer( + calc_clr, + feature_names_out="one-to-one" + ).set_output(transform="pandas") + # Create a pipeline with CoDA and CLR transformations + pipe = make_pipeline(coda, clr).set_output(transform="pandas") + # Apply the pipeline to numeric columns and concatenate with non-numeric columns + transformed = pipe.fit_transform(df.select_dtypes(include='number')) + return pd.concat([df.select_dtypes(include='object'), transformed], axis=1) \ No newline at end of file From 8ebab69a76582941aa4929777b1dff2051cded95 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 14 Oct 2025 17:06:28 +0200 Subject: [PATCH 12/25] :memo:docs: Update API examples for permutation testing - saved processed dataset to example_data - removed data cleaning code from notebook - added descriptions and permutation explanation in markdown --- docs/api_examples/permutation_testing.ipynb | 471 +++++------------- docs/index.rst | 1 + .../Ju2018_GO0017001_enf_inf_paired.csv | 2 +- 3 files changed, 113 insertions(+), 361 deletions(-) diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index 03f82dd..de1b503 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -9,37 +9,56 @@ "\n", "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n", "\n", - "The samples were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n" + "The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n", + "\n", + "For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. \n" ] }, { "cell_type": "markdown", - "id": "79d8b537", - "metadata": { - "vscode": { - "languageId": "markdown" - } - }, + "id": "dc9f17b2", + "metadata": {}, "source": [ - "## Downloading the dataset\n", + "## Data preparation details\n", + "\n", + "### Downloading\n", + "The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing.\n", + "\n", + "### Preprocessing of abundances\n", + "- To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. \n", + "- The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do\n", "\n", - "The analysed samples can be downloaded via the Mgnify API." + "### Preprocessing of the metadata \n", + "- the sample metadata needed for this demo (sampling location) were available in their \"sample-desc\" \n", + "- the sample-desc for each sample in both INF and EFF were parsed and used for pairing off\n", + "\n", + "### Subset of data for demo\n", + "- For this demo we only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n", + "- It's expected that antibiotic catabolic processes to be higher in INF vs EFF\n", + "\n", + "### Saving the demo dataset\n", + "This example subset of data was saved to a CSV, ./example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv. The data dictionary is below:\n", + "\n", + "| column | description | dtype |\n", + "|-------------------|-------------------------------------------------------------------------------------------------------------------|-------|\n", + "| eff_id | The run id for the mgnify analysis of the effluent sample. | str |\n", + "| inf_id | The run id for the mgnify analysis of the influent sample. | str |\n", + "| sampling_location | [The ISO 3166-1 alpha-2 code](http://iso.org/obp/ui/#iso:pub:PUB500001:en) for the country where the sample was from. | str |\n", + "| sampling_read | Replicates? | str |\n", + "| eff_abundance | The relative abundance of the GO term for a given effluent sample following preprocessing (i.e., CoDA and CLR) | float |\n", + "| inf_abundance | The relative abundance of the GO term for a given influent sample following preprocessing (i.e., CoDA and CLR) | float |\n", + "\n", + "-----\n", + "\n", + "We will now proceed with reading in the prepared dataset. " ] }, { "cell_type": "code", "execution_count": 1, - "id": "8506ee79", + "id": "00d8f038", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv\n", - "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv\n" - ] - }, { "data": { "text/html": [ @@ -61,185 +80,78 @@ " \n", " \n", " \n", - " GO\n", - " description\n", - " category\n", - " ERR2985255\n", - " ERR2985256\n", - " ERR2985257\n", - " ERR2985258\n", - " ERR2985259\n", - " ERR2985260\n", - " ERR2985261\n", - " ...\n", - " ERR2985269\n", - " ERR2985270\n", - " ERR2985271\n", - " ERR2985272\n", - " ERR2985273\n", - " ERR2985274\n", - " ERR2985275\n", - " ERR2985276\n", - " ERR2985277\n", - " ERR2985278\n", + " eff_id\n", + " inf_id\n", + " sampling_location\n", + " sampling_read\n", + " eff_abundance\n", + " inf_abundance\n", " \n", " \n", " \n", " \n", " 0\n", - " GO:0000001\n", - " mitochondrion inheritance\n", - " biological process\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 3\n", - " 0\n", - " ...\n", - " 1\n", - " 0\n", - " 1\n", - " 1\n", - " 1\n", - " 2\n", - " 0\n", - " 0\n", - " 2\n", - " 2\n", + " ERR2985255\n", + " ERR2814663\n", + " TG\n", + " READ2 Taxonomy ID:256318\n", + " 3.257283\n", + " 4.226819\n", " \n", " \n", " 1\n", - " GO:0000002\n", - " mitochondrial genome maintenance\n", - " biological process\n", - " 0\n", - " 9\n", - " 7\n", - " 0\n", - " 2\n", - " 7\n", - " 4\n", - " ...\n", - " 15\n", - " 10\n", - " 0\n", - " 0\n", - " 0\n", - " 5\n", - " 0\n", - " 4\n", - " 11\n", - " 1\n", + " ERR2985256\n", + " ERR2814664\n", + " MN\n", + " READ2 Taxonomy ID:256318\n", + " 2.572841\n", + " 3.847191\n", " \n", " \n", " 2\n", - " GO:0000012\n", - " single strand break repair\n", - " biological process\n", - " 0\n", - " 3\n", - " 1\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 2\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", + " ERR2985257\n", + " ERR2814651\n", + " AH\n", + " READ1 Taxonomy ID:256318\n", + " 4.298777\n", + " 4.086841\n", " \n", " \n", " 3\n", - " GO:0000015\n", - " phosphopyruvate hydratase complex\n", - " cellular component\n", - " 477\n", - " 384\n", - " 500\n", - " 188\n", - " 226\n", - " 232\n", - " 301\n", - " ...\n", - " 203\n", - " 432\n", - " 152\n", - " 341\n", - " 319\n", - " 520\n", - " 290\n", - " 349\n", - " 270\n", - " 292\n", + " ERR2985258\n", + " ERR2814667\n", + " TE\n", + " READ1 Taxonomy ID:256318\n", + " 2.758982\n", + " 3.436752\n", " \n", " \n", " 4\n", - " GO:0000030\n", - " mannosyltransferase activity\n", - " molecular function\n", - " 189\n", - " 200\n", - " 57\n", - " 67\n", - " 39\n", - " 129\n", - " 181\n", - " ...\n", - " 70\n", - " 42\n", - " 58\n", - " 73\n", - " 92\n", - " 159\n", - " 67\n", - " 179\n", - " 58\n", - " 91\n", + " ERR2985259\n", + " ERR2814660\n", + " FD\n", + " READ1 Taxonomy ID:256318\n", + " 3.364675\n", + " 3.486673\n", " \n", " \n", "\n", - "

5 rows × 27 columns

\n", "" ], "text/plain": [ - " GO description category \\\n", - "0 GO:0000001 mitochondrion inheritance biological process \n", - "1 GO:0000002 mitochondrial genome maintenance biological process \n", - "2 GO:0000012 single strand break repair biological process \n", - "3 GO:0000015 phosphopyruvate hydratase complex cellular component \n", - "4 GO:0000030 mannosyltransferase activity molecular function \n", - "\n", - " ERR2985255 ERR2985256 ERR2985257 ERR2985258 ERR2985259 ERR2985260 \\\n", - "0 0 0 0 0 0 3 \n", - "1 0 9 7 0 2 7 \n", - "2 0 3 1 1 0 0 \n", - "3 477 384 500 188 226 232 \n", - "4 189 200 57 67 39 129 \n", - "\n", - " ERR2985261 ... ERR2985269 ERR2985270 ERR2985271 ERR2985272 \\\n", - "0 0 ... 1 0 1 1 \n", - "1 4 ... 15 10 0 0 \n", - "2 0 ... 0 0 1 0 \n", - "3 301 ... 203 432 152 341 \n", - "4 181 ... 70 42 58 73 \n", + " eff_id inf_id sampling_location sampling_read \\\n", + "0 ERR2985255 ERR2814663 TG READ2 Taxonomy ID:256318 \n", + "1 ERR2985256 ERR2814664 MN READ2 Taxonomy ID:256318 \n", + "2 ERR2985257 ERR2814651 AH READ1 Taxonomy ID:256318 \n", + "3 ERR2985258 ERR2814667 TE READ1 Taxonomy ID:256318 \n", + "4 ERR2985259 ERR2814660 FD READ1 Taxonomy ID:256318 \n", "\n", - " ERR2985273 ERR2985274 ERR2985275 ERR2985276 ERR2985277 ERR2985278 \n", - "0 1 2 0 0 2 2 \n", - "1 0 5 0 4 11 1 \n", - "2 0 2 1 2 1 1 \n", - "3 319 520 290 349 270 292 \n", - "4 92 159 67 179 58 91 \n", - "\n", - "[5 rows x 27 columns]" + " eff_abundance inf_abundance \n", + "0 3.257283 4.226819 \n", + "1 2.572841 3.847191 \n", + "2 4.298777 4.086841 \n", + "3 2.758982 3.436752 \n", + "4 3.364675 3.486673 " ] }, "execution_count": 1, @@ -248,208 +160,44 @@ } ], "source": [ - "import requests \n", - "from io import StringIO\n", - "import pandas as pd\n", - "\n", - "enf_accession = \"MGYS00005058\"\n", - "inf_accession = \"MGYS00005056\"\n", - "\n", - "enf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{enf_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n", - "inf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{inf_accession}/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n", + "import pandas as pd \n", "\n", - "# init dict to hold dataframes\n", - "data = {}\n", - "for url in [enf_url, inf_url]:\n", - " print(f\"Processing {url}\")\n", - " # download\n", - " response = requests.get(url)\n", - " if response.status_code == 200:\n", - " # treat as file\n", - " file_like = StringIO(response.content.decode('utf-8'))\n", - " # read into pd df\n", - " df = pd.read_csv(file_like, sep='\\t')\n", - " # to the dict\n", - " data[url] = df\n", - " else: \n", - " print(response)\n", - " print(url, \" download skipped\")\n", - "\n", - "# sanity check\n", - "data[enf_url].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "84364bf5", - "metadata": {}, - "outputs": [], - "source": [ - "# preprocessing\n", - "from sklearn.preprocessing import FunctionTransformer\n", - "import numpy as np\n", - "from sklearn.pipeline import make_pipeline\n", - "\n", - "coda = FunctionTransformer(lambda x: x / x.sum(axis=0), feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n", - "\n", - "def calc_clr(x):\n", - " # replace zeros with small \n", - " new_x = np.where(x > 0, x, 1e-10)\n", - " return np.log(new_x) - np.mean(np.log(new_x), axis=0)\n", - "\n", - "clr = FunctionTransformer(calc_clr, feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n", - "\n", - "def coda_clr(df):\n", - " pipe = make_pipeline(coda, clr).set_output(transform=\"pandas\")\n", - " transformed = pipe.fit_transform(df.select_dtypes(include='number'))\n", - " return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)\n", - "\n", - "clr_enf = coda_clr(data[enf_url])\n", - "clr_inf = coda_clr(data[inf_url])" + "df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')\n", + "# sanity check \n", + "df_data.head()" ] }, { "cell_type": "markdown", - "id": "a9fa25c4", + "id": "7cf6c3d0", "metadata": {}, "source": [ - "For this demo we will only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n", + "## The permutation test\n", "\n", - "It's expected that antibiotic catabolic processes to be higher in influent vs efluent" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "0259a24f", - "metadata": {}, - "outputs": [], - "source": [ - "term = 'GO:0017001'\n", - "\n", - "inf_go = clr_inf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n", - "enf_go = clr_enf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n", - "\n", - "inf_go.columns = ['sample', 'abundance']\n", - "enf_go.columns = ['sample', 'abundance']" - ] - }, - { - "cell_type": "markdown", - "id": "13469e23", - "metadata": {}, - "source": [ - "The samples are paired. We need the metadata for the samples which we can also download using the MGnify API." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a7eb5848", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing MGYS00005058\n", - "Processing MGYS00005056\n" - ] - } - ], - "source": [ - "# getting sampling and analysis metadata\n", - "metadata = {}\n", - "for accession in [enf_accession, inf_accession]:\n", - " # prep urls\n", - " url_sam = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{accession}/samples'\n", - " url_ana = f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={accession}\"\n", + "Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. \n", "\n", - " print(f\"Processing {accession}\")\n", - " # download\n", - " sam_response = requests.get(url_sam)\n", - " if sam_response.status_code == 200:\n", - " # json\n", - " df_sam = pd.DataFrame(sam_response.json()['data'])\n", - " # extract needed metadata\n", - " df_sam['desc'] = df_sam['attributes'].apply(lambda x: x['sample-desc'])\n", - " df_sam['sam_source'] = df_sam['desc'].str.split('.', expand=True)[0]\n", - " df_sam['sam_read'] = df_sam['desc'].str.split('.', expand=True)[2]\n", - " else: \n", - " raise Exception(f\"Failed to download {url_sam}\")\n", - " print(sam_response)\n", - " ana_response = requests.get(url_ana)\n", - " if ana_response.status_code == 200:\n", - " # json\n", - " df_ana = pd.DataFrame(ana_response.json()['data'])\n", - " # extract needed metadata\n", - " df_ana['run_id'] = df_ana['relationships'].apply(lambda x: x['run']['data']['id'])\n", - " df_ana['sample_id'] = df_ana['relationships'].apply(lambda x: x['sample']['data']['id'])\n", - " else: \n", - " raise Exception(f\"Failed to download {url_ana}\")\n", - " print(ana_response)\n", - " # merge\n", - " metadata[accession] = df_ana.merge(df_sam[['id', 'desc', 'sam_source', 'sam_read']], left_on='sample_id', right_on='id', how='left')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "487cd4da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((24, 13), (24, 13))" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "enf_all = metadata[enf_accession].merge(enf_go, left_on='run_id', right_on='sample', how='inner')\n", - "inf_all = metadata[inf_accession].merge(inf_go, left_on='run_id', right_on='sample', how='inner')\n", - "# sanity check\n", - "enf_all.shape, inf_all.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66056854", - "metadata": {}, - "outputs": [], - "source": [ - "# pairing \n", - "df_data = enf_all.merge(\n", - " inf_all,\n", - " on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n", - ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n", + "The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. \n", "\n", - "df_data.to_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv', index=False)" + "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. " ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 2, "id": "5eefb720", "metadata": {}, "outputs": [], "source": [ "from acore.permutation_test import paired_permutation\n", "\n", - "# optional for reproducibility\n", + "# optional choice of random number generatorfor repro\n", "import numpy as np\n", "rng = np.random.default_rng(12345)" ] }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 3, "id": "9eabb911", "metadata": {}, "outputs": [ @@ -457,32 +205,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'metric': , 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n", - "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n", - "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n" + "{'metric': , 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n" ] } ], "source": [ + "# trying diff metrics to demo functionality also\n", "for metric in ['t-statistic', 'mean', np.mean]:\n", " result = paired_permutation(\n", - " df_data['abundance_inf'].to_numpy(),\n", - " df_data['abundance_enf'].to_numpy(), \n", + " df_data['inf_abundance'].to_numpy(),\n", + " df_data['eff_abundance'].to_numpy(), \n", " metric=metric, \n", " n_permutations=10000, \n", " rng=rng\n", " )\n", - "\n", + " # verbosity\n", " print(result)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "9da7d601", + "cell_type": "markdown", + "id": "1ad22530", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## Result\n", + "\n", + "Based on the permutation tests by test statistic and mean difference, the probability of the observed metrics (t=6.739 and mean diff=0.535) occurring at random would be <0.00001." + ] } ], "metadata": { diff --git a/docs/index.rst b/docs/index.rst index 35f5751..af0844f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,6 +19,7 @@ Acore documentation api_examples/normalization_analysis api_examples/ANCOVA_analysis api_examples/enrichment_analysis + api_examples/permutation_testing .. toctree:: :maxdepth: 1 diff --git a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv index 07477c5..ca3c6b6 100644 --- a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv +++ b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv @@ -1,4 +1,4 @@ -run_id_enf,run_id_inf,sam_source,sam_read,abundance_enf,abundance_inf +eff_id,inf_id,sampling_location,sampling_read,eff_abundance,inf_abundance ERR2985255,ERR2814663,TG,READ2 Taxonomy ID:256318,3.2572833167019404,4.226818522502555 ERR2985256,ERR2814664,MN,READ2 Taxonomy ID:256318,2.5728406558389647,3.8471905353288207 ERR2985257,ERR2814651,AH,READ1 Taxonomy ID:256318,4.298777376040054,4.08684074171776 From b6d049cab85b3d68053968c1847e0a408289dab9 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 14 Oct 2025 17:13:32 +0200 Subject: [PATCH 13/25] jupytext of the permutation testing demo notebook --- docs/api_examples/permutation_testing.py | 98 ++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 docs/api_examples/permutation_testing.py diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py new file mode 100644 index 0000000..d2bac7f --- /dev/null +++ b/docs/api_examples/permutation_testing.py @@ -0,0 +1,98 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.17.3 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Tutorial: Permutation Testing using `acore` +# +# In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8). +# +# The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058). +# +# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. +# + +# %% [markdown] +# ## Data preparation details +# +# ### Downloading +# The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing. +# +# ### Preprocessing of abundances +# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. +# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do +# +# ### Preprocessing of the metadata +# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" +# - the sample-desc for each sample in both INF and EFF were parsed and used for pairing off +# +# ### Subset of data for demo +# - For this demo we only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001) +# - It's expected that antibiotic catabolic processes to be higher in INF vs EFF +# +# ### Saving the demo dataset +# This example subset of data was saved to a CSV, ./example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv. The data dictionary is below: +# +# | column | description | dtype | +# |-------------------|-------------------------------------------------------------------------------------------------------------------|-------| +# | eff_id | The run id for the mgnify analysis of the effluent sample. | str | +# | inf_id | The run id for the mgnify analysis of the influent sample. | str | +# | sampling_location | [The ISO 3166-1 alpha-2 code](http://iso.org/obp/ui/#iso:pub:PUB500001:en) for the country where the sample was from. | str | +# | sampling_read | Replicates? | str | +# | eff_abundance | The relative abundance of the GO term for a given effluent sample following preprocessing (i.e., CoDA and CLR) | float | +# | inf_abundance | The relative abundance of the GO term for a given influent sample following preprocessing (i.e., CoDA and CLR) | float | +# +# ----- +# +# We will now proceed with reading in the prepared dataset. + +# %% +import pandas as pd + +df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv') +# sanity check +df_data.head() + +# %% [markdown] +# ## The permutation test +# +# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. +# +# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. +# +# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. + +# %% +from acore.permutation_test import paired_permutation + +# optional choice of random number generatorfor repro +import numpy as np +rng = np.random.default_rng(12345) + +# %% +# trying diff metrics to demo functionality also +for metric in ['t-statistic', 'mean', np.mean]: + result = paired_permutation( + df_data['inf_abundance'].to_numpy(), + df_data['eff_abundance'].to_numpy(), + metric=metric, + n_permutations=10000, + rng=rng + ) + # verbosity + print(result) + +# %% [markdown] +# ## Result +# +# Based on the permutation tests by test statistic and mean difference, the probability of the observed metrics (t=6.739 and mean diff=0.535) occurring at random would be <0.00001. From 74255e360d8ea12d0b276e5f50960d77d9b39e57 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Tue, 14 Oct 2025 17:18:22 +0200 Subject: [PATCH 14/25] :lipstick:style: lint and reformatd code --- src/acore/permutation_test/__init__.py | 53 ++++--------- .../permutation_test/internal_functions.py | 15 ++-- src/acore/permutation_test/jaccard.py | 77 ++++++++++--------- 3 files changed, 64 insertions(+), 81 deletions(-) diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index 2a3cb9b..dc8e4a8 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -1,19 +1,14 @@ import numpy as np -import pandas as pd from scipy.stats import ( - chi2_contingency, + chi2_contingency, ttest_rel, ttest_ind, f_oneway, ) +from .internal_functions import _permute, _contingency_table, _check_degeneracy import warnings -warnings.simplefilter("always", UserWarning) -from .internal_functions import ( - _permute, - _contingency_table, - _check_degeneracy -) +warnings.simplefilter("always", UserWarning) def paired_permutation( @@ -22,7 +17,7 @@ def paired_permutation( metric: str = "t-statistic", n_permutations: int = 10000, rng: np.random.Generator = np.random.default_rng(seed=12345), - **kwargs + **kwargs, ) -> dict: """ Perform a permutation test for paired samples. @@ -49,14 +44,14 @@ def paired_permutation( - 'metric': Metric function used. - 'observed': Observed metric value. - 'p_value': Permutation test p-value (np.nan if degenerate). - """ # Validate input + """ # Validate input if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray): raise TypeError("Input must be numpy arrays.") if cond1.shape != cond2.shape: raise ValueError("Input arrays must have the same shape.") # paired differences - diff = cond1-cond2 + diff = cond1 - cond2 args = [diff] # compute observed metric @@ -87,11 +82,13 @@ def paired_permutation( for _ in range(n_permutations): # randomly flip direction of differences permuted_diff = diff * rng.choice([-1, 1], size=diff.shape) - new_cond1 = np.where((permuted_diff==diff), cond1, cond2) - new_cond2 = np.where((permuted_diff==diff), cond2, cond1) + new_cond1 = np.where((permuted_diff == diff), cond1, cond2) + new_cond2 = np.where((permuted_diff == diff), cond2, cond1) # postcondition check if not (permuted_diff == (new_cond1 - new_cond2)).all(): - raise ArithmeticError("Postcondition failed: Issue with permuted differences") + raise ArithmeticError( + "Postcondition failed: Issue with permuted differences" + ) # prep args if metric == "t-statistic": new_args = [new_cond1, new_cond2] @@ -119,17 +116,13 @@ def paired_permutation( # Compute p-value p_value = np.mean(permuted_f >= abs_met) - return { - "metric": calculator, - "observed": observed_metric, - "p_value": p_value - } + return {"metric": calculator, "observed": observed_metric, "p_value": p_value} def chi2_permutation( *groups, n_permutations: int = 10000, - rng: np.random.Generator = np.random.default_rng(seed=12345) + rng: np.random.Generator = np.random.default_rng(seed=12345), ) -> dict: """ Perform a permutation test for categorical data using the chi-squared statistic. @@ -169,10 +162,7 @@ def chi2_permutation( # Compute p-value p_value = np.mean(permuted_chi2 >= observed_chi2) - return { - "observed": observed_test, - "p_value": p_value - } + return {"observed": observed_test, "p_value": p_value} def indep_permutation( @@ -181,7 +171,7 @@ def indep_permutation( metric: str = "t-statistic", n_permutations: int = 10000, rng: np.random.Generator = np.random.default_rng(seed=12345), - **kwargs + **kwargs, ) -> dict: """ Perform a permutation test for independent samples. @@ -236,7 +226,7 @@ def indep_permutation( ) # compute observed metric - if stat: + if stat: observed_metric = calculator(group1, group2, **kwargs) else: observed_metric = abs(calculator(group1) - calculator(group2)) @@ -258,13 +248,4 @@ def indep_permutation( # Compute p-value p_value = np.mean(permuted_f >= abs_met) - return { - "metric": calculator, - "observed": observed_metric, - "p_value": p_value - } - - - - - + return {"metric": calculator, "observed": observed_metric, "p_value": p_value} diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py index 58b8f46..80fdddd 100644 --- a/src/acore/permutation_test/internal_functions.py +++ b/src/acore/permutation_test/internal_functions.py @@ -1,6 +1,7 @@ -import numpy as np +import numpy as np import pandas as pd + def _permute(*groups, rng=np.random.default_rng(seed=12345)): """ Perform a single permutation of the groups. @@ -31,7 +32,7 @@ def _permute(*groups, rng=np.random.default_rng(seed=12345)): return new_groups -def _contingency_table(*groups, to_np:bool=True)-> np.array: +def _contingency_table(*groups, to_np: bool = True) -> np.array: """ Create a contingency table from the provided groups. @@ -52,12 +53,13 @@ def _contingency_table(*groups, to_np:bool=True)-> np.array: if not isinstance(group, (list, pd.Series, np.ndarray)): raise TypeError("Each group must be a list, pandas Series, or numpy array.") - # MAIN FUNCTION # Create contingency table - table = pd.concat( - [pd.Series(group).value_counts() for group in groups], axis=1 - ).fillna(0).T + table = ( + pd.concat([pd.Series(group).value_counts() for group in groups], axis=1) + .fillna(0) + .T + ) if to_np: return table.to_numpy() @@ -80,4 +82,3 @@ def _check_degeneracy(array: np.ndarray) -> bool: True if the conditions are degenerate, False otherwise. """ return np.all(array == array[0]) - diff --git a/src/acore/permutation_test/jaccard.py b/src/acore/permutation_test/jaccard.py index 08e52ff..f8ad37c 100644 --- a/src/acore/permutation_test/jaccard.py +++ b/src/acore/permutation_test/jaccard.py @@ -1,8 +1,9 @@ -import numpy as np +import numpy as np import itertools from collections.abc import Iterable -def jaccard_similarity(set1:set, set2:set) -> float: + +def jaccard_similarity(set1: set, set2: set) -> float: """ Compute the Jaccard similarity between two sets. @@ -15,9 +16,9 @@ def jaccard_similarity(set1:set, set2:set) -> float: Returns ------- float - Jaccard similarity coefficient, - which is the size of the intersection divided by - the size of the union of the two sets. + Jaccard similarity coefficient, + which is the size of the intersection divided by + the size of the union of the two sets. Example ------- @@ -29,31 +30,31 @@ def jaccard_similarity(set1:set, set2:set) -> float: intersection = len(set1 & set2) union = len(set1 | set2) - if union !=0: + if union != 0: return intersection / union else: return 0 -def avg_jaccard(group: Iterable)-> tuple: +def avg_jaccard(group: Iterable) -> tuple: """ - Compute the average pairwise Jaccard similarity within a group of graphs. + Compute the average pairwise Jaccard similarity within a group of graphs. - Parameters - ---------- - group : Iterable - An iterator of iterable objects, such as a list of sets. + Parameters + ---------- + group : Iterable + An iterator of iterable objects, such as a list of sets. - Returns - ------- - tuple - A tuple containing the average Jaccard similarity and its standard deviation. - - Example - ------- - >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}] - >>> avg_jaccard(group) - (0.3333333333333333, 0.0) + Returns + ------- + tuple + A tuple containing the average Jaccard similarity and its standard deviation. + + Example + ------- + >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}] + >>> avg_jaccard(group) + (0.3333333333333333, 0.0) """ ## PRECONDITIONS @@ -62,12 +63,13 @@ def avg_jaccard(group: Iterable)-> tuple: if len(group) > 0: for g in group: if not isinstance(g, Iterable): - raise TypeError("Each element in the group must be an iterable (set, list, or tuple).") + raise TypeError( + "Each element in the group must be an iterable (set, list, or tuple)." + ) elif len(group) == 0: raise ValueError("Input group is empty. Cannot compute Jaccard similarity.") - - # init the list + # init the list scores = [] for g1, g2 in itertools.combinations(group, 2): @@ -100,42 +102,41 @@ def btwn_jaccard(group1: Iterable, group2: Iterable) -> tuple: ------- tuple A tuple containing the average Jaccard similarity and its standard deviation. - + Example ------- >>> group1 = [{1, 2, 3}, {2, 3, 4}] >>> group2 = [{3, 4, 5}, {4, 5, 6}] >>> btwn_jaccard(group1, group2) (0.3333333333333333, 0.0) - + """ - + if not isinstance(group1, Iterable) or not isinstance(group2, Iterable): raise TypeError("Both inputs must be iterables of iterables.") if len(group1) == 0 or len(group2) == 0: raise ValueError("Both input groups must be non-empty.") - + # init the list scores = [] - + for g1, g2 in itertools.product(group1, group2): - # check if not isinstance(g1, Iterable) or not isinstance(g2, Iterable): - raise TypeError("Each element in the groups must be an iterable (set, list, or tuple).") - - # get sets + raise TypeError( + "Each element in the groups must be an iterable (set, list, or tuple)." + ) + + # get sets set1 = set(g1) set2 = set(g2) # compute Jaccard similarity scores.append(jaccard_similarity(set1, set2)) - + if scores: # if scores is not empty, return the average return np.mean(scores), np.std(scores) # if scores is empty (less than 2 in group), return 0 else: - return 0, 0 - - + return 0, 0 From 23a79ff3c4bb4c5b5632f78a2bad3a3b4ca66418 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Fri, 24 Oct 2025 13:35:51 +0200 Subject: [PATCH 15/25] :bug:fix: update PR module link in contributing guide --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 10f7421..9af11a3 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -125,7 +125,7 @@ Before you submit a pull request, check that it meets these guidelines: 3. The pull request should pass the workflows on GitHub. See for example the PR-Template for a module: -`Add module PR template `_. +`Add module PR template `_. From 2194b08d5627fb04068467dc0f47d13b8df8d2f5 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Fri, 24 Oct 2025 14:03:43 +0200 Subject: [PATCH 16/25] :bug: fix: add quotation marks to command --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 9af11a3..842eaae 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -70,7 +70,7 @@ advanced editor to help you with some of the common steps described below, e.g. $ cd acore/ $ python -m venv .env $ source .env/bin/activate - $ pip install -e .[dev] + $ pip install -e ".[dev]" If you work on a Windows shell, see the docs for instructions: `How venvs work `_ From 5b0124e3802b5b5154c1f0d1c3b69d5420ac90d3 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:43:50 +0100 Subject: [PATCH 17/25] :sparkles: feat: add pydantic model for perm test output --- src/acore/types/permutation_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/acore/types/permutation_test.py diff --git a/src/acore/types/permutation_test.py b/src/acore/types/permutation_test.py new file mode 100644 index 0000000..436fc12 --- /dev/null +++ b/src/acore/types/permutation_test.py @@ -0,0 +1,9 @@ +# using pydantic since these are just dictionary outputs no df +from typing import Any, Optional, Callable, Union +from pydantic import BaseModel, Field + +class PermutationResult(BaseModel): + metric: Optional[Union[str, Callable]] = Field(default=None, description="Name of the metric used in the permutation test") + observed: Any = Field(default=None, description="Observed value of the metric") + p_value: float = Field(description="p-value from the permutation test") + From 83bbfda98f6f1a11a89df02fcd022a8597008160 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:44:14 +0100 Subject: [PATCH 18/25] :sparkles: feat: add pydantic model for perm test output --- docs/api_examples/permutation_testing.ipynb | 10 +++--- src/acore/permutation_test/__init__.py | 36 +++++++++++++++++---- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index de1b503..cb92f75 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -162,7 +162,9 @@ "source": [ "import pandas as pd \n", "\n", - "df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')\n", + "df_data = pd.read_csv(\n", + " 'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv'\n", + ")\n", "# sanity check \n", "df_data.head()" ] @@ -205,9 +207,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'metric': , 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n", - "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n", - "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n" + "{'metric': , 'observed': (np.float64(6.7389860601792275), np.float64(7.122287781830209e-07)), 'p_value': 0.0}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': 0.0}\n", + "{'metric': , 'observed': np.float64(0.5350826397500547), 'p_value': 0.0}\n" ] } ], diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index dc8e4a8..0c0bbd7 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -5,11 +5,17 @@ ttest_ind, f_oneway, ) -from .internal_functions import _permute, _contingency_table, _check_degeneracy +from .internal_functions import ( + _permute, + _contingency_table, + _check_degeneracy +) import warnings warnings.simplefilter("always", UserWarning) +from acore.types.permutation_test import PermutationResult + def paired_permutation( cond1: np.ndarray, @@ -107,16 +113,23 @@ def paired_permutation( ) warnings.warn(identical_warn) - return { + val_result = PermutationResult.model_validate({ "metric": calculator, "observed": observed_metric, "p_value": np.nan, - } + }) + else: # Compute p-value p_value = np.mean(permuted_f >= abs_met) - return {"metric": calculator, "observed": observed_metric, "p_value": p_value} + val_result = PermutationResult.model_validate({ + "metric": calculator, + "observed": observed_metric, + "p_value": p_value + }) + + return val_result.model_dump() def chi2_permutation( @@ -162,7 +175,12 @@ def chi2_permutation( # Compute p-value p_value = np.mean(permuted_chi2 >= observed_chi2) - return {"observed": observed_test, "p_value": p_value} + val_result = PermutationResult.model_validate({ + "observed": observed_test, + "p_value": p_value + }) + + return val_result.model_dump(exclude_none=True) def indep_permutation( @@ -248,4 +266,10 @@ def indep_permutation( # Compute p-value p_value = np.mean(permuted_f >= abs_met) - return {"metric": calculator, "observed": observed_metric, "p_value": p_value} + val_result = PermutationResult.model_validate({ + "metric": calculator, + "observed": observed_metric, + "p_value": p_value + }) + + return val_result.model_dump() From e61404904898b5594229d08273128f7f2b2f90f1 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:49:03 +0100 Subject: [PATCH 19/25] :memo: docs: jupytexted the notebook --- docs/api_examples/permutation_testing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py index d2bac7f..f533b66 100644 --- a/docs/api_examples/permutation_testing.py +++ b/docs/api_examples/permutation_testing.py @@ -59,7 +59,9 @@ # %% import pandas as pd -df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv') +df_data = pd.read_csv( + 'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv' +) # sanity check df_data.head() From 6bea62d2b19f7c47b6cfe17b99307e8481bbfa83 Mon Sep 17 00:00:00 2001 From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:53:09 +0100 Subject: [PATCH 20/25] :lipstick: style: lint and format --- docs/api_examples/permutation_testing.py | 35 ++++++++-------- src/acore/decomposition/umap.py | 1 + src/acore/enrichment_analysis/__init__.py | 4 +- src/acore/io/uniprot/uniprot.py | 2 - src/acore/microbiome/internal_functions.py | 19 +++++---- src/acore/multiple_testing/__init__.py | 2 +- src/acore/permutation_test/__init__.py | 47 +++++++++------------- src/acore/types/permutation_test.py | 6 ++- tests/test_permutation_test.py | 37 ++++++++--------- 9 files changed, 71 insertions(+), 82 deletions(-) diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py index f533b66..91f5777 100644 --- a/docs/api_examples/permutation_testing.py +++ b/docs/api_examples/permutation_testing.py @@ -19,7 +19,7 @@ # # The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058). # -# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. +# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. # # %% [markdown] @@ -29,11 +29,11 @@ # The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing. # # ### Preprocessing of abundances -# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. +# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. # - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do # -# ### Preprocessing of the metadata -# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" +# ### Preprocessing of the metadata +# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" # - the sample-desc for each sample in both INF and EFF were parsed and used for pairing off # # ### Subset of data for demo @@ -54,42 +54,43 @@ # # ----- # -# We will now proceed with reading in the prepared dataset. +# We will now proceed with reading in the prepared dataset. # %% -import pandas as pd +import pandas as pd df_data = pd.read_csv( - 'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv' + "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv" ) -# sanity check +# sanity check df_data.head() # %% [markdown] # ## The permutation test # -# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. +# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. # -# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. +# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. # -# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. +# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. # %% from acore.permutation_test import paired_permutation # optional choice of random number generatorfor repro import numpy as np + rng = np.random.default_rng(12345) # %% # trying diff metrics to demo functionality also -for metric in ['t-statistic', 'mean', np.mean]: +for metric in ["t-statistic", "mean", np.mean]: result = paired_permutation( - df_data['inf_abundance'].to_numpy(), - df_data['eff_abundance'].to_numpy(), - metric=metric, - n_permutations=10000, - rng=rng + df_data["inf_abundance"].to_numpy(), + df_data["eff_abundance"].to_numpy(), + metric=metric, + n_permutations=10000, + rng=rng, ) # verbosity print(result) diff --git a/src/acore/decomposition/umap.py b/src/acore/decomposition/umap.py index 2a05dd7..6d5ab21 100644 --- a/src/acore/decomposition/umap.py +++ b/src/acore/decomposition/umap.py @@ -1,4 +1,5 @@ """Run UMAP on DataFrame and return result.""" + import pandas as pd import umap diff --git a/src/acore/enrichment_analysis/__init__.py b/src/acore/enrichment_analysis/__init__.py index b0a09a6..b3ae590 100644 --- a/src/acore/enrichment_analysis/__init__.py +++ b/src/acore/enrichment_analysis/__init__.py @@ -552,13 +552,13 @@ def run_ssgsea( index.index = name df = df.set_index("Name").transpose() - if not annotation_col in annotation: + if annotation_col not in annotation: raise ValueError( f"Missing Annotation Column: {annotation_col}" " as specified by `annotation_col`" ) - if not identifier_col in annotation: + if identifier_col not in annotation: raise ValueError( f"Missing Identifier Column: {identifier_col}" " as specified by `identifier_col`" diff --git a/src/acore/io/uniprot/uniprot.py b/src/acore/io/uniprot/uniprot.py index c9abf05..f359eb1 100644 --- a/src/acore/io/uniprot/uniprot.py +++ b/src/acore/io/uniprot/uniprot.py @@ -177,8 +177,6 @@ def get_id_mapping_results_stream(url): return decode_results(request, file_format, compressed) - - if __name__ == "__main__": # id mapping is used to create a link to a query (you can see the json in the browser) # UniProtKB is the knowleadgebase integrating all kind of other databases diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/microbiome/internal_functions.py index 1afe23c..637bc21 100644 --- a/src/acore/microbiome/internal_functions.py +++ b/src/acore/microbiome/internal_functions.py @@ -4,10 +4,11 @@ from sklearn.pipeline import make_pipeline import pandas as pd + def calc_clr(x): """ Calculate the centered log-ratio (CLR) transformation. - + Parameters ---------- x : array-like @@ -18,7 +19,7 @@ def calc_clr(x): array-like CLR transformed data. """ - # replace zeros with small + # replace zeros with small new_x = np.where(x > 0, x, 1e-10) return np.log(new_x) - np.mean(np.log(new_x), axis=0) @@ -40,15 +41,13 @@ def coda_clr(df: pd.DataFrame) -> pd.DataFrame: """ # Define the CoDA and CLR transformers coda = FunctionTransformer( - lambda x: x / x.sum(axis=0), - feature_names_out="one-to-one" - ).set_output(transform="pandas") - clr = FunctionTransformer( - calc_clr, - feature_names_out="one-to-one" + lambda x: x / x.sum(axis=0), feature_names_out="one-to-one" ).set_output(transform="pandas") + clr = FunctionTransformer(calc_clr, feature_names_out="one-to-one").set_output( + transform="pandas" + ) # Create a pipeline with CoDA and CLR transformations pipe = make_pipeline(coda, clr).set_output(transform="pandas") # Apply the pipeline to numeric columns and concatenate with non-numeric columns - transformed = pipe.fit_transform(df.select_dtypes(include='number')) - return pd.concat([df.select_dtypes(include='object'), transformed], axis=1) \ No newline at end of file + transformed = pipe.fit_transform(df.select_dtypes(include="number")) + return pd.concat([df.select_dtypes(include="object"), transformed], axis=1) diff --git a/src/acore/multiple_testing/__init__.py b/src/acore/multiple_testing/__init__.py index 3ca9316..1949daa 100644 --- a/src/acore/multiple_testing/__init__.py +++ b/src/acore/multiple_testing/__init__.py @@ -207,7 +207,7 @@ def correct_pairwise_ttest(df, alpha, correction="fdr_bh"): required_col = ["group1", "group2", "posthoc pvalue"] for _col in required_col: - if not _col in df: + if _col not in df: raise KeyError(f"Did not find '{_col}' in columns of data.") for comparison in df.groupby(["group1", "group2"]).groups: diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index 0c0bbd7..b6bc69c 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -5,17 +5,13 @@ ttest_ind, f_oneway, ) -from .internal_functions import ( - _permute, - _contingency_table, - _check_degeneracy -) +from .internal_functions import _permute, _contingency_table, _check_degeneracy import warnings -warnings.simplefilter("always", UserWarning) - from acore.types.permutation_test import PermutationResult +warnings.simplefilter("always", UserWarning) + def paired_permutation( cond1: np.ndarray, @@ -113,21 +109,21 @@ def paired_permutation( ) warnings.warn(identical_warn) - val_result = PermutationResult.model_validate({ - "metric": calculator, - "observed": observed_metric, - "p_value": np.nan, - }) - + val_result = PermutationResult.model_validate( + { + "metric": calculator, + "observed": observed_metric, + "p_value": np.nan, + } + ) + else: # Compute p-value p_value = np.mean(permuted_f >= abs_met) - val_result = PermutationResult.model_validate({ - "metric": calculator, - "observed": observed_metric, - "p_value": p_value - }) + val_result = PermutationResult.model_validate( + {"metric": calculator, "observed": observed_metric, "p_value": p_value} + ) return val_result.model_dump() @@ -175,10 +171,9 @@ def chi2_permutation( # Compute p-value p_value = np.mean(permuted_chi2 >= observed_chi2) - val_result = PermutationResult.model_validate({ - "observed": observed_test, - "p_value": p_value - }) + val_result = PermutationResult.model_validate( + {"observed": observed_test, "p_value": p_value} + ) return val_result.model_dump(exclude_none=True) @@ -266,10 +261,8 @@ def indep_permutation( # Compute p-value p_value = np.mean(permuted_f >= abs_met) - val_result = PermutationResult.model_validate({ - "metric": calculator, - "observed": observed_metric, - "p_value": p_value - }) + val_result = PermutationResult.model_validate( + {"metric": calculator, "observed": observed_metric, "p_value": p_value} + ) return val_result.model_dump() diff --git a/src/acore/types/permutation_test.py b/src/acore/types/permutation_test.py index 436fc12..42d3831 100644 --- a/src/acore/types/permutation_test.py +++ b/src/acore/types/permutation_test.py @@ -2,8 +2,10 @@ from typing import Any, Optional, Callable, Union from pydantic import BaseModel, Field + class PermutationResult(BaseModel): - metric: Optional[Union[str, Callable]] = Field(default=None, description="Name of the metric used in the permutation test") + metric: Optional[Union[str, Callable]] = Field( + default=None, description="Name of the metric used in the permutation test" + ) observed: Any = Field(default=None, description="Observed value of the metric") p_value: float = Field(description="p-value from the permutation test") - diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py index 2372e31..c755cbc 100644 --- a/tests/test_permutation_test.py +++ b/tests/test_permutation_test.py @@ -3,36 +3,36 @@ import numpy as np from scipy.stats import ttest_rel, ttest_ind, mannwhitneyu, wilcoxon + @pytest.fixture def set_array(): return np.arange(1, 10) + @pytest.fixture def random_low(): return np.random.negative_binomial(10, 0.3, 100) + @pytest.fixture def random_high(): return np.random.negative_binomial(20, 0.1, 100) + @pytest.fixture def random_choice_equal(): return np.random.choice( - ["A", "B", "C", "D"], - size=100, - replace=True, - p=[0.25, 0.25, 0.25, 0.25] + ["A", "B", "C", "D"], size=100, replace=True, p=[0.25, 0.25, 0.25, 0.25] ) + @pytest.fixture def random_choice_unequal(): return np.random.choice( - ["A", "B", "C", "D"], - size=100, - replace=True, - p=[0.1, 0.25, 0.05, 0.6] + ["A", "B", "C", "D"], size=100, replace=True, p=[0.1, 0.25, 0.05, 0.6] ) + def test_paired_permutation_same(set_array): cond1 = set_array cond2 = set_array @@ -53,12 +53,11 @@ def test_paired_permutation_reject(random_low, random_high): def test_paired_permutation_degen(set_array): - for stat in ["t-statistic", "mean", "median"]: - assert pt.paired_permutation( - set_array, - set_array+30, - metric=stat - )["p_value"] is np.nan + for stat in ["t-statistic", "mean", "median"]: + assert ( + pt.paired_permutation(set_array, set_array + 30, metric=stat)["p_value"] + is np.nan + ) def test_indep_permutation_reject(random_low, random_high): @@ -71,11 +70,9 @@ def test_indep_permutation_reject(random_low, random_high): def test_indep_permutation_accept(random_low): for stat in ["t-statistic", "anova", "median", "mean"]: - assert pt.indep_permutation( - random_low, - random_low, - metric=stat - )["p_value"] > 0.05 + assert ( + pt.indep_permutation(random_low, random_low, metric=stat)["p_value"] > 0.05 + ) def test_chi2_permutation_accept(random_choice_equal): @@ -86,5 +83,3 @@ def test_chi2_permutation_accept(random_choice_equal): def test_chi2_permutation_reject(random_choice_unequal, random_choice_equal): result = pt.chi2_permutation(random_choice_equal, random_choice_unequal) assert result["p_value"] < 0.05 - - From f27cf2ecce917c65cd03bdc709e7437162755c54 Mon Sep 17 00:00:00 2001 From: "Angel L. P." <59593766+angelphanth@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:40:56 +0100 Subject: [PATCH 21/25] Update docs/api_examples/permutation_testing.py Co-authored-by: Henry Webel --- docs/api_examples/permutation_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py index 91f5777..6a48ef7 100644 --- a/docs/api_examples/permutation_testing.py +++ b/docs/api_examples/permutation_testing.py @@ -30,7 +30,7 @@ # # ### Preprocessing of abundances # - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. -# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do +# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation [`acore.microbiome.internal_functions.calc_clr`](`acore.microbiome.internal_functions.calc_clr`) to not violate assumptions of any frequentist stats we do # # ### Preprocessing of the metadata # - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" From 8415f0f6b0b2549576f110d6762f07fd2f1244dd Mon Sep 17 00:00:00 2001 From: "Angel L. P." <59593766+angelphanth@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:41:14 +0100 Subject: [PATCH 22/25] Update docs/api_examples/permutation_testing.py Co-authored-by: Henry Webel --- docs/api_examples/permutation_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py index 6a48ef7..e920df2 100644 --- a/docs/api_examples/permutation_testing.py +++ b/docs/api_examples/permutation_testing.py @@ -13,7 +13,7 @@ # --- # %% [markdown] -# # Tutorial: Permutation Testing using `acore` +# # Permutation Tests # # In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8). # From ccfe00e987876faa6a49788ffbbfb055793aacd6 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 12 Jan 2026 16:59:21 +0100 Subject: [PATCH 23/25] Apply suggestions from code review mainly typos Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/api_examples/permutation_testing.ipynb | 6 +++--- docs/api_examples/permutation_testing.py | 6 +++--- src/acore/permutation_test/__init__.py | 4 ++-- tests/test_permutation_test.py | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb index cb92f75..5471464 100644 --- a/docs/api_examples/permutation_testing.ipynb +++ b/docs/api_examples/permutation_testing.ipynb @@ -9,7 +9,7 @@ "\n", "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n", "\n", - "The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n", + "The samples in this demo were collected from wastewater treatment plant influent (MGYS00005056) and effluent (MGYS00005058).\n", "\n", "For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. \n" ] @@ -180,7 +180,7 @@ "\n", "The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. \n", "\n", - "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. " + "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect size having occurred by chance. " ] }, { @@ -192,7 +192,7 @@ "source": [ "from acore.permutation_test import paired_permutation\n", "\n", - "# optional choice of random number generatorfor repro\n", + "# optional choice of random number generator for repro\n", "import numpy as np\n", "rng = np.random.default_rng(12345)" ] diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py index e920df2..e54723c 100644 --- a/docs/api_examples/permutation_testing.py +++ b/docs/api_examples/permutation_testing.py @@ -17,7 +17,7 @@ # # In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8). # -# The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058). +# The samples in this demo were collected from wastewater treatment plant influent (MGYS00005056) and effluent (MGYS00005058). # # For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. # @@ -72,12 +72,12 @@ # # The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. # -# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. +# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect size having occurred by chance. # %% from acore.permutation_test import paired_permutation -# optional choice of random number generatorfor repro +# optional choice of random number generator for repro import numpy as np rng = np.random.default_rng(12345) diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py index b6bc69c..a88dd5d 100644 --- a/src/acore/permutation_test/__init__.py +++ b/src/acore/permutation_test/__init__.py @@ -69,7 +69,7 @@ def paired_permutation( else: raise ValueError( "Invalid metric specified. Acceptable metrics are: " - "'t-statistic', 'mean', 'median', or a custom function" + "'t-statistic', 'mean', 'median', or a custom function " "that takes `cond1-cond2` as input." ) @@ -234,7 +234,7 @@ def indep_permutation( else: raise ValueError( "Invalid metric specified. Acceptable metrics are: " - "'t-statistic', 'mean', 'median', or a custom function" + "'t-statistic', 'mean', 'median', or a custom function " "that takes each group as input." ) diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py index c755cbc..ad2db46 100644 --- a/tests/test_permutation_test.py +++ b/tests/test_permutation_test.py @@ -47,8 +47,6 @@ def test_paired_permutation_reject(random_low, random_high): cond2 = random_high for stat in ["t-statistic", "mean", "median"]: result = pt.paired_permutation(cond1, cond2, metric=stat) - print(result) - print(ttest_rel(cond1, cond2)) assert result["p_value"] < 0.05 From 0ef5a84951ec10d638b624eda13db10b5f3e2d41 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Wed, 14 Jan 2026 13:45:58 +0100 Subject: [PATCH 24/25] Merge branch 'main' into anglup-learning --- AUTHORS.md | 10 + AUTHORS.rst | 13 - CONTRIBUTING.md | 130 ++++++ CONTRIBUTING.rst | 138 ------ HISTORY.md | 5 + HISTORY.rst | 8 - docs/api_examples/batch_correction.ipynb | 430 ++++++++++++++++++ docs/api_examples/batch_correction.py | 233 ++++++++++ .../api_examples/normalization_analysis.ipynb | 93 +--- docs/api_examples/normalization_analysis.py | 35 +- docs/authors.md | 4 + docs/authors.rst | 1 - docs/conf.py | 205 +++------ docs/contributing.md | 4 + docs/contributing.rst | 1 - docs/history.md | 4 + docs/history.rst | 1 - docs/index.md | 69 +++ docs/index.rst | 63 --- .../syntethic_pep_enrichment/1_analysis.py | 5 +- src/acore/batch_correction/__init__.py | 53 +++ src/acore/normalization/__init__.py | 48 -- tests/test_normalization.py | 4 +- 23 files changed, 1012 insertions(+), 545 deletions(-) create mode 100644 AUTHORS.md delete mode 100644 AUTHORS.rst create mode 100644 CONTRIBUTING.md delete mode 100644 CONTRIBUTING.rst create mode 100644 HISTORY.md delete mode 100644 HISTORY.rst create mode 100644 docs/api_examples/batch_correction.ipynb create mode 100644 docs/api_examples/batch_correction.py create mode 100644 docs/authors.md delete mode 100644 docs/authors.rst create mode 100644 docs/contributing.md delete mode 100644 docs/contributing.rst create mode 100644 docs/history.md delete mode 100644 docs/history.rst create mode 100644 docs/index.md delete mode 100644 docs/index.rst create mode 100644 src/acore/batch_correction/__init__.py diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000..f3e0d81 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,10 @@ +# Credits + +## Development Lead + +- Alberto Santos Delgado +- Henry Webel + +## Contributors + +- Sebastián Ayala Ruano diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index 5048a97..0000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,13 +0,0 @@ -======= -Credits -======= - -Development Lead ----------------- - -* Alberto Santos Delgado - -Contributors ------------- - -None yet. Why not be the first? diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..fab7d04 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,130 @@ +# Contributing + +Contributions are welcome, and they are greatly appreciated! Every little bit +helps, and credit will always be given. + +You can contribute in many ways: + +## Report Bugs + +Report bugs at +[github.com/Multiomics-Analytics-Group/acore/issues](https://github.com/Multiomics-Analytics-Group/acore/issues). + +If you are reporting a bug, please include: + +- Your operating system name and version. +- Any details about your local setup that might be helpful in troubleshooting. +- Detailed steps to reproduce the bug. + +## Fix Bugs + +Look through the GitHub issues for bugs. Anything tagged with "bug" and "help +wanted" is open to whoever wants to implement it. + +## Implement Features + +Look through the GitHub issues for features. Anything tagged with "enhancement" +and "help wanted" is open to whoever wants to implement it. + +## Write Documentation + +acore could always use more documentation, whether as part of the +official acore docs, in docstrings, or even on the web in blog posts, +articles, and such. + +## Submit Feedback + +The best way to send feedback is to file an issue at +[github.com/Multiomics-Analytics-Group/acore/issues](https://github.com/Multiomics-Analytics-Group/acore/issues). + +If you are proposing a feature: + +- Explain in detail how it would work. +- Keep the scope as narrow as possible, to make it easier to implement. +- Remember that this is a volunteer-driven project, and that contributions are welcome :) + +## Get Started! + +Ready to contribute? Here's how to set up `acore` for local development. Consider an +advanced editor to help you with some of the common steps described below, e.g. +[VSCode](https://code.visualstudio.com/docs/introvideos/basics). + +1. Fork the `acore` repo on GitHub. +2. Clone your fork locally: + +```sh +git clone https://github.com/Multiomics-Analytics-Group/acore.git +``` + +3. Install your local copy into a virtual environment. Assuming you have Python available + on your system, this can be done using `venv`. Alternatives are conda environments + or uv to create and manage virtual environments. + +```sh +cd acore/ +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` + +If you work on Windows, see the docs: https://docs.python.org/3/library/venv.html#how-venvs-work + +4. Create a branch for local development: + +```sh +git checkout -b name-of-your-bugfix-or-feature +``` + +5. When you're done making changes, check formatting and run tests locally: + +```sh +black . +ruff check src +pytest . +``` + +Some `ruff` fixes can be applied automatically with `--fix`: + +```sh +ruff check src --fix +``` + +6. Commit your changes and push your branch to GitHub: + +```sh +git add . +git commit -m "Your detailed description of your changes." +git push origin name-of-your-bugfix-or-feature +``` + +7. Submit a pull request through the GitHub website. + +## General design principles in the library + +- Prefer a single `DataFrame` output type per module (subpackage). For example, + the enrichment module should output a `DataFrame` that adheres to a single pandera + schema defined under `src/acore/types/enrichment_analysis.py`. Thus there is a 'type' + of enrichment analysis results. +- User-facing functions should have clear names and good docstrings + (e.g. `run_analysis`, `run_enrichment_analysis`, `apply_normalization`). +- Docstrings may follow numpy, google, or classic Sphinx styles. + +## Pull Request Guidelines + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, update the docs. Put your new functionality + into a function with a docstring, and add the feature to the list in README.rst. +3. The pull request should pass the GitHub workflows. + +See the PR template example: +[Add module PR template](https://github.com/Multiomics-Analytics-Group/acore/blob/main/.github/workflows/PULL_REQUEST_TEMPLATE/module.md) + +## Deploying + +A reminder for maintainers on how to deploy: + +- Make sure all changes are committed (including an entry in HISTORY.rst). +- Create a new [GitHub release](https://github.com/Multiomics-Analytics-Group/acore/releases) +- The package will be deployed to PyPI if the tests pass. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index 842eaae..0000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,138 +0,0 @@ -.. highlight:: shell - -============ -Contributing -============ - -Contributions are welcome, and they are greatly appreciated! Every little bit -helps, and credit will always be given. - -You can contribute in many ways: - -Report Bugs -~~~~~~~~~~~ - -Report bugs at https://github.com/Multiomics-Analytics-Group/acore/issues. - -If you are reporting a bug, please include: - -* Your operating system name and version. -* Any details about your local setup that might be helpful in troubleshooting. -* Detailed steps to reproduce the bug. - -Fix Bugs -~~~~~~~~ - -Look through the GitHub issues for bugs. Anything tagged with "bug" and "help -wanted" is open to whoever wants to implement it. - -Implement Features -~~~~~~~~~~~~~~~~~~ - -Look through the GitHub issues for features. Anything tagged with "enhancement" -and "help wanted" is open to whoever wants to implement it. - -Write Documentation -~~~~~~~~~~~~~~~~~~~ - -acore could always use more documentation, whether as part of the -official acore docs, in docstrings, or even on the web in blog posts, -articles, and such. - -Submit Feedback -~~~~~~~~~~~~~~~ - -The best way to send feedback is to file an issue at https://github.com/Multiomics-Analytics-Group/acore/issues. - -If you are proposing a feature: - -* Explain in detail how it would work. -* Keep the scope as narrow as possible, to make it easier to implement. -* Remember that this is a volunteer-driven project, and that contributions - are welcome :) - -Get Started! ------------- - -Ready to contribute? Here's how to set up `acore` for local development. Consider an -advanced editor to help you with some of the common steps described below, e.g. -`VSCode `_. - -1. Fork the `acore` repo on GitHub. -2. Clone your fork locally:: - - $ git clone https://github.com/Multiomics-Analytics-Group/acore.git - -3. Install your local copy into a virtual environment. Assuming you have Python available - on your system, this can be done using `venv`. Alternatives are conda environments - or uv to create and manage virtual environments:: - - $ cd acore/ - $ python -m venv .env - $ source .env/bin/activate - $ pip install -e ".[dev]" - -If you work on a Windows shell, see the docs for instructions: -`How venvs work `_ - -4. Create a branch for local development:: - - $ git checkout -b name-of-your-bugfix-or-feature - - Now you can make your changes locally. - -5. When you're done making changes, check that your changes are formatted and pass ruff - checks and the tests at least in your development environment:: - - $ black . - $ ruff check src - $ pytest . - - Some changes ruff can automatically fix for you, if you pass the `--fix` flag: - - $ ruff check src --fix - -6. Commit your changes and push your branch to GitHub:: - - $ git add . - $ git commit -m "Your detailed description of your changes." - $ git push origin name-of-your-bugfix-or-feature - -7. Submit a pull request through the GitHub website. - -General design principles in the library ----------------------------------------- - -- at best only one type of `DataFrame` output per module (subpackage.). For example, - the enrichment module should output a `DataFrame` which adheres to a single pandera - schema defined under `src/acore/types/enrichment_analysis.py`. Thus there is a 'type' - of enrichment analysis results. -- User facing functions should have clear names and good docstrings. They can something - along `run_analysis` (`run_enrichment_analysis`) or - `apply_normalization` (`apply_step` or `apply_method`). -- docstrings for functions can be numpy, google or the classic Sphinx one. - - -Pull Request Guidelines ------------------------ - -Before you submit a pull request, check that it meets these guidelines: - -1. The pull request should include tests. -2. If the pull request adds functionality, the docs should be updated. Put - your new functionality into a function with a docstring, and add the - feature to the list in README.rst. -3. The pull request should pass the workflows on GitHub. - -See for example the PR-Template for a module: -`Add module PR template `_. - - - -Deploying ---------- - -A reminder for the maintainers on how to deploy. -Make sure all your changes are committed (including an entry in HISTORY.rst). -Then run create a new `GitHub release `_. -The code will then be deployed to PyPI if the tests pass. diff --git a/HISTORY.md b/HISTORY.md new file mode 100644 index 0000000..c8c33b1 --- /dev/null +++ b/HISTORY.md @@ -0,0 +1,5 @@ +# History + +Please see the Release notes on GitHub: + +[Multiomics-Analytics-Group/acore/releases](https://github.com/Multiomics-Analytics-Group/acore/releases) diff --git a/HISTORY.rst b/HISTORY.rst deleted file mode 100644 index 85c741f..0000000 --- a/HISTORY.rst +++ /dev/null @@ -1,8 +0,0 @@ -======= -History -======= - -0.1.0 (2023-09-26) ------------------- - -* First release on PyPI. diff --git a/docs/api_examples/batch_correction.ipynb b/docs/api_examples/batch_correction.ipynb new file mode 100644 index 0000000..6974363 --- /dev/null +++ b/docs/api_examples/batch_correction.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "27b8d53e", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Batch correction of samples\n", + "\n", + "We will explore an Alzheimer dataset where the data was collected in four different sites.\n", + "We will see that the sites have a an effect where the data is in principal component space\n", + "and in UMAP space. We will then batch correct the data and see how the effect on these plots.\n", + "\n", + "Refers to the [`acore.batch_correction`](acore.batch_correction) module." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcc1505c", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "%pip install acore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "563697dc", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from typing import Optional\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import sklearn\n", + "import sklearn.impute\n", + "import sklearn.preprocessing\n", + "import vuecore.decomposition\n", + "\n", + "import acore.batch_correction\n", + "import acore.decomposition\n", + "\n", + "\n", + "def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:\n", + " \"\"\"Fit and plot UMAP embedding with two components with colors defined by meta_column.\"\"\"\n", + " embedding = acore.decomposition.umap.run_umap(\n", + " X_scaled, y, random_state=random_state\n", + " )\n", + " ax = embedding.plot.scatter(\"UMAP 1\", \"UMAP 2\", c=meta_column, cmap=\"Paired\")\n", + " return ax\n", + "\n", + "\n", + "def standard_normalize(X: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Standard normalize data and keep indices of DataFrame.\"\"\"\n", + " X_scaled = (\n", + " sklearn.preprocessing.StandardScaler()\n", + " .set_output(transform=\"pandas\")\n", + " .fit_transform(X)\n", + " )\n", + " return X_scaled\n", + "\n", + "\n", + "def median_impute(X: pd.DataFrame) -> pd.DataFrame:\n", + " X_imputed = (\n", + " sklearn.impute.SimpleImputer(strategy=\"median\")\n", + " .set_output(transform=\"pandas\")\n", + " .fit_transform(X)\n", + " )\n", + " return X_imputed\n", + "\n", + "\n", + "def run_and_plot_pca(\n", + " X_scaled,\n", + " y,\n", + " meta_column: Optional[str] = None,\n", + " n_components: int = 4,\n", + ") -> tuple[pd.DataFrame, plt.Figure]:\n", + " PCs, _ = acore.decomposition.pca.run_pca(X_scaled, n_components=n_components)\n", + " PCs.columns = [s.replace(\"principal component\", \"PC\") for s in PCs.columns]\n", + " fig = vuecore.decomposition.pca_grid(\n", + " PCs=PCs, meta_column=y, n_components=n_components, meta_col_name=meta_column\n", + " )\n", + " return PCs, fig" + ] + }, + { + "cell_type": "markdown", + "id": "60ee225e", + "metadata": {}, + "source": [ + "\n", + "## Set some parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a32698b4", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "fname_metadata: str = (\n", + " \"https://raw.githubusercontent.com/RasmussenLab/\"\n", + " \"njab/HEAD/docs/tutorial/data/alzheimer/meta.csv\" # clincial data\n", + ")\n", + "fname_omics: str = (\n", + " \"https://raw.githubusercontent.com/RasmussenLab/\"\n", + " \"njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv\" # omics data\n", + ")\n", + "METACOL: str = \"_collection site\" # target column in fname_metadata dataset (binary)\n", + "METACOL_LABEL: Optional[str] = \"site\" # optional: rename target variable\n", + "n_features_max: int = 5\n", + "freq_cutoff: float = 0.5 # Omics cutoff for sample completeness\n", + "VAL_IDS: str = \"\" #\n", + "VAL_IDS_query: str = \"\"\n", + "weights: bool = True\n", + "FOLDER = \"alzheimer\"\n", + "model_name = \"all\"" + ] + }, + { + "cell_type": "markdown", + "id": "9bd7687c", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "55b42bc2", + "metadata": {}, + "source": [ + "### Load proteomics (protein groups) data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "857b0522", + "metadata": {}, + "outputs": [], + "source": [ + "if METACOL_LABEL is None:\n", + " METACOL_LABEL = METACOL\n", + "metadata = (\n", + " pd.read_csv(fname_metadata, usecols=[\"Sample ID\", METACOL], index_col=0)\n", + " .convert_dtypes()\n", + " .rename(columns={METACOL: METACOL_LABEL})\n", + ")\n", + "omics = pd.read_csv(fname_omics, index_col=0)" + ] + }, + { + "cell_type": "markdown", + "id": "4ad54348", + "metadata": {}, + "source": [ + "Data shapes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fedfab62", + "metadata": {}, + "outputs": [], + "source": [ + "omics.shape, metadata.shape" + ] + }, + { + "cell_type": "markdown", + "id": "ed9362eb", + "metadata": {}, + "source": [ + "See how common omics features are and remove feature below choosen frequency cutoff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fca7328", + "metadata": {}, + "outputs": [], + "source": [ + "ax = omics.notna().sum().sort_values().plot(rot=90)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "954c4682", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "M_before = omics.shape[1]\n", + "omics = omics.dropna(thresh=int(len(omics) * freq_cutoff), axis=1)\n", + "M_after = omics.shape[1]\n", + "msg = (\n", + " f\"Removed {M_before-M_after} features with more \"\n", + " f\"than {freq_cutoff*100}% missing values.\"\n", + " f\"\\nRemaining features: {M_after} (of {M_before})\"\n", + ")\n", + "print(msg)\n", + "# keep a map of all proteins in protein group, but only display first protein\n", + "# proteins are unique to protein groups\n", + "pg_map = {k: k.split(\";\")[0] for k in omics.columns}\n", + "omics = omics.rename(columns=pg_map)\n", + "# log2 transform raw intensity data:\n", + "omics = np.log2(omics + 1)\n", + "ax = (\n", + " omics.notna()\n", + " .sum()\n", + " .sort_values()\n", + " .plot(\n", + " rot=90,\n", + " ylabel=\"Number of samples\",\n", + " xlabel=\"Proteins (ranked by missing values)\",\n", + " )\n", + ")\n", + "omics" + ] + }, + { + "cell_type": "markdown", + "id": "b1ba957f", + "metadata": {}, + "source": [ + "### Sample metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2ba978a", + "metadata": {}, + "outputs": [], + "source": [ + "metadata" + ] + }, + { + "cell_type": "markdown", + "id": "2877f00e", + "metadata": {}, + "source": [ + "Tabulate selected metadata and check for missing values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0d0364", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "metadata[METACOL_LABEL].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9688dd46", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "target_counts = metadata[METACOL_LABEL].value_counts()\n", + "\n", + "if target_counts.sum() < len(metadata):\n", + " print(\n", + " \"Target has missing values.\"\n", + " f\" Can only use {target_counts.sum()} of {len(metadata)} samples.\"\n", + " )\n", + " mask = metadata[METACOL_LABEL].notna()\n", + " metadata, omics = metadata.loc[mask], omics.loc[mask]\n", + "\n", + "if METACOL_LABEL is None:\n", + " METACOL_LABEL = METACOL_LABEL\n", + "y = metadata[METACOL_LABEL].astype(\"category\")" + ] + }, + { + "cell_type": "markdown", + "id": "713a7b35", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Before batch correction\n", + "Explore data in PCA and UMAP space before batch correction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "986f7cab", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "omics_imp = median_impute(omics)\n", + "omics_imp_scaled = standard_normalize(omics_imp)\n", + "PCs, fig = run_and_plot_pca(omics_imp, y, METACOL_LABEL, n_components=4)\n", + "ax = plot_umap(omics_imp, y, METACOL_LABEL)" + ] + }, + { + "cell_type": "markdown", + "id": "2bf8ca74", + "metadata": {}, + "source": [ + "## Combat batch correction\n", + "Correct for batch effects in the data using a robust regression approach removing\n", + "mean and scale effects out for each provided co-variate by batch.\n", + "Assumes normally distributed data.\n", + "\n", + "> ⚠️ Combat needs imputed data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc107f61", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "X = median_impute(omics)\n", + "X = acore.batch_correction.combat_batch_correction(\n", + " X.join(y),\n", + " batch_col=\"site\",\n", + ")\n", + "X" + ] + }, + { + "cell_type": "markdown", + "id": "ca46af9e", + "metadata": {}, + "source": [ + "Plot PCA and UMAP after batch correction on standard normalized data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdc4afa2", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "PCs, fig = run_and_plot_pca(standard_normalize(X), y, METACOL_LABEL, n_components=4)\n", + "ax = plot_umap(X, y, METACOL_LABEL)" + ] + }, + { + "cell_type": "markdown", + "id": "9fda9a50", + "metadata": {}, + "source": [ + "See change by substracting combat corrected data from original data.\n", + "- NAs in original data will remain NA below (no imputation done here)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9cecdd8", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "omics - X" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "tags,-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/api_examples/batch_correction.py b/docs/api_examples/batch_correction.py new file mode 100644 index 0000000..8134052 --- /dev/null +++ b/docs/api_examples/batch_correction.py @@ -0,0 +1,233 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: tags,-all +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Batch correction of samples +# +# We will explore an Alzheimer dataset where the data was collected in four different sites. +# We will see that the sites have a an effect where the data is in principal component space +# and in UMAP space. We will then batch correct the data and see how the effect on these plots. +# +# Refers to the [`acore.batch_correction`](acore.batch_correction) module. + + +# %% tags=["hide-output"] +# %pip install acore + +# %% tags=["hide-input"] +from typing import Optional + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import sklearn +import sklearn.impute +import sklearn.preprocessing +import vuecore.decomposition + +import acore.batch_correction +import acore.decomposition + + +def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes: + """Fit and plot UMAP embedding with two components with colors defined by meta_column.""" + embedding = acore.decomposition.umap.run_umap( + X_scaled, y, random_state=random_state + ) + ax = embedding.plot.scatter("UMAP 1", "UMAP 2", c=meta_column, cmap="Paired") + return ax + + +def standard_normalize(X: pd.DataFrame) -> pd.DataFrame: + """Standard normalize data and keep indices of DataFrame.""" + X_scaled = ( + sklearn.preprocessing.StandardScaler() + .set_output(transform="pandas") + .fit_transform(X) + ) + return X_scaled + + +def median_impute(X: pd.DataFrame) -> pd.DataFrame: + X_imputed = ( + sklearn.impute.SimpleImputer(strategy="median") + .set_output(transform="pandas") + .fit_transform(X) + ) + return X_imputed + + +def run_and_plot_pca( + X_scaled, + y, + meta_column: Optional[str] = None, + n_components: int = 4, +) -> tuple[pd.DataFrame, plt.Figure]: + PCs, _ = acore.decomposition.pca.run_pca(X_scaled, n_components=n_components) + PCs.columns = [s.replace("principal component", "PC") for s in PCs.columns] + fig = vuecore.decomposition.pca_grid( + PCs=PCs, meta_column=y, n_components=n_components, meta_col_name=meta_column + ) + return PCs, fig + + +# %% [markdown] +# +# ## Set some parameters + +# %% tags=["parameters"] +fname_metadata: str = ( + "https://raw.githubusercontent.com/RasmussenLab/" + "njab/HEAD/docs/tutorial/data/alzheimer/meta.csv" # clincial data +) +fname_omics: str = ( + "https://raw.githubusercontent.com/RasmussenLab/" + "njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv" # omics data +) +METACOL: str = "_collection site" # target column in fname_metadata dataset (binary) +METACOL_LABEL: Optional[str] = "site" # optional: rename target variable +n_features_max: int = 5 +freq_cutoff: float = 0.5 # Omics cutoff for sample completeness +VAL_IDS: str = "" # +VAL_IDS_query: str = "" +weights: bool = True +FOLDER = "alzheimer" +model_name = "all" + +# %% [markdown] +# ## Setup + +# %% [markdown] +# ### Load proteomics (protein groups) data + +# %% +if METACOL_LABEL is None: + METACOL_LABEL = METACOL +metadata = ( + pd.read_csv(fname_metadata, usecols=["Sample ID", METACOL], index_col=0) + .convert_dtypes() + .rename(columns={METACOL: METACOL_LABEL}) +) +omics = pd.read_csv(fname_omics, index_col=0) + +# %% [markdown] +# Data shapes + +# %% +omics.shape, metadata.shape + +# %% [markdown] +# See how common omics features are and remove feature below choosen frequency cutoff + +# %% +ax = omics.notna().sum().sort_values().plot(rot=90) + +# %% tags=["hide-input"] +M_before = omics.shape[1] +omics = omics.dropna(thresh=int(len(omics) * freq_cutoff), axis=1) +M_after = omics.shape[1] +msg = ( + f"Removed {M_before-M_after} features with more " + f"than {freq_cutoff*100}% missing values." + f"\nRemaining features: {M_after} (of {M_before})" +) +print(msg) +# keep a map of all proteins in protein group, but only display first protein +# proteins are unique to protein groups +pg_map = {k: k.split(";")[0] for k in omics.columns} +omics = omics.rename(columns=pg_map) +# log2 transform raw intensity data: +omics = np.log2(omics + 1) +ax = ( + omics.notna() + .sum() + .sort_values() + .plot( + rot=90, + ylabel="Number of samples", + xlabel="Proteins (ranked by missing values)", + ) +) +omics + +# %% [markdown] +# ### Sample metadata + +# %% +metadata + +# %% [markdown] +# Tabulate selected metadata and check for missing values + +# %% tags=["hide-input"] +metadata[METACOL_LABEL].value_counts(dropna=False) + +# %% tags=["hide-input"] +target_counts = metadata[METACOL_LABEL].value_counts() + +if target_counts.sum() < len(metadata): + print( + "Target has missing values." + f" Can only use {target_counts.sum()} of {len(metadata)} samples." + ) + mask = metadata[METACOL_LABEL].notna() + metadata, omics = metadata.loc[mask], omics.loc[mask] + +if METACOL_LABEL is None: + METACOL_LABEL = METACOL_LABEL +y = metadata[METACOL_LABEL].astype("category") + +# %% [markdown] +# ## Before batch correction +# Explore data in PCA and UMAP space before batch correction + + +# %% tags=["hide-input"] +omics_imp = median_impute(omics) +omics_imp_scaled = standard_normalize(omics_imp) +PCs, fig = run_and_plot_pca(omics_imp, y, METACOL_LABEL, n_components=4) +ax = plot_umap(omics_imp, y, METACOL_LABEL) + + +# %% [markdown] +# ## Combat batch correction +# Correct for batch effects in the data using a robust regression approach removing +# mean and scale effects out for each provided co-variate by batch. +# Assumes normally distributed data. +# +# > ⚠️ Combat needs imputed data + +# %% +# %%time +X = median_impute(omics) +X = acore.batch_correction.combat_batch_correction( + X.join(y), + batch_col="site", +) +X + +# %% [markdown] +# Plot PCA and UMAP after batch correction on standard normalized data + +# %% tags=["hide-input"] +PCs, fig = run_and_plot_pca(standard_normalize(X), y, METACOL_LABEL, n_components=4) +ax = plot_umap(X, y, METACOL_LABEL) + +# %% [markdown] +# See change by substracting combat corrected data from original data. +# - NAs in original data will remain NA below (no imputation done here) + +# %% tags=["hide-input"] +omics - X diff --git a/docs/api_examples/normalization_analysis.ipynb b/docs/api_examples/normalization_analysis.ipynb index 84b7360..04e8ceb 100644 --- a/docs/api_examples/normalization_analysis.ipynb +++ b/docs/api_examples/normalization_analysis.ipynb @@ -5,13 +5,13 @@ "id": "e31da646", "metadata": {}, "source": [ - "# Normalization of samples in a dataset example\n", + "# Normalization of samples\n", "\n", "We will explore an Alzheimer dataset where the data was collected in four different sites.\n", "We will see that the sites have a an effect where the data is in principal component space\n", "and in UMAP space. We will then normalize the data and see how the effect on these plots.\n", "\n", - "Refers to the `acore.normalization` module." + "Refers to the [`acore.normalization`](acore.normalization) module." ] }, { @@ -63,7 +63,6 @@ "\n", "import acore.decomposition\n", "import acore.normalization\n", - "import acore.sklearn\n", "\n", "\n", "def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:\n", @@ -470,92 +469,6 @@ "iopub.status.idle": "2024-10-15T08:04:45.540528Z", "shell.execute_reply": "2024-10-15T08:04:45.539880Z" }, - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [ - "omics" - ] - }, - { - "cell_type": "markdown", - "id": "8e972d1e", - "metadata": {}, - "source": [ - "## Combat normalization\n", - "Correct for batch effects in the data using a robust regression approach normalizing\n", - "mean and scale effetcs out for each feature by batch. Assumes normally distributed data.\n", - "\n", - "> ⚠️ Combat needs imputed data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3a7d04d", - "metadata": { - "execution": { - "iopub.execute_input": "2024-10-15T08:04:45.611270Z", - "iopub.status.busy": "2024-10-15T08:04:45.611049Z", - "iopub.status.idle": "2024-10-15T08:04:45.634947Z", - "shell.execute_reply": "2024-10-15T08:04:45.634525Z" - } - }, - "outputs": [], - "source": [ - "%%time\n", - "X = median_impute(omics)\n", - "X = acore.normalization.combat_batch_correction(\n", - " X.join(y),\n", - " batch_col=\"site\",\n", - ")\n", - "X" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aff9589b", - "metadata": { - "execution": { - "iopub.execute_input": "2024-10-15T08:04:45.667463Z", - "iopub.status.busy": "2024-10-15T08:04:45.667004Z", - "iopub.status.idle": "2024-10-15T08:04:46.730589Z", - "shell.execute_reply": "2024-10-15T08:04:46.730346Z" - }, - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [ - "omics_imp = median_impute(X)\n", - "omics_imp_scaled = standard_normalize(omics_imp)\n", - "PCs, fig = run_and_plot_pca(omics_imp_scaled, y, METACOL_LABEL, n_components=4)\n", - "ax = plot_umap(omics_imp_scaled, y, METACOL_LABEL)" - ] - }, - { - "cell_type": "markdown", - "id": "57ffebe1", - "metadata": {}, - "source": [ - "See change by substracting combat normalized data from original data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02d6773f", - "metadata": { - "execution": { - "iopub.execute_input": "2024-10-15T08:04:46.789884Z", - "iopub.status.busy": "2024-10-15T08:04:46.789752Z", - "iopub.status.idle": "2024-10-15T08:04:46.796400Z", - "shell.execute_reply": "2024-10-15T08:04:46.796180Z" - }, "lines_to_next_cell": 2, "tags": [ "hide-input" @@ -563,7 +476,7 @@ }, "outputs": [], "source": [ - "omics - X" + "omics" ] }, { diff --git a/docs/api_examples/normalization_analysis.py b/docs/api_examples/normalization_analysis.py index 424dac8..7729b7c 100644 --- a/docs/api_examples/normalization_analysis.py +++ b/docs/api_examples/normalization_analysis.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.17.2 +# jupytext_version: 1.18.1 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -14,13 +14,13 @@ # --- # %% [markdown] -# # Normalization of samples in a dataset example +# # Normalization of samples # # We will explore an Alzheimer dataset where the data was collected in four different sites. # We will see that the sites have a an effect where the data is in principal component space # and in UMAP space. We will then normalize the data and see how the effect on these plots. # -# Refers to the `acore.normalization` module. +# Refers to the [`acore.normalization`](acore.normalization) module. # %% tags=["hide-output"] # %pip install acore @@ -38,7 +38,6 @@ import acore.decomposition import acore.normalization -import acore.sklearn def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes: @@ -225,34 +224,6 @@ def run_and_plot_pca( # %% tags=["hide-input"] omics -# %% [markdown] -# ## Combat normalization -# Correct for batch effects in the data using a robust regression approach normalizing -# mean and scale effetcs out for each feature by batch. Assumes normally distributed data. -# -# > ⚠️ Combat needs imputed data - -# %% -# %%time -X = median_impute(omics) -X = acore.normalization.combat_batch_correction( - X.join(y), - batch_col="site", -) -X - -# %% tags=["hide-input"] -omics_imp = median_impute(X) -omics_imp_scaled = standard_normalize(omics_imp) -PCs, fig = run_and_plot_pca(omics_imp_scaled, y, METACOL_LABEL, n_components=4) -ax = plot_umap(omics_imp_scaled, y, METACOL_LABEL) - -# %% [markdown] -# See change by substracting combat normalized data from original data. - -# %% tags=["hide-input"] -omics - X - # %% [markdown] # ## Median normalization diff --git a/docs/authors.md b/docs/authors.md new file mode 100644 index 0000000..fa8dfb5 --- /dev/null +++ b/docs/authors.md @@ -0,0 +1,4 @@ +```{include} ../AUTHORS.md +:relative-docs: docs +:relative-images: +``` diff --git a/docs/authors.rst b/docs/authors.rst deleted file mode 100644 index e122f91..0000000 --- a/docs/authors.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../AUTHORS.rst diff --git a/docs/conf.py b/docs/conf.py index f40d78e..332a4f8 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,26 @@ import os +from importlib import metadata from pathlib import Path -import acore +# -- Project information ----------------------------------------------------- + +# General information about the project. +project = "acore" +copyright = "2026, Multi-omics Network Analytics Group" +author = "Multi-omics Network Analytics Group, Sebastián Ayala Ruano, Henry Webel, Alberto Santos" +PACKAGE_VERSION = metadata.version("acore") +version = PACKAGE_VERSION +release = PACKAGE_VERSION + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# -- General configuration --------------------------------------------------- + # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -14,7 +33,6 @@ # "sphinx.ext.doctest", "sphinx.ext.intersphinx", # "sphinx.ext.autosummary", - # "sphinx.ext.todo", # "sphinx.ext.mathjax", # "sphinx.ext.coverage", # "sphinx.ext.imgmath", @@ -28,12 +46,52 @@ myst_enable_extensions = [ # "strikethrough", "dollarmath", - # "amsmath", + "amsmath", + "colon_fence", ] +# https://myst-nb.readthedocs.io/en/latest/computation/execute.html +nb_execution_mode = "auto" + +# https://myst-nb.readthedocs.io/en/latest/configuration.html +# Execution +nb_execution_raise_on_error = True +# Rendering +nb_merge_streams = True +# maximum execution time per cell in seconds +nb_execution_timeout = 120 + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + "jupyter_execute", + "conf.py", + "fetch_files.py", +] + + +# Intersphinx options +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "pandera": ("https://pandera.readthedocs.io/en/stable/", None), + "pydantic": ("https://docs.pydantic.dev/latest", None), + # "scikit-learn": ("https://scikit-learn.org/stable/", None), + # "matplotlib": ("https://matplotlib.org/stable/", None), +} + + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + # If true, the current module name will be prepended to all description # unit titles (such as .. function::). add_module_names = False @@ -54,62 +112,8 @@ ## Generate autodoc stubs with summaries from code autosummary_generate = True -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# General information about the project. -project = "acore" -copyright = "2023, Multi-omics Network Analytics Group" -author = "Multi-omics Network Analytics Group" - -# The version info for the project you're documenting, acts as replacement -# for |version| and |release|, also used in various other places throughout -# the built documents. -# -# The short X.Y version. -version = acore.__version__ -# The full version, including alpha/beta/rc tags. -release = acore.__version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [ - "_build", - "Thumbs.db", - ".DS_Store", - "jupyter_execute", - "conf.py", - "fetch_files.py", -] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - # -- General configuration --------------------------------------------------- -# https://myst-nb.readthedocs.io/en/latest/computation/execute.html -nb_execution_mode = "auto" -nb_execution_timeout = ( - 90 # increase limit of default 30 seconds max execution time per cell -) - myst_enable_extensions = ["dollarmath", "amsmath", "colon_fence"] # Plolty support through require javascript library @@ -120,30 +124,6 @@ "https://cdn.plot.ly/plotly-3.0.1.min.js", ] -# https://myst-nb.readthedocs.io/en/latest/configuration.html -# Execution -nb_execution_raise_on_error = True -# Rendering -nb_merge_streams = True -# maximum execution time per cell in seconds -nb_execution_timeout = 120 - -# https://myst-nb.readthedocs.io/en/latest/authoring/custom-formats.html#write-custom-formats -# nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]} -# ! better use jupytext directly to sync the notebook formats -> allows to download ipynb -# ! and see start in colab button - - -# Intersphinx options -intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), - "numpy": ("https://numpy.org/doc/stable/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), - "pandera": ("https://pandera.readthedocs.io/en/stable/", None), - # "scikit-learn": ("https://scikit-learn.org/stable/", None), - # "matplotlib": ("https://matplotlib.org/stable/", None), -} - # -- Options for HTML output ------------------------------------------------- @@ -177,69 +157,6 @@ # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] - -# -- Options for HTMLHelp output --------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "acoredoc" - - -# -- Options for LaTeX output ------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto, manual, or own class]). -latex_documents = [ - ( - master_doc, - "acore.tex", - "Analytics Core Documentation", - "Multi-omics Network Analytics Group", - "manual", - ), -] - - -# -- Options for manual page output ------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "acore", "Analytics Core Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - "acore", - "Analytics Core Documentation", - author, - "acore", - "Oneline description of project.", - "Miscellaneous", - ), -] - - # -- Setup for sphinx-apidoc ------------------------------------------------- # sphinx-apidoc needs to be called manually if Sphinx is running there. diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..10d8391 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,4 @@ +```{include} ../CONTRIBUTING.md +:relative-docs: docs +:relative-images: +``` diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index e582053..0000000 --- a/docs/contributing.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CONTRIBUTING.rst diff --git a/docs/history.md b/docs/history.md new file mode 100644 index 0000000..26f1255 --- /dev/null +++ b/docs/history.md @@ -0,0 +1,4 @@ +```{include} ../HISTORY.md +:relative-docs: docs +:relative-images: +``` diff --git a/docs/history.rst b/docs/history.rst deleted file mode 100644 index 2506499..0000000 --- a/docs/history.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../HISTORY.rst diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..ab97f42 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,69 @@ +# Acore Documentation + +```{include} ./sections_readme/home_page.md +:caption: VueGen +:relative-docs: docs +:relative-images: +:start-line: 0 +:end-line: 40 +``` + +```{toctree} +:maxdepth: 1 +:caption: Overview +``` + +```{toctree} +:maxdepth: 1 +:caption: API usage examples +:hidden: + +api_examples/normalization_analysis +api_examples/batch_correction +api_examples/exploratory_analysis +api_examples/ANCOVA_analysis +api_examples/enrichment_analysis +api_examples/permutation_testing +``` + +```{toctree} +:maxdepth: 1 +:caption: Reanalysis +:hidden: + +PXD040621 +``` + +```{toctree} +:maxdepth: 1 +:caption: Example dataset (preprocessing) +:hidden: + +api_examples/Download_PRIDE_data +api_examples/ovarian_cancer +``` + +```{toctree} +:maxdepth: 2 +:caption: Modules +:hidden: + +reference/acore +``` + +## Indices and tables + +- [General Index](genindex) +- [Module Index](modindex) +- [Search Page](search) + +```{toctree} +:maxdepth: 1 +:caption: MISC: +:hidden: + +contributing +authors +history +README.md +``` diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index af0844f..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,63 +0,0 @@ -Acore documentation -================================= - -.. include:: sections_readme/home_page.md - :parser: myst_parser.sphinx_ - :start-line: 0 - :end-line: 40 - -.. only supported in myst markdown for include... -.. :relative-docs: docs -.. :relative-images: - -.. toctree:: - :maxdepth: 1 - :caption: API usage examples - :hidden: - - api_examples/exploratory_analysis - api_examples/normalization_analysis - api_examples/ANCOVA_analysis - api_examples/enrichment_analysis - api_examples/permutation_testing - -.. toctree:: - :maxdepth: 1 - :caption: Reanalysis - :hidden: - - PXD040621 - -.. toctree:: - :maxdepth: 1 - :caption: Example dataset (preprocessing) - :hidden: - - api_examples/Download_PRIDE_data - api_examples/ovarian_cancer - -.. toctree:: - :maxdepth: 2 - :caption: Modules - :hidden: - - reference/acore - -Indices and tables ------------------- -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - - -.. toctree:: - :maxdepth: 1 - :caption: MISC: - :hidden: - - contributing - authors - history - README.md - - diff --git a/example_data/syntethic_pep_enrichment/1_analysis.py b/example_data/syntethic_pep_enrichment/1_analysis.py index e61a792..cdd997c 100644 --- a/example_data/syntethic_pep_enrichment/1_analysis.py +++ b/example_data/syntethic_pep_enrichment/1_analysis.py @@ -8,6 +8,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from scipy.stats import fisher_exact, hypergeom import acore.enrichment_analysis as ea @@ -57,8 +58,6 @@ # playing with the components of single test to understand # fisher exact test and how it works: -# %% -from scipy.stats import fisher_exact # %% num_in_foreground = 4 @@ -87,8 +86,6 @@ # - foreground population and number of peptides in foreground are important # %% -from scipy.stats import hypergeom - [M, n, N] = [background_pop, foreground_pop, num_in_background + num_in_foreground] rv = hypergeom(M, n, N) x = np.arange(0, n + 1) diff --git a/src/acore/batch_correction/__init__.py b/src/acore/batch_correction/__init__.py new file mode 100644 index 0000000..8af5447 --- /dev/null +++ b/src/acore/batch_correction/__init__.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import pandas as pd + +# PyPI pycombat as described in the docstring was never used: +# https://pypi.org/project/pycombat/ +# https://github.com/epigenelabs/inmoose is the update from combat.pycombat on PyPI +# from combat.pycombat import pycombat +from inmoose.pycombat import pycombat_norm + +__all__ = ["combat_batch_correction"] + + +def combat_batch_correction( + data: pd.DataFrame, + batch_col: str, + # index_cols: list[str], +) -> pd.DataFrame: + """ + This function corrects processed data for batch effects. For more information visit: + https://github.com/epigenelabs/inmoose + + :param data: pandas.DataFrame with samples as rows and protein identifiers as columns. + :param batch_col: column with the batch identifiers + :return: pandas.DataFrame with samples as rows and protein identifiers as columns. + Example:: + result = combat_batch_correction( + data, + batch_col="batch", + index_cols=["subject", "sample", "group"], + ) + + """ + # :param index_cols: list of columns that don't need to be corrected (i.e group) + df_corrected = pd.DataFrame() + # index_cols = [c for c in index_cols if c != batch_col] + # data = data.set_index(index_cols) # ? should this not be provided directly as data + df = data.drop(batch_col, axis=1) + df_numeric = df.select_dtypes("number") + num_batches = len(data[batch_col].unique()) + if df_numeric.empty: + raise ValueError("No numeric columns found in data.") + if not num_batches > 1: + raise ValueError("Only one batch found in data.") + info_cols = df.columns.difference(df_numeric.columns) + df_corrected = pd.DataFrame( + pycombat_norm(df_numeric.T, data[batch_col]).T, + index=df.index, + ) + df_corrected = df_corrected.join(df[info_cols]) + # df_corrected = df_corrected # .reset_index() # ? would also not reset index here + + return df_corrected diff --git a/src/acore/normalization/__init__.py b/src/acore/normalization/__init__.py index a24f5b3..4c05d14 100644 --- a/src/acore/normalization/__init__.py +++ b/src/acore/normalization/__init__.py @@ -10,12 +10,6 @@ import pandas as pd -# PyPI pycombat as described in the docstring was never used: -# https://pypi.org/project/pycombat/ -# https://github.com/epigenelabs/inmoose is the update from combat.pycombat on PyPI -# from combat.pycombat import pycombat -from inmoose.pycombat import pycombat_norm - from .strategies import ( linear_normalization, median_normalization, @@ -28,48 +22,6 @@ __all__ = ["combat_batch_correction", "normalize_data", "normalize_data_per_group"] -def combat_batch_correction( - data: pd.DataFrame, - batch_col: str, - # index_cols: list[str], -) -> pd.DataFrame: - """ - This function corrects processed data for batch effects. For more information visit: - https://github.com/epigenelabs/inmoose - - :param data: pandas.DataFrame with samples as rows and protein identifiers as columns. - :param batch_col: column with the batch identifiers - :return: pandas.DataFrame with samples as rows and protein identifiers as columns. - Example:: - result = combat_batch_correction( - data, - batch_col="batch", - index_cols=["subject", "sample", "group"], - ) - - """ - # :param index_cols: list of columns that don't need to be corrected (i.e group) - df_corrected = pd.DataFrame() - # index_cols = [c for c in index_cols if c != batch_col] - # data = data.set_index(index_cols) # ? should this not be provided directly as data - df = data.drop(batch_col, axis=1) - df_numeric = df.select_dtypes("number") - num_batches = len(data[batch_col].unique()) - if df_numeric.empty: - raise ValueError("No numeric columns found in data.") - if not num_batches > 1: - raise ValueError("Only one batch found in data.") - info_cols = df.columns.difference(df_numeric.columns) - df_corrected = pd.DataFrame( - pycombat_norm(df_numeric.T, data[batch_col]).T, - index=df.index, - ) - df_corrected = df_corrected.join(df[info_cols]) - # df_corrected = df_corrected # .reset_index() # ? would also not reset index here - - return df_corrected - - def normalize_data_per_group( data: pd.DataFrame, group: str | int | list[str | int], diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 57f5fca..e04f48f 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -1,6 +1,6 @@ import pandas as pd -from acore import normalization as normalization +from acore import batch_correction, normalization def test_combat_batch_correction(): @@ -21,7 +21,7 @@ def test_combat_batch_correction(): 3: {"a": 3.3051631491010953, "b": 4.671866609626733, "c": 9.362703235564519}, 4: {"a": 2.537904319175784, "b": 3.589114860839033, "c": 6.885375054723476}, } - actual = normalization.combat_batch_correction(data, "batch").to_dict( + actual = batch_correction.combat_batch_correction(data, "batch").to_dict( orient="index" ) assert actual == expected From 7f74321624624f6881c07639891130a667115eae Mon Sep 17 00:00:00 2001 From: angelphanth Date: Thu, 12 Mar 2026 09:31:58 +0000 Subject: [PATCH 25/25] :truck: renamed file and moved to 'transform' dir --- .../internal_functions.py => transform/compositional.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/acore/{microbiome/internal_functions.py => transform/compositional.py} (100%) diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/transform/compositional.py similarity index 100% rename from src/acore/microbiome/internal_functions.py rename to src/acore/transform/compositional.py