From 13b1b21b902c19d8c9d5047c36d98709a191242d Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 9 Sep 2025 22:37:30 +0200
Subject: [PATCH 01/25] init internal functions for permutation testing

---
 .../permutation_test/internal_functions.py    |  67 +++++++++
 src/acore/permutation_test/jaccard.py         | 141 ++++++++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100644 src/acore/permutation_test/internal_functions.py
 create mode 100644 src/acore/permutation_test/jaccard.py

diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py
new file mode 100644
index 0000000..a48937f
--- /dev/null
+++ b/src/acore/permutation_test/internal_functions.py
@@ -0,0 +1,67 @@
+import numpy as np 
+import pandas as pd
+
+
+def _permute(*groups, rng=np.random.default_rng(seed=12345)):
+    """
+    Perform a single permutation of the groups.
+
+    Parameters
+    ----------
+    *groups : iterable
+        The groups to permute.
+    rng : np.random.Generator, optional
+        The random number generator to use for shuffling.
+        Defaults to a new random number generator.
+
+    Returns
+    -------
+    list
+        The permuted groups.
+    """
+    # shuffle the combined array
+    combined = np.concatenate(groups)
+    rng.shuffle(combined)
+    # split the shuffled array into new groups of og size
+    new_groups = []
+    start = 0
+    for group in groups:
+        end = start + len(group)
+        new_groups.append(combined[start:end])
+        start = end
+    return new_groups
+
+
+def _contingency_table(*groups, to_np:bool=True)-> np.array:
+    """
+    Create a contingency table from the provided groups.
+
+    Parameters
+    ----------
+    *groups : iterable
+        The groups to include in the contingency table.
+    to_np : bool, optional
+        Whether to return the table as a NumPy array (default is True).
+
+    Returns
+    -------
+    np.array
+        The contingency table as a NumPy array or a pandas DataFrame.
+    """
+    # PRECONDITIONS
+    for group in groups:
+        if not isinstance(group, (list, pd.Series, np.ndarray)):
+            raise TypeError("Each group must be a list, pandas Series, or numpy array.")
+
+
+    # MAIN FUNCTION
+    # Create contingency table
+    table = pd.concat(
+        [pd.Series(group).value_counts() for group in groups], axis=1
+    ).fillna(0).T
+
+    if to_np:
+        return table.to_numpy()
+    else:
+        return table
+
diff --git a/src/acore/permutation_test/jaccard.py b/src/acore/permutation_test/jaccard.py
new file mode 100644
index 0000000..08e52ff
--- /dev/null
+++ b/src/acore/permutation_test/jaccard.py
@@ -0,0 +1,141 @@
+import numpy as np 
+import itertools
+from collections.abc import Iterable
+
+def jaccard_similarity(set1:set, set2:set) -> float:
+    """
+    Compute the Jaccard similarity between two sets.
+
+    Parameters
+    ----------
+    set1 : set
+        First set of nodes.
+    set2 : set
+        Second set of nodes.
+    Returns
+    -------
+    float
+        Jaccard similarity coefficient, 
+        which is the size of the intersection divided by 
+        the size of the union of the two sets. 
+
+    Example
+    -------
+    >>> set1 = {1, 2, 3}
+    >>> set2 = {2, 3, 4}
+    >>> jaccard_similarity(set1, set2)
+    0.5
+    """
+    intersection = len(set1 & set2)
+    union = len(set1 | set2)
+
+    if union !=0:
+        return intersection / union
+    else:
+        return 0
+
+
+def avg_jaccard(group: Iterable)-> tuple:
+    """
+    Compute the average pairwise Jaccard similarity within a group of graphs.
+
+    Parameters
+    ----------
+    group : Iterable
+        An iterator of iterable objects, such as a list of sets.
+
+   Returns
+    -------
+    tuple
+        A tuple containing the average Jaccard similarity and its standard deviation.
+    
+    Example
+    -------
+    >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}]
+    >>> avg_jaccard(group)
+    (0.3333333333333333, 0.0)
+    """
+
+    ## PRECONDITIONS
+    if not isinstance(group, Iterable):
+        raise TypeError("Input must be an iterable of iterables.")
+    if len(group) > 0:
+        for g in group:
+            if not isinstance(g, Iterable):
+                raise TypeError("Each element in the group must be an iterable (set, list, or tuple).")
+    elif len(group) == 0:
+        raise ValueError("Input group is empty. Cannot compute Jaccard similarity.")
+
+
+    # init the list 
+    scores = []
+
+    for g1, g2 in itertools.combinations(group, 2):
+        # get sets
+        set1 = set(g1)
+        set2 = set(g2)
+        # compute Jaccard similarity
+        scores.append(jaccard_similarity(set1, set2))
+
+    if scores:
+        # if scores is not empty, return the average
+        return np.mean(scores), np.std(scores)
+    # if scores is empty (less than 2 in group), return 0
+    else:
+        return 0, 0
+
+
+def btwn_jaccard(group1: Iterable, group2: Iterable) -> tuple:
+    """
+    Compute the average Jaccard similarity between two groups of graphs.
+
+    Parameters
+    ----------
+    group1 : Iterable
+        An iterator of iterable objects, such as a list of sets.
+    group2 : Iterable
+        Another iterator of iterable objects.
+
+    Returns
+    -------
+    tuple
+        A tuple containing the average Jaccard similarity and its standard deviation.
+    
+    Example
+    -------
+    >>> group1 = [{1, 2, 3}, {2, 3, 4}]
+    >>> group2 = [{3, 4, 5}, {4, 5, 6}]
+    >>> btwn_jaccard(group1, group2)
+    (0.3333333333333333, 0.0)
+    
+    """
+    
+    if not isinstance(group1, Iterable) or not isinstance(group2, Iterable):
+        raise TypeError("Both inputs must be iterables of iterables.")
+    if len(group1) == 0 or len(group2) == 0:
+        raise ValueError("Both input groups must be non-empty.")
+    
+    # init the list
+    scores = []
+    
+    for g1, g2 in itertools.product(group1, group2):
+        
+        # check
+        if not isinstance(g1, Iterable) or not isinstance(g2, Iterable):
+            raise TypeError("Each element in the groups must be an iterable (set, list, or tuple).")
+        
+        # get sets 
+        set1 = set(g1)
+        set2 = set(g2)
+
+        # compute Jaccard similarity
+        scores.append(jaccard_similarity(set1, set2))
+    
+    if scores:
+        # if scores is not empty, return the average
+        return np.mean(scores), np.std(scores)
+    # if scores is empty (less than 2 in group), return 0
+    else:
+        return 0, 0    
+
+

From cdb1705a4f1158d3cefda3e94340d0a902f9bbbd Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 9 Sep 2025 22:37:41 +0200
Subject: [PATCH 02/25] user facing in init file

---
 src/acore/permutation_test/__init__.py | 185 +++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 src/acore/permutation_test/__init__.py

diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
new file mode 100644
index 0000000..c0e6b8e
--- /dev/null
+++ b/src/acore/permutation_test/__init__.py
@@ -0,0 +1,185 @@
+import numpy as np
+import pandas as pd
+from scipy.stats import (
+    chi2_contingency, 
+    ttest_rel,
+    ttest_ind,
+    f_oneway,
+)
+
+from .internal_functions import _permute, _contingency_table
+
+
+def paired_permutation(
+    cond1: np.ndarray,
+    cond2: np.ndarray,
+    metric: str = "t-statistic",
+    n_permutations: int = 10000,
+    rng: np.random.Generator = np.random.default_rng(seed=12345),
+    **kwargs
+) -> dict:
+    """Perform a permutation test for paired samples."""
+    # Validate input
+    if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray):
+        raise TypeError("Input must be numpy arrays.")
+    if cond1.shape != cond2.shape:
+        raise ValueError("Input arrays must have the same shape.")
+
+    # paired differences
+    diff = cond1-cond2
+    args = [diff]
+
+    # compute observed metric
+    if metric == "t-statistic":
+        calculator = ttest_rel
+        args = [cond1, cond2]
+    elif metric == "mean":
+        calculator = np.mean
+    elif metric == "median":
+        calculator = np.median
+    elif callable(metric):
+        calculator = metric
+    else:
+        raise ValueError(
+            "Invalid metric specified. Acceptable metrics are: "
+            "'t-statistic', 'mean', 'median', or a custom function"
+            "that takes `cond1-cond2` as input."
+        )
+
+    observed_metric = calculator(*args, **kwargs)
+    if metric == "t-statistic":
+        abs_met = abs(observed_metric.statistic)
+    else:
+        abs_met = abs(observed_metric)
+
+    # Perform permutations
+    permuted_f = []
+    for _ in range(n_permutations):
+        # randomly flip direction of differences
+        permuted_diff = diff * rng.choice([-1, 1], size=diff.shape)
+        new_cond1 = np.where((permuted_diff==diff), cond1, cond2)
+        new_cond2 = np.where((permuted_diff==diff), cond2, cond1)
+        # postcondition check
+        if not (permuted_diff == (new_cond1 - new_cond2)).all():
+            raise ArithmeticError("Postcondition failed: Issue with permuted differences")
+        # prep args
+        if metric == "t-statistic":
+            new_args = [new_cond1, new_cond2]
+            new_result = abs(calculator(*new_args, **kwargs).statistic)
+        else:
+            new_args = [permuted_diff]
+            new_result = abs(calculator(*new_args, **kwargs))
+        # compute permuted metric
+        permuted_f.append(new_result)
+
+    # Compute p-value
+    p_value = np.mean(permuted_f >= abs_met)
+
+    return {
+        "metric": calculator,
+        "observed": observed_metric,
+        "p_value": p_value
+    }
+
+
+def chi2_permutation(
+    *groups,
+    n_permutations: int = 10000,
+    rng: np.random.Generator = np.random.default_rng(seed=12345)
+) -> dict:
+    """Perform a permutation test for chi-squared statistics."""
+
+    # generate contingency table
+    cont_table = _contingency_table(*groups, to_np=True)
+
+    # Compute observed chi-squared statistic
+    observed_test = chi2_contingency(cont_table)
+    observed_chi2 = observed_test.statistic
+
+    # Perform permutations
+    permuted_chi2 = []
+    for _ in range(n_permutations):
+        # shuffle
+        perm_cont_table = _contingency_table(*_permute(*groups, rng=rng))
+
+        # calculate permuted chi-squared statistic
+        permuted_chi2.append(chi2_contingency(perm_cont_table).statistic)
+
+    # Compute p-value
+    p_value = np.mean(permuted_chi2 >= observed_chi2)
+
+    return {
+        "observed": observed_test,
+        "p_value": p_value
+    }
+
+
+def indep_permutation(
+    group1: np.ndarray,
+    group2: np.ndarray,
+    metric: str = "t-statistic",
+    n_permutations: int = 10000,
+    rng: np.random.Generator = np.random.default_rng(seed=12345),
+    **kwargs
+) -> dict:
+    """
+    TODO
+    """
+
+    ## PRECONDITIONS
+    if not isinstance(group1, np.ndarray) or not isinstance(group2, np.ndarray):
+        raise TypeError("Input must be numpy arrays.")
+
+    stat = False
+    # what metric to use
+    if metric == "t-statistic":
+        calculator = ttest_ind
+        stat = True
+    elif metric == "anova":
+        calculator = f_oneway
+        stat = True
+    elif metric == "mean":
+        calculator = np.mean
+    elif metric == "median":
+        calculator = np.median
+    elif callable(metric):
+        calculator = metric
+    else:
+        raise ValueError(
+            "Invalid metric specified. Acceptable metrics are: "
+            "'t-statistic', 'mean', 'median', or a custom function"
+            "that takes each group as input."
+        )
+
+    # compute observed metric
+    if stat: 
+        observed_metric = calculator(group1, group2, **kwargs)
+    else:
+        observed_metric = abs(calculator(group1) - calculator(group2))
+
+    # Perform permutations
+    permuted_f = []
+    for _ in range(n_permutations):
+        new_group1, new_group2 = _permute(group1, group2, rng=rng)
+        # prep args
+        if stat:
+            new_result = abs(calculator(new_group1, new_group2, **kwargs).statistic)
+            abs_met = abs(observed_metric.statistic)
+        else:
+            new_result = abs(calculator(new_group1) - calculator(new_group2))
+            abs_met = abs(observed_metric)
+        # compute permuted metric
+        permuted_f.append(new_result)
+
+    # Compute p-value
+    p_value = np.mean(permuted_f >= abs_met)
+
+    return {
+        "metric": calculator,
+        "observed": observed_metric,
+        "p_value": p_value
+    }
+
+
+
+

From a13be15c4e34f356c598e1676bc94f3faca45f38 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 9 Sep 2025 22:38:00 +0200
Subject: [PATCH 03/25] :construction: some tests

---
 tests/test_permutation_test.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 tests/test_permutation_test.py

diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py
new file mode 100644
index 0000000..d7eb091
--- /dev/null
+++ b/tests/test_permutation_test.py
@@ -0,0 +1,34 @@
+from acore import permutation_test as pt
+import pytest
+import numpy as np
+
+@pytest.fixture
+def data1():
+    return np.arange(1, 10)
+
+@pytest.fixture
+def data2():
+    return np.arange(31, 40)
+
+def test_paired_permutation(data1):
+    cond1 = data1
+    cond2 = data1
+    expected = {
+        "metric": np.mean,
+        "observed": 0.0,
+        "p_value": 1.0,
+    }
+    result = pt.paired_permutation(cond1, cond2, metric=np.mean)
+    assert result["metric"] == expected["metric"]
+    assert result["observed"] == expected["observed"]
+    assert result["p_value"] == expected["p_value"]
+
+def test_paired_permutation_reject(data1, data2):
+    cond1 = data1
+    cond2 = data2
+    for stat in ["median"]:
+        result = pt.paired_permutation(cond1, cond2, metric=stat)
+        print(result)
+        assert result["p_value"] < 0.05
+
+

From 5a5a42a643f4cc53197b497d5d84dcf50b3df14c Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 10 Sep 2025 16:03:46 +0200
Subject: [PATCH 04/25] :bug: degeneracy check for paired permutations

---
 .../permutation_test/internal_functions.py     | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py
index a48937f..58b8f46 100644
--- a/src/acore/permutation_test/internal_functions.py
+++ b/src/acore/permutation_test/internal_functions.py
@@ -1,7 +1,6 @@
 import numpy as np 
 import pandas as pd
 
-
 def _permute(*groups, rng=np.random.default_rng(seed=12345)):
     """
     Perform a single permutation of the groups.
@@ -65,3 +64,20 @@ def _contingency_table(*groups, to_np:bool=True)-> np.array:
     else:
         return table
 
+
+def _check_degeneracy(array: np.ndarray) -> bool:
+    """
+    Check for degeneracy in the differences of paired samples.
+
+    Parameters
+    ----------
+    array : np.ndarray
+        The differences array. e.g. cond1 - cond2
+
+    Returns
+    -------
+    bool
+        True if the conditions are degenerate, False otherwise.
+    """
+    return np.all(array == array[0])
+

From 179ce05db36e77b99403bfdb2733217a5780b4f5 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 10 Sep 2025 16:19:49 +0200
Subject: [PATCH 05/25] use internal functions

---
 src/acore/permutation_test/__init__.py | 42 ++++++++++++++++++++------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index c0e6b8e..516b64f 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -6,8 +6,14 @@
     ttest_ind,
     f_oneway,
 )
+import warnings
+warnings.simplefilter("always", UserWarning)
 
-from .internal_functions import _permute, _contingency_table
+from .internal_functions import (
+    _permute, 
+    _contingency_table, 
+    _check_degeneracy
+)
 
 
 def paired_permutation(
@@ -72,14 +78,28 @@ def paired_permutation(
         # compute permuted metric
         permuted_f.append(new_result)
 
-    # Compute p-value
-    p_value = np.mean(permuted_f >= abs_met)
+    if _check_degeneracy(diff):
+        identical_warn = (
+            "Degenerate conditions detected (the data are identical). "
+            "Consider using a different statistical test. "
+            "Results may be unreliable."
+        )
+        warnings.warn(identical_warn)
 
-    return {
-        "metric": calculator,
-        "observed": observed_metric,
-        "p_value": p_value
-    }
+        return {
+            "metric": calculator,
+            "observed": observed_metric,
+            "p_value": np.nan,
+        }
+    else:
+        # Compute p-value
+        p_value = np.mean(permuted_f >= abs_met)
+
+        return {
+            "metric": calculator,
+            "observed": observed_metric,
+            "p_value": p_value
+        }
 
 
 def chi2_permutation(
@@ -87,8 +107,9 @@ def chi2_permutation(
     n_permutations: int = 10000,
     rng: np.random.Generator = np.random.default_rng(seed=12345)
 ) -> dict:
-    """Perform a permutation test for chi-squared statistics."""
-
+    """
+    
+    """
     # generate contingency table
     cont_table = _contingency_table(*groups, to_np=True)
 
@@ -183,3 +204,4 @@ def indep_permutation(
 
 
 
+

From a144f7e727d5adbfc94e476875d3e33c95d97df7 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 10 Sep 2025 16:22:05 +0200
Subject: [PATCH 06/25] :memo: copilot docstrings

---
 src/acore/permutation_test/__init__.py | 71 ++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 4 deletions(-)

diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index 516b64f..2a3cb9b 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -24,8 +24,32 @@ def paired_permutation(
     rng: np.random.Generator = np.random.default_rng(seed=12345),
     **kwargs
 ) -> dict:
-    """Perform a permutation test for paired samples."""
-    # Validate input
+    """
+    Perform a permutation test for paired samples.
+
+    Parameters
+    ----------
+    cond1 : np.ndarray
+        First condition (paired samples).
+    cond2 : np.ndarray
+        Second condition (paired samples).
+    metric : str or callable, optional
+        Metric to compute ('t-statistic', 'mean', 'median', or a custom function).
+    n_permutations : int, optional
+        Number of permutations to perform (default is 10000).
+    rng : np.random.Generator, optional
+        Random number generator (default is np.random.default_rng(seed=12345)).
+    **kwargs
+        Additional arguments passed to the metric function.
+
+    Returns
+    -------
+    dict
+        Dictionary with keys:
+        - 'metric': Metric function used.
+        - 'observed': Observed metric value.
+        - 'p_value': Permutation test p-value (np.nan if degenerate).
+    """    # Validate input
     if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray):
         raise TypeError("Input must be numpy arrays.")
     if cond1.shape != cond2.shape:
@@ -108,7 +132,23 @@ def chi2_permutation(
     rng: np.random.Generator = np.random.default_rng(seed=12345)
 ) -> dict:
     """
-    
+    Perform a permutation test for categorical data using the chi-squared statistic.
+
+    Parameters
+    ----------
+    *groups : array-like
+        Arrays representing categorical groups.
+    n_permutations : int, optional
+        Number of permutations to perform (default is 10000).
+    rng : np.random.Generator, optional
+        Random number generator (default is np.random.default_rng(seed=12345)).
+
+    Returns
+    -------
+    dict
+        Dictionary with keys:
+        - 'observed': Observed chi-squared test result.
+        - 'p_value': Permutation test p-value.
     """
     # generate contingency table
     cont_table = _contingency_table(*groups, to_np=True)
@@ -144,7 +184,30 @@ def indep_permutation(
     **kwargs
 ) -> dict:
     """
-    TODO
+    Perform a permutation test for independent samples.
+
+    Parameters
+    ----------
+    group1 : np.ndarray
+        First group of samples.
+    group2 : np.ndarray
+        Second group of samples.
+    metric : str or callable, optional
+        Metric to compute ('t-statistic', 'anova', 'mean', 'median', or a custom function).
+    n_permutations : int, optional
+        Number of permutations to perform (default is 10000).
+    rng : np.random.Generator, optional
+        Random number generator (default is np.random.default_rng(seed=12345)).
+    **kwargs
+        Additional arguments passed to the metric function.
+
+    Returns
+    -------
+    dict
+        Dictionary with keys:
+        - 'metric': Metric function used.
+        - 'observed': Observed metric value.
+        - 'p_value': Permutation test p-value.
     """
 
     ## PRECONDITIONS

From e66b6b337e0c92500be2c4b8cdf9813bbdd0cda6 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 10 Sep 2025 16:22:40 +0200
Subject: [PATCH 07/25] tests for accept reject null

---
 tests/test_permutation_test.py | 96 +++++++++++++++++++++++++++-------
 1 file changed, 76 insertions(+), 20 deletions(-)

diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py
index d7eb091..2372e31 100644
--- a/tests/test_permutation_test.py
+++ b/tests/test_permutation_test.py
@@ -1,34 +1,90 @@
 from acore import permutation_test as pt
 import pytest
 import numpy as np
+from scipy.stats import ttest_rel, ttest_ind, mannwhitneyu, wilcoxon
 
 @pytest.fixture
-def data1():
+def set_array():
     return np.arange(1, 10)
 
 @pytest.fixture
-def data2():
-    return np.arange(31, 40)
-
-def test_paired_permutation(data1):
-    cond1 = data1
-    cond2 = data1
-    expected = {
-        "metric": np.mean,
-        "observed": 0.0,
-        "p_value": 1.0,
-    }
+def random_low():
+    return np.random.negative_binomial(10, 0.3, 100)
+
+@pytest.fixture
+def random_high():
+    return np.random.negative_binomial(20, 0.1, 100)
+
+@pytest.fixture
+def random_choice_equal():
+    return np.random.choice(
+        ["A", "B", "C", "D"], 
+        size=100, 
+        replace=True,
+        p=[0.25, 0.25, 0.25, 0.25]
+    )
+
+@pytest.fixture
+def random_choice_unequal():
+    return np.random.choice(
+        ["A", "B", "C", "D"], 
+        size=100, 
+        replace=True,
+        p=[0.1, 0.25, 0.05, 0.6]
+    )
+
+def test_paired_permutation_same(set_array):
+    cond1 = set_array
+    cond2 = set_array
     result = pt.paired_permutation(cond1, cond2, metric=np.mean)
-    assert result["metric"] == expected["metric"]
-    assert result["observed"] == expected["observed"]
-    assert result["p_value"] == expected["p_value"]
-
-def test_paired_permutation_reject(data1, data2):
-    cond1 = data1
-    cond2 = data2
-    for stat in ["median"]:
+    assert result["metric"] == np.mean
+    assert result["observed"] == 0.0
+    assert result["p_value"] is np.nan
+
+
+def test_paired_permutation_reject(random_low, random_high):
+    cond1 = random_low
+    cond2 = random_high
+    for stat in ["t-statistic", "mean", "median"]:
         result = pt.paired_permutation(cond1, cond2, metric=stat)
         print(result)
+        print(ttest_rel(cond1, cond2))
+        assert result["p_value"] < 0.05
+
+
+def test_paired_permutation_degen(set_array):
+    for stat in ["t-statistic", "mean", "median"]: 
+        assert pt.paired_permutation(
+            set_array, 
+            set_array+30,
+            metric=stat
+        )["p_value"] is np.nan
+
+
+def test_indep_permutation_reject(random_low, random_high):
+    group1 = random_low
+    group2 = random_high
+    for stat in ["t-statistic", "anova", "median", "mean"]:
+        result = pt.indep_permutation(group1, group2, metric=stat)
         assert result["p_value"] < 0.05
 
 
+def test_indep_permutation_accept(random_low):
+    for stat in ["t-statistic", "anova", "median", "mean"]:
+        assert pt.indep_permutation(
+            random_low,
+            random_low,
+            metric=stat
+        )["p_value"] > 0.05
+
+
+def test_chi2_permutation_accept(random_choice_equal):
+    result = pt.chi2_permutation(random_choice_equal, random_choice_equal)
+    assert result["p_value"] >= 0.05
+
+
+def test_chi2_permutation_reject(random_choice_unequal, random_choice_equal):
+    result = pt.chi2_permutation(random_choice_equal, random_choice_unequal)
+    assert result["p_value"] < 0.05
+
+

From 53d5f46805d0187e2bcc6df1939c9df1bf7ad587 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 10 Sep 2025 16:44:14 +0200
Subject: [PATCH 08/25] init tutorial

---
 docs/api_examples/permutation_testing.ipynb | 375 ++++++++++++++++++++
 1 file changed, 375 insertions(+)
 create mode 100644 docs/api_examples/permutation_testing.ipynb

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
new file mode 100644
index 0000000..eff2427
--- /dev/null
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e9bdb7c9",
+   "metadata": {},
+   "source": [
+    "# Tutorial: Permutation Testing using `acore`\n",
+    "\n",
+    "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n",
+    "\n",
+    "The samples were collected from wastewaster treatment plant inffluent and effluent."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79d8b537",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "## Downloading the dataset\n",
+    "\n",
+    "The analysed samples can be downloaded via the Mgnify API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "8506ee79",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GO</th>\n",
+       "      <th>description</th>\n",
+       "      <th>category</th>\n",
+       "      <th>ERR2985255</th>\n",
+       "      <th>ERR2985256</th>\n",
+       "      <th>ERR2985257</th>\n",
+       "      <th>ERR2985258</th>\n",
+       "      <th>ERR2985259</th>\n",
+       "      <th>ERR2985260</th>\n",
+       "      <th>ERR2985261</th>\n",
+       "      <th>...</th>\n",
+       "      <th>ERR2985269</th>\n",
+       "      <th>ERR2985270</th>\n",
+       "      <th>ERR2985271</th>\n",
+       "      <th>ERR2985272</th>\n",
+       "      <th>ERR2985273</th>\n",
+       "      <th>ERR2985274</th>\n",
+       "      <th>ERR2985275</th>\n",
+       "      <th>ERR2985276</th>\n",
+       "      <th>ERR2985277</th>\n",
+       "      <th>ERR2985278</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>GO:0000001</td>\n",
+       "      <td>mitochondrion inheritance</td>\n",
+       "      <td>biological process</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>GO:0000002</td>\n",
+       "      <td>mitochondrial genome maintenance</td>\n",
+       "      <td>biological process</td>\n",
+       "      <td>0</td>\n",
+       "      <td>9</td>\n",
+       "      <td>7</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>7</td>\n",
+       "      <td>4</td>\n",
+       "      <td>...</td>\n",
+       "      <td>15</td>\n",
+       "      <td>10</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>GO:0000012</td>\n",
+       "      <td>single strand break repair</td>\n",
+       "      <td>biological process</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>GO:0000015</td>\n",
+       "      <td>phosphopyruvate hydratase complex</td>\n",
+       "      <td>cellular component</td>\n",
+       "      <td>477</td>\n",
+       "      <td>384</td>\n",
+       "      <td>500</td>\n",
+       "      <td>188</td>\n",
+       "      <td>226</td>\n",
+       "      <td>232</td>\n",
+       "      <td>301</td>\n",
+       "      <td>...</td>\n",
+       "      <td>203</td>\n",
+       "      <td>432</td>\n",
+       "      <td>152</td>\n",
+       "      <td>341</td>\n",
+       "      <td>319</td>\n",
+       "      <td>520</td>\n",
+       "      <td>290</td>\n",
+       "      <td>349</td>\n",
+       "      <td>270</td>\n",
+       "      <td>292</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>GO:0000030</td>\n",
+       "      <td>mannosyltransferase activity</td>\n",
+       "      <td>molecular function</td>\n",
+       "      <td>189</td>\n",
+       "      <td>200</td>\n",
+       "      <td>57</td>\n",
+       "      <td>67</td>\n",
+       "      <td>39</td>\n",
+       "      <td>129</td>\n",
+       "      <td>181</td>\n",
+       "      <td>...</td>\n",
+       "      <td>70</td>\n",
+       "      <td>42</td>\n",
+       "      <td>58</td>\n",
+       "      <td>73</td>\n",
+       "      <td>92</td>\n",
+       "      <td>159</td>\n",
+       "      <td>67</td>\n",
+       "      <td>179</td>\n",
+       "      <td>58</td>\n",
+       "      <td>91</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           GO                        description            category  \\\n",
+       "0  GO:0000001          mitochondrion inheritance  biological process   \n",
+       "1  GO:0000002   mitochondrial genome maintenance  biological process   \n",
+       "2  GO:0000012         single strand break repair  biological process   \n",
+       "3  GO:0000015  phosphopyruvate hydratase complex  cellular component   \n",
+       "4  GO:0000030       mannosyltransferase activity  molecular function   \n",
+       "\n",
+       "   ERR2985255  ERR2985256  ERR2985257  ERR2985258  ERR2985259  ERR2985260  \\\n",
+       "0           0           0           0           0           0           3   \n",
+       "1           0           9           7           0           2           7   \n",
+       "2           0           3           1           1           0           0   \n",
+       "3         477         384         500         188         226         232   \n",
+       "4         189         200          57          67          39         129   \n",
+       "\n",
+       "   ERR2985261  ...  ERR2985269  ERR2985270  ERR2985271  ERR2985272  \\\n",
+       "0           0  ...           1           0           1           1   \n",
+       "1           4  ...          15          10           0           0   \n",
+       "2           0  ...           0           0           1           0   \n",
+       "3         301  ...         203         432         152         341   \n",
+       "4         181  ...          70          42          58          73   \n",
+       "\n",
+       "   ERR2985273  ERR2985274  ERR2985275  ERR2985276  ERR2985277  ERR2985278  \n",
+       "0           1           2           0           0           2           2  \n",
+       "1           0           5           0           4          11           1  \n",
+       "2           0           2           1           2           1           1  \n",
+       "3         319         520         290         349         270         292  \n",
+       "4          92         159          67         179          58          91  \n",
+       "\n",
+       "[5 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import requests \n",
+    "from io import StringIO\n",
+    "import pandas as pd\n",
+    "\n",
+    "study_accession= \"MGYS00005058\"\n",
+    "\n",
+    "url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
+    "# download\n",
+    "response = requests.get(url)\n",
+    "# treat as file\n",
+    "file_like = StringIO(response.content.decode('utf-8'))\n",
+    "# read into pd df\n",
+    "df = pd.read_csv(file_like, sep='\\t')\n",
+    "# sanity check\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a9fa25c4",
+   "metadata": {},
+   "source": [
+    "For this demo we will only look at [go term 0030655](https://gowiki.tamu.edu/wiki/index.php/Category:GO:0030655_!_beta-lactam_antibiotic_catabolic_process)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0259a24f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "go = df.query(\"GO == 'GO:0030655'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "239321d2",
+   "metadata": {},
+   "source": [
+    "We need the metadata for the samples. We can also get this using the MGnify API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "id": "e7ade2e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sample metadata\n",
+    "sample_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/samples\")\n",
+    "# analysis metadata\n",
+    "analysis_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={study_accession}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 179,
+   "id": "a9cb02a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_sm = pd.DataFrame(sample_meta.json()['data'])\n",
+    "df_am = pd.DataFrame(analysis_meta.json()['data'])\n",
+    "\n",
+    "\n",
+    "#['relationships']['run']['data']['id']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 195,
+   "id": "f50b423a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 24 entries, 0 to 23\n",
+      "Data columns (total 9 columns):\n",
+      " #   Column         Non-Null Count  Dtype \n",
+      "---  ------         --------------  ----- \n",
+      " 0   type           24 non-null     object\n",
+      " 1   id_x           24 non-null     object\n",
+      " 2   attributes     24 non-null     object\n",
+      " 3   relationships  24 non-null     object\n",
+      " 4   links          24 non-null     object\n",
+      " 5   run_id         24 non-null     object\n",
+      " 6   sample_id      24 non-null     object\n",
+      " 7   id_y           24 non-null     object\n",
+      " 8   desc           24 non-null     object\n",
+      "dtypes: object(9)\n",
+      "memory usage: 1.8+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# get out needed metadata\n",
+    "df_am['run_id'] = df_am['relationships'].apply(lambda x: x['run']['data']['id'])\n",
+    "df_am['sample_id'] = df_am['relationships'].apply(lambda x: x['sample']['data']['id'])\n",
+    "df_sm['desc'] = df_sm['attributes'].apply(lambda x: x['sample-desc'])\n",
+    "\n",
+    "# join\n",
+    "df_combined = df_am.merge(df_sm[['id', 'desc']], left_on='sample_id', right_on='id', how='left')\n",
+    "df_combined.info()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 0a7e33a08d03a2bc22fbb8666e20a197bbe17fc9 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 16 Sep 2025 10:03:26 +0200
Subject: [PATCH 09/25] diff conditions were diff mgys studies

---
 docs/api_examples/permutation_testing.ipynb | 139 +++++++++++---------
 1 file changed, 75 insertions(+), 64 deletions(-)

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index eff2427..e72d3ef 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -28,10 +28,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 5,
    "id": "8506ee79",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv\n",
+      "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -234,7 +242,7 @@
        "[5 rows x 27 columns]"
       ]
      },
-     "execution_count": 87,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -244,17 +252,28 @@
     "from io import StringIO\n",
     "import pandas as pd\n",
     "\n",
-    "study_accession= \"MGYS00005058\"\n",
+    "enf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
+    "inf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n",
+    "\n",
+    "# init dict to hold dataframes\n",
+    "data = {}\n",
+    "for url in [enf_url, inf_url]:\n",
+    "    print(f\"Processing {url}\")\n",
+    "    # download\n",
+    "    response = requests.get(url)\n",
+    "    if response.status_code == 200:\n",
+    "        # treat as file\n",
+    "        file_like = StringIO(response.content.decode('utf-8'))\n",
+    "        # read into pd df\n",
+    "        df = pd.read_csv(file_like, sep='\\t')\n",
+    "        # to the dict\n",
+    "        data[url] = df\n",
+    "    else: \n",
+    "        print(response)\n",
+    "        print(url, \" download skipped\")\n",
     "\n",
-    "url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
-    "# download\n",
-    "response = requests.get(url)\n",
-    "# treat as file\n",
-    "file_like = StringIO(response.content.decode('utf-8'))\n",
-    "# read into pd df\n",
-    "df = pd.read_csv(file_like, sep='\\t')\n",
     "# sanity check\n",
-    "df.head()"
+    "data[enf_url].head()"
    ]
   },
   {
@@ -267,87 +286,79 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "0259a24f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([ 54, 101,  69,  52,  61,  68, 401,  72,  67,  80,  87,  49,  63,\n",
+       "         53, 345, 131,  67,  69,  42,  67, 107, 126,  63,  94]),\n",
+       " array([  23,   87, 1026,   63,  311,   47,   71,  121,  109,  334,   40,\n",
+       "         111,  111,   31,   21,  919,   58,   95,   98,  120,   96,   84,\n",
+       "          26,   89]))"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "go = df.query(\"GO == 'GO:0030655'\")"
+    "term = 'GO:0030655'\n",
+    "\n",
+    "inf_go = data[inf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n",
+    "enf_go = data[enf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n",
+    "\n",
+    "inf_go, enf_go"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "239321d2",
+   "id": "13469e23",
    "metadata": {},
    "source": [
-    "We need the metadata for the samples. We can also get this using the MGnify API"
+    "get metadata"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
-   "id": "e7ade2e0",
+   "execution_count": 29,
+   "id": "5eefb720",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# sample metadata\n",
-    "sample_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/studies/{study_accession}/samples\")\n",
-    "# analysis metadata\n",
-    "analysis_meta = requests.get(f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={study_accession}\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 179,
-   "id": "a9cb02a6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_sm = pd.DataFrame(sample_meta.json()['data'])\n",
-    "df_am = pd.DataFrame(analysis_meta.json()['data'])\n",
+    "from acore.permutation_test import paired_permutation\n",
     "\n",
-    "\n",
-    "#['relationships']['run']['data']['id']"
+    "# optional for reproducibility\n",
+    "import numpy as np\n",
+    "rng = np.random.default_rng(12345)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 195,
-   "id": "f50b423a",
+   "execution_count": 31,
+   "id": "9eabb911",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "RangeIndex: 24 entries, 0 to 23\n",
-      "Data columns (total 9 columns):\n",
-      " #   Column         Non-Null Count  Dtype \n",
-      "---  ------         --------------  ----- \n",
-      " 0   type           24 non-null     object\n",
-      " 1   id_x           24 non-null     object\n",
-      " 2   attributes     24 non-null     object\n",
-      " 3   relationships  24 non-null     object\n",
-      " 4   links          24 non-null     object\n",
-      " 5   run_id         24 non-null     object\n",
-      " 6   sample_id      24 non-null     object\n",
-      " 7   id_y           24 non-null     object\n",
-      " 8   desc           24 non-null     object\n",
-      "dtypes: object(9)\n",
-      "memory usage: 1.8+ KB\n"
-     ]
+     "data": {
+      "text/plain": [
+       "{'metric': <function mean at 0x130d17df0>,\n",
+       " 'observed': np.float64(-70.95833333333333),\n",
+       " 'p_value': np.float64(0.2664)}"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# get out needed metadata\n",
-    "df_am['run_id'] = df_am['relationships'].apply(lambda x: x['run']['data']['id'])\n",
-    "df_am['sample_id'] = df_am['relationships'].apply(lambda x: x['sample']['data']['id'])\n",
-    "df_sm['desc'] = df_sm['attributes'].apply(lambda x: x['sample-desc'])\n",
+    "result = paired_permutation(inf_go, enf_go, metric='mean', n_permutations=10000, rng=rng)\n",
     "\n",
-    "# join\n",
-    "df_combined = df_am.merge(df_sm[['id', 'desc']], left_on='sample_id', right_on='id', how='left')\n",
-    "df_combined.info()"
+    "result"
    ]
   }
  ],

From dc648f6647b300cba5f88104272d7c316449c022 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 16 Sep 2025 13:40:08 +0200
Subject: [PATCH 10/25] other go term

---
 docs/api_examples/permutation_testing.ipynb | 500 ++++++++++++++++++--
 1 file changed, 466 insertions(+), 34 deletions(-)

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index e72d3ef..1b0d644 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "8506ee79",
    "metadata": {},
    "outputs": [
@@ -252,8 +252,11 @@
     "from io import StringIO\n",
     "import pandas as pd\n",
     "\n",
-    "enf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
-    "inf_url = 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n",
+    "enf_accession = \"MGYS00005058\"\n",
+    "inf_accession = \"MGYS00005056\"\n",
+    "\n",
+    "enf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{enf_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
+    "inf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{inf_accession}/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n",
     "\n",
     "# init dict to hold dataframes\n",
     "data = {}\n",
@@ -276,55 +279,472 @@
     "data[enf_url].head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "84364bf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preprocessing\n",
+    "from sklearn.preprocessing import FunctionTransformer\n",
+    "import numpy as np\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "\n",
+    "coda = FunctionTransformer(lambda x: x / x.sum(axis=0), feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n",
+    "\n",
+    "def calc_clr(x):\n",
+    "    # replace zeros with small \n",
+    "    new_x = np.where(x > 0, x, 1e-10)\n",
+    "    return np.log(new_x) - np.mean(np.log(new_x), axis=0)\n",
+    "\n",
+    "clr = FunctionTransformer(calc_clr, feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n",
+    "\n",
+    "def coda_clr(df):\n",
+    "    pipe = make_pipeline(coda, clr).set_output(transform=\"pandas\")\n",
+    "    transformed = pipe.fit_transform(df.select_dtypes(include='number'))\n",
+    "    return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)\n",
+    "\n",
+    "clr_enf = coda_clr(data[enf_url])\n",
+    "clr_inf = coda_clr(data[inf_url])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a9fa25c4",
    "metadata": {},
    "source": [
-    "For this demo we will only look at [go term 0030655](https://gowiki.tamu.edu/wiki/index.php/Category:GO:0030655_!_beta-lactam_antibiotic_catabolic_process)"
+    "For this demo we will only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n",
+    "\n",
+    "It's expected that antibiotic catabolic processes to be higher in influent vs efluent"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 130,
    "id": "0259a24f",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "term = 'GO:0017001'\n",
+    "\n",
+    "inf_go = clr_inf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n",
+    "enf_go = clr_enf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n",
+    "\n",
+    "inf_go.columns = ['sample', 'abundance']\n",
+    "enf_go.columns = ['sample', 'abundance']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13469e23",
+   "metadata": {},
+   "source": [
+    "The samples are paired. We need the metadata for the samples which we can also download using the MGnify API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "a7eb5848",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing MGYS00005058\n",
+      "Processing MGYS00005056\n"
+     ]
+    }
+   ],
+   "source": [
+    "# getting sampling and analysis metadata\n",
+    "metadata = {}\n",
+    "for accession in [enf_accession, inf_accession]:\n",
+    "    # prep urls\n",
+    "    url_sam = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{accession}/samples'\n",
+    "    url_ana = f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={accession}\"\n",
+    "\n",
+    "    print(f\"Processing {accession}\")\n",
+    "    # download\n",
+    "    sam_response = requests.get(url_sam)\n",
+    "    if sam_response.status_code == 200:\n",
+    "        # json\n",
+    "        df_sam = pd.DataFrame(sam_response.json()['data'])\n",
+    "        # extract needed metadata\n",
+    "        df_sam['desc'] = df_sam['attributes'].apply(lambda x: x['sample-desc'])\n",
+    "        df_sam['sam_source'] = df_sam['desc'].str.split('.', expand=True)[0]\n",
+    "        df_sam['sam_read'] = df_sam['desc'].str.split('.', expand=True)[2]\n",
+    "    else: \n",
+    "        raise Exception(f\"Failed to download {url_sam}\")\n",
+    "        print(sam_response)\n",
+    "    ana_response = requests.get(url_ana)\n",
+    "    if ana_response.status_code == 200:\n",
+    "        # json\n",
+    "        df_ana = pd.DataFrame(ana_response.json()['data'])\n",
+    "        # extract needed metadata\n",
+    "        df_ana['run_id'] = df_ana['relationships'].apply(lambda x: x['run']['data']['id'])\n",
+    "        df_ana['sample_id'] = df_ana['relationships'].apply(lambda x: x['sample']['data']['id'])\n",
+    "    else: \n",
+    "        raise Exception(f\"Failed to download {url_ana}\")\n",
+    "        print(ana_response)\n",
+    "    # merge\n",
+    "    metadata[accession] = df_ana.merge(df_sam[['id', 'desc', 'sam_source', 'sam_read']], left_on='sample_id', right_on='id', how='left')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "id": "487cd4da",
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(array([ 54, 101,  69,  52,  61,  68, 401,  72,  67,  80,  87,  49,  63,\n",
-       "         53, 345, 131,  67,  69,  42,  67, 107, 126,  63,  94]),\n",
-       " array([  23,   87, 1026,   63,  311,   47,   71,  121,  109,  334,   40,\n",
-       "         111,  111,   31,   21,  919,   58,   95,   98,  120,   96,   84,\n",
-       "          26,   89]))"
+       "((24, 13), (24, 13))"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 132,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "term = 'GO:0030655'\n",
-    "\n",
-    "inf_go = data[inf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n",
-    "enf_go = data[enf_url].query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).to_numpy()[0]\n",
-    "\n",
-    "inf_go, enf_go"
+    "enf_all = metadata[enf_accession].merge(enf_go, left_on='run_id', right_on='sample', how='inner')\n",
+    "inf_all = metadata[inf_accession].merge(inf_go, left_on='run_id', right_on='sample', how='inner')\n",
+    "# sanity check\n",
+    "enf_all.shape, inf_all.shape"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "13469e23",
+   "cell_type": "code",
+   "execution_count": 136,
+   "id": "66056854",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>run_id_enf</th>\n",
+       "      <th>run_id_inf</th>\n",
+       "      <th>sam_source</th>\n",
+       "      <th>sam_read</th>\n",
+       "      <th>abundance_enf</th>\n",
+       "      <th>abundance_inf</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ERR2985255</td>\n",
+       "      <td>ERR2814663</td>\n",
+       "      <td>TG</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>3.257</td>\n",
+       "      <td>4.227</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ERR2985256</td>\n",
+       "      <td>ERR2814664</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.573</td>\n",
+       "      <td>3.847</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ERR2985257</td>\n",
+       "      <td>ERR2814651</td>\n",
+       "      <td>AH</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>4.299</td>\n",
+       "      <td>4.087</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ERR2985258</td>\n",
+       "      <td>ERR2814667</td>\n",
+       "      <td>TE</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.759</td>\n",
+       "      <td>3.437</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ERR2985259</td>\n",
+       "      <td>ERR2814660</td>\n",
+       "      <td>FD</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.365</td>\n",
+       "      <td>3.487</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>ERR2985260</td>\n",
+       "      <td>ERR2814652</td>\n",
+       "      <td>AD</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.123</td>\n",
+       "      <td>3.196</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>ERR2985261</td>\n",
+       "      <td>ERR2814655</td>\n",
+       "      <td>EM</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.811</td>\n",
+       "      <td>3.680</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>ERR2985262</td>\n",
+       "      <td>ERR2814647</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.648</td>\n",
+       "      <td>3.854</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>ERR2985263</td>\n",
+       "      <td>ERR2814649</td>\n",
+       "      <td>ZR</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>3.575</td>\n",
+       "      <td>3.899</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>ERR2985264</td>\n",
+       "      <td>ERR2814646</td>\n",
+       "      <td>FD</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>3.193</td>\n",
+       "      <td>3.451</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>ERR2985265</td>\n",
+       "      <td>ERR2814658</td>\n",
+       "      <td>AD</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.909</td>\n",
+       "      <td>3.303</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>ERR2985266</td>\n",
+       "      <td>ERR2814650</td>\n",
+       "      <td>ZR</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.389</td>\n",
+       "      <td>3.849</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>ERR2985267</td>\n",
+       "      <td>ERR2814648</td>\n",
+       "      <td>WT</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.807</td>\n",
+       "      <td>3.532</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>ERR2985268</td>\n",
+       "      <td>ERR2814645</td>\n",
+       "      <td>TG</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.241</td>\n",
+       "      <td>4.053</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>ERR2985269</td>\n",
+       "      <td>ERR2814656</td>\n",
+       "      <td>TU</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.930</td>\n",
+       "      <td>3.616</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>ERR2985270</td>\n",
+       "      <td>ERR2814659</td>\n",
+       "      <td>AH</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>4.401</td>\n",
+       "      <td>4.149</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>ERR2985271</td>\n",
+       "      <td>ERR2814657</td>\n",
+       "      <td>TE</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.901</td>\n",
+       "      <td>3.758</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>ERR2985272</td>\n",
+       "      <td>ERR2814666</td>\n",
+       "      <td>BE</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>4.125</td>\n",
+       "      <td>4.398</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>ERR2985273</td>\n",
+       "      <td>ERR2814665</td>\n",
+       "      <td>BE</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>4.077</td>\n",
+       "      <td>4.395</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>ERR2985274</td>\n",
+       "      <td>ERR2814662</td>\n",
+       "      <td>WT</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.109</td>\n",
+       "      <td>3.513</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>ERR2985275</td>\n",
+       "      <td>ERR2814653</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>3.000</td>\n",
+       "      <td>3.595</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>ERR2985276</td>\n",
+       "      <td>ERR2814668</td>\n",
+       "      <td>EM</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.753</td>\n",
+       "      <td>3.547</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>ERR2985277</td>\n",
+       "      <td>ERR2814661</td>\n",
+       "      <td>TU</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.877</td>\n",
+       "      <td>3.513</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>ERR2985278</td>\n",
+       "      <td>ERR2814654</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.734</td>\n",
+       "      <td>3.310</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    run_id_enf  run_id_inf sam_source                  sam_read  \\\n",
+       "0   ERR2985255  ERR2814663         TG  READ2 Taxonomy ID:256318   \n",
+       "1   ERR2985256  ERR2814664         MN  READ2 Taxonomy ID:256318   \n",
+       "2   ERR2985257  ERR2814651         AH  READ1 Taxonomy ID:256318   \n",
+       "3   ERR2985258  ERR2814667         TE  READ1 Taxonomy ID:256318   \n",
+       "4   ERR2985259  ERR2814660         FD  READ1 Taxonomy ID:256318   \n",
+       "5   ERR2985260  ERR2814652         AD  READ1 Taxonomy ID:256318   \n",
+       "6   ERR2985261  ERR2814655         EM  READ2 Taxonomy ID:256318   \n",
+       "7   ERR2985262  ERR2814647         MN  READ1 Taxonomy ID:256318   \n",
+       "8   ERR2985263  ERR2814649         ZR  READ2 Taxonomy ID:256318   \n",
+       "9   ERR2985264  ERR2814646         FD  READ2 Taxonomy ID:256318   \n",
+       "10  ERR2985265  ERR2814658         AD  READ2 Taxonomy ID:256318   \n",
+       "11  ERR2985266  ERR2814650         ZR  READ1 Taxonomy ID:256318   \n",
+       "12  ERR2985267  ERR2814648         WT  READ2 Taxonomy ID:256318   \n",
+       "13  ERR2985268  ERR2814645         TG  READ1 Taxonomy ID:256318   \n",
+       "14  ERR2985269  ERR2814656         TU  READ2 Taxonomy ID:256318   \n",
+       "15  ERR2985270  ERR2814659         AH  READ2 Taxonomy ID:256318   \n",
+       "16  ERR2985271  ERR2814657         TE  READ2 Taxonomy ID:256318   \n",
+       "17  ERR2985272  ERR2814666         BE  READ1 Taxonomy ID:256318   \n",
+       "18  ERR2985273  ERR2814665         BE  READ2 Taxonomy ID:256318   \n",
+       "19  ERR2985274  ERR2814662         WT  READ1 Taxonomy ID:256318   \n",
+       "20  ERR2985275  ERR2814653         FR  READ2 Taxonomy ID:256318   \n",
+       "21  ERR2985276  ERR2814668         EM  READ1 Taxonomy ID:256318   \n",
+       "22  ERR2985277  ERR2814661         TU  READ1 Taxonomy ID:256318   \n",
+       "23  ERR2985278  ERR2814654         FR  READ1 Taxonomy ID:256318   \n",
+       "\n",
+       "    abundance_enf  abundance_inf  \n",
+       "0           3.257          4.227  \n",
+       "1           2.573          3.847  \n",
+       "2           4.299          4.087  \n",
+       "3           2.759          3.437  \n",
+       "4           3.365          3.487  \n",
+       "5           3.123          3.196  \n",
+       "6           2.811          3.680  \n",
+       "7           2.648          3.854  \n",
+       "8           3.575          3.899  \n",
+       "9           3.193          3.451  \n",
+       "10          2.909          3.303  \n",
+       "11          3.389          3.849  \n",
+       "12          2.807          3.532  \n",
+       "13          3.241          4.053  \n",
+       "14          2.930          3.616  \n",
+       "15          4.401          4.149  \n",
+       "16          2.901          3.758  \n",
+       "17          4.125          4.398  \n",
+       "18          4.077          4.395  \n",
+       "19          3.109          3.513  \n",
+       "20          3.000          3.595  \n",
+       "21          2.753          3.547  \n",
+       "22          2.877          3.513  \n",
+       "23          2.734          3.310  "
+      ]
+     },
+     "execution_count": 136,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "get metadata"
+    "# pairing \n",
+    "df_data = enf_all.merge(\n",
+    "    inf_all,\n",
+    "    on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n",
+    ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n",
+    "\n",
+    "df_data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 134,
    "id": "5eefb720",
    "metadata": {},
    "outputs": [],
@@ -338,28 +758,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 135,
    "id": "9eabb911",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "{'metric': <function mean at 0x130d17df0>,\n",
-       " 'observed': np.float64(-70.95833333333333),\n",
-       " 'p_value': np.float64(0.2664)}"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'metric': <function ttest_rel at 0x115df23e0>, 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n",
+      "{'metric': <function mean at 0x130d17df0>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n",
+      "{'metric': <function mean at 0x130d17df0>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n"
+     ]
     }
    ],
    "source": [
-    "result = paired_permutation(inf_go, enf_go, metric='mean', n_permutations=10000, rng=rng)\n",
+    "for metric in ['t-statistic', 'mean', np.mean]:\n",
+    "    result = paired_permutation(\n",
+    "        df_data['abundance_inf'].to_numpy(),\n",
+    "        df_data['abundance_enf'].to_numpy(), \n",
+    "        metric=metric, \n",
+    "        n_permutations=10000, \n",
+    "        rng=rng\n",
+    "    )\n",
     "\n",
-    "result"
+    "    print(result)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9da7d601",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 5492203774f5e18d74e0038270c48310e8d907ec Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 14 Oct 2025 12:52:45 +0200
Subject: [PATCH 11/25] :construction: instead saving processed dataset to
 example_data/mgnify

---
 docs/api_examples/permutation_testing.ipynb   | 331 +-----------------
 .../Ju2018_GO0017001_enf_inf_paired.csv       |  25 ++
 src/acore/microbiome/internal_functions.py    |  54 +++
 3 files changed, 90 insertions(+), 320 deletions(-)
 create mode 100644 example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
 create mode 100644 src/acore/microbiome/internal_functions.py

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index 1b0d644..03f82dd 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -9,7 +9,7 @@
     "\n",
     "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n",
     "\n",
-    "The samples were collected from wastewaster treatment plant inffluent and effluent."
+    "The samples were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n"
    ]
   },
   {
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "8506ee79",
    "metadata": {},
    "outputs": [
@@ -242,7 +242,7 @@
        "[5 rows x 27 columns]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -281,7 +281,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 4,
    "id": "84364bf5",
    "metadata": {},
    "outputs": [],
@@ -321,7 +321,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 5,
    "id": "0259a24f",
    "metadata": {},
    "outputs": [],
@@ -345,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 6,
    "id": "a7eb5848",
    "metadata": {},
    "outputs": [
@@ -395,7 +395,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 7,
    "id": "487cd4da",
    "metadata": {},
    "outputs": [
@@ -405,7 +405,7 @@
        "((24, 13), (24, 13))"
       ]
      },
-     "execution_count": 132,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -419,319 +419,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": null,
    "id": "66056854",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>run_id_enf</th>\n",
-       "      <th>run_id_inf</th>\n",
-       "      <th>sam_source</th>\n",
-       "      <th>sam_read</th>\n",
-       "      <th>abundance_enf</th>\n",
-       "      <th>abundance_inf</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ERR2985255</td>\n",
-       "      <td>ERR2814663</td>\n",
-       "      <td>TG</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>3.257</td>\n",
-       "      <td>4.227</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ERR2985256</td>\n",
-       "      <td>ERR2814664</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.573</td>\n",
-       "      <td>3.847</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ERR2985257</td>\n",
-       "      <td>ERR2814651</td>\n",
-       "      <td>AH</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>4.299</td>\n",
-       "      <td>4.087</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ERR2985258</td>\n",
-       "      <td>ERR2814667</td>\n",
-       "      <td>TE</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>2.759</td>\n",
-       "      <td>3.437</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ERR2985259</td>\n",
-       "      <td>ERR2814660</td>\n",
-       "      <td>FD</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>3.365</td>\n",
-       "      <td>3.487</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>ERR2985260</td>\n",
-       "      <td>ERR2814652</td>\n",
-       "      <td>AD</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>3.123</td>\n",
-       "      <td>3.196</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>ERR2985261</td>\n",
-       "      <td>ERR2814655</td>\n",
-       "      <td>EM</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.811</td>\n",
-       "      <td>3.680</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>ERR2985262</td>\n",
-       "      <td>ERR2814647</td>\n",
-       "      <td>MN</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>2.648</td>\n",
-       "      <td>3.854</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>ERR2985263</td>\n",
-       "      <td>ERR2814649</td>\n",
-       "      <td>ZR</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>3.575</td>\n",
-       "      <td>3.899</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>ERR2985264</td>\n",
-       "      <td>ERR2814646</td>\n",
-       "      <td>FD</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>3.193</td>\n",
-       "      <td>3.451</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>ERR2985265</td>\n",
-       "      <td>ERR2814658</td>\n",
-       "      <td>AD</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.909</td>\n",
-       "      <td>3.303</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>ERR2985266</td>\n",
-       "      <td>ERR2814650</td>\n",
-       "      <td>ZR</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>3.389</td>\n",
-       "      <td>3.849</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>ERR2985267</td>\n",
-       "      <td>ERR2814648</td>\n",
-       "      <td>WT</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.807</td>\n",
-       "      <td>3.532</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>ERR2985268</td>\n",
-       "      <td>ERR2814645</td>\n",
-       "      <td>TG</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>3.241</td>\n",
-       "      <td>4.053</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>ERR2985269</td>\n",
-       "      <td>ERR2814656</td>\n",
-       "      <td>TU</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.930</td>\n",
-       "      <td>3.616</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>ERR2985270</td>\n",
-       "      <td>ERR2814659</td>\n",
-       "      <td>AH</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>4.401</td>\n",
-       "      <td>4.149</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>ERR2985271</td>\n",
-       "      <td>ERR2814657</td>\n",
-       "      <td>TE</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>2.901</td>\n",
-       "      <td>3.758</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>ERR2985272</td>\n",
-       "      <td>ERR2814666</td>\n",
-       "      <td>BE</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>4.125</td>\n",
-       "      <td>4.398</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>ERR2985273</td>\n",
-       "      <td>ERR2814665</td>\n",
-       "      <td>BE</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>4.077</td>\n",
-       "      <td>4.395</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>ERR2985274</td>\n",
-       "      <td>ERR2814662</td>\n",
-       "      <td>WT</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>3.109</td>\n",
-       "      <td>3.513</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>ERR2985275</td>\n",
-       "      <td>ERR2814653</td>\n",
-       "      <td>FR</td>\n",
-       "      <td>READ2 Taxonomy ID:256318</td>\n",
-       "      <td>3.000</td>\n",
-       "      <td>3.595</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21</th>\n",
-       "      <td>ERR2985276</td>\n",
-       "      <td>ERR2814668</td>\n",
-       "      <td>EM</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>2.753</td>\n",
-       "      <td>3.547</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>ERR2985277</td>\n",
-       "      <td>ERR2814661</td>\n",
-       "      <td>TU</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>2.877</td>\n",
-       "      <td>3.513</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>ERR2985278</td>\n",
-       "      <td>ERR2814654</td>\n",
-       "      <td>FR</td>\n",
-       "      <td>READ1 Taxonomy ID:256318</td>\n",
-       "      <td>2.734</td>\n",
-       "      <td>3.310</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    run_id_enf  run_id_inf sam_source                  sam_read  \\\n",
-       "0   ERR2985255  ERR2814663         TG  READ2 Taxonomy ID:256318   \n",
-       "1   ERR2985256  ERR2814664         MN  READ2 Taxonomy ID:256318   \n",
-       "2   ERR2985257  ERR2814651         AH  READ1 Taxonomy ID:256318   \n",
-       "3   ERR2985258  ERR2814667         TE  READ1 Taxonomy ID:256318   \n",
-       "4   ERR2985259  ERR2814660         FD  READ1 Taxonomy ID:256318   \n",
-       "5   ERR2985260  ERR2814652         AD  READ1 Taxonomy ID:256318   \n",
-       "6   ERR2985261  ERR2814655         EM  READ2 Taxonomy ID:256318   \n",
-       "7   ERR2985262  ERR2814647         MN  READ1 Taxonomy ID:256318   \n",
-       "8   ERR2985263  ERR2814649         ZR  READ2 Taxonomy ID:256318   \n",
-       "9   ERR2985264  ERR2814646         FD  READ2 Taxonomy ID:256318   \n",
-       "10  ERR2985265  ERR2814658         AD  READ2 Taxonomy ID:256318   \n",
-       "11  ERR2985266  ERR2814650         ZR  READ1 Taxonomy ID:256318   \n",
-       "12  ERR2985267  ERR2814648         WT  READ2 Taxonomy ID:256318   \n",
-       "13  ERR2985268  ERR2814645         TG  READ1 Taxonomy ID:256318   \n",
-       "14  ERR2985269  ERR2814656         TU  READ2 Taxonomy ID:256318   \n",
-       "15  ERR2985270  ERR2814659         AH  READ2 Taxonomy ID:256318   \n",
-       "16  ERR2985271  ERR2814657         TE  READ2 Taxonomy ID:256318   \n",
-       "17  ERR2985272  ERR2814666         BE  READ1 Taxonomy ID:256318   \n",
-       "18  ERR2985273  ERR2814665         BE  READ2 Taxonomy ID:256318   \n",
-       "19  ERR2985274  ERR2814662         WT  READ1 Taxonomy ID:256318   \n",
-       "20  ERR2985275  ERR2814653         FR  READ2 Taxonomy ID:256318   \n",
-       "21  ERR2985276  ERR2814668         EM  READ1 Taxonomy ID:256318   \n",
-       "22  ERR2985277  ERR2814661         TU  READ1 Taxonomy ID:256318   \n",
-       "23  ERR2985278  ERR2814654         FR  READ1 Taxonomy ID:256318   \n",
-       "\n",
-       "    abundance_enf  abundance_inf  \n",
-       "0           3.257          4.227  \n",
-       "1           2.573          3.847  \n",
-       "2           4.299          4.087  \n",
-       "3           2.759          3.437  \n",
-       "4           3.365          3.487  \n",
-       "5           3.123          3.196  \n",
-       "6           2.811          3.680  \n",
-       "7           2.648          3.854  \n",
-       "8           3.575          3.899  \n",
-       "9           3.193          3.451  \n",
-       "10          2.909          3.303  \n",
-       "11          3.389          3.849  \n",
-       "12          2.807          3.532  \n",
-       "13          3.241          4.053  \n",
-       "14          2.930          3.616  \n",
-       "15          4.401          4.149  \n",
-       "16          2.901          3.758  \n",
-       "17          4.125          4.398  \n",
-       "18          4.077          4.395  \n",
-       "19          3.109          3.513  \n",
-       "20          3.000          3.595  \n",
-       "21          2.753          3.547  \n",
-       "22          2.877          3.513  \n",
-       "23          2.734          3.310  "
-      ]
-     },
-     "execution_count": 136,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# pairing \n",
     "df_data = enf_all.merge(\n",
@@ -739,7 +430,7 @@
     "    on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n",
     ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n",
     "\n",
-    "df_data"
+    "df_data.to_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv', index=False)"
    ]
   },
   {
diff --git a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
new file mode 100644
index 0000000..07477c5
--- /dev/null
+++ b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
@@ -0,0 +1,25 @@
+run_id_enf,run_id_inf,sam_source,sam_read,abundance_enf,abundance_inf
+ERR2985255,ERR2814663,TG,READ2 Taxonomy ID:256318,3.2572833167019404,4.226818522502555
+ERR2985256,ERR2814664,MN,READ2 Taxonomy ID:256318,2.5728406558389647,3.8471905353288207
+ERR2985257,ERR2814651,AH,READ1 Taxonomy ID:256318,4.298777376040054,4.08684074171776
+ERR2985258,ERR2814667,TE,READ1 Taxonomy ID:256318,2.758981522176203,3.4367515193484355
+ERR2985259,ERR2814660,FD,READ1 Taxonomy ID:256318,3.3646753829076825,3.4866734935494126
+ERR2985260,ERR2814652,AD,READ1 Taxonomy ID:256318,3.122830873754367,3.1959892495165043
+ERR2985261,ERR2814655,EM,READ2 Taxonomy ID:256318,2.8111912227938394,3.6796595362263957
+ERR2985262,ERR2814647,MN,READ1 Taxonomy ID:256318,2.64787572226993,3.854349549040254
+ERR2985263,ERR2814649,ZR,READ2 Taxonomy ID:256318,3.5745130928438336,3.898617360802433
+ERR2985264,ERR2814646,FD,READ2 Taxonomy ID:256318,3.1932134470444886,3.4511721120494254
+ERR2985265,ERR2814658,AD,READ2 Taxonomy ID:256318,2.9090652423743357,3.3030032029809693
+ERR2985266,ERR2814650,ZR,READ1 Taxonomy ID:256318,3.3891206874721362,3.84895839035568
+ERR2985267,ERR2814648,WT,READ2 Taxonomy ID:256318,2.8069435917365073,3.5322169927895697
+ERR2985268,ERR2814645,TG,READ1 Taxonomy ID:256318,3.241378979031497,4.052892167654379
+ERR2985269,ERR2814656,TU,READ2 Taxonomy ID:256318,2.929680670135557,3.616255621958018
+ERR2985270,ERR2814659,AH,READ2 Taxonomy ID:256318,4.400504997765587,4.149306529774245
+ERR2985271,ERR2814657,TE,READ2 Taxonomy ID:256318,2.901002904935801,3.75827367299984
+ERR2985272,ERR2814666,BE,READ1 Taxonomy ID:256318,4.125262599471004,4.398085076927169
+ERR2985273,ERR2814665,BE,READ2 Taxonomy ID:256318,4.076901525292074,4.39475485971623
+ERR2985274,ERR2814662,WT,READ1 Taxonomy ID:256318,3.1091661937084183,3.5130581102452503
+ERR2985275,ERR2814653,FR,READ2 Taxonomy ID:256318,3.0000096382997192,3.594857935168328
+ERR2985276,ERR2814668,EM,READ1 Taxonomy ID:256318,2.752778722206287,3.547368740393445
+ERR2985277,ERR2814661,TU,READ1 Taxonomy ID:256318,2.8771702480026953,3.5132674150221757
+ERR2985278,ERR2814654,FR,READ1 Taxonomy ID:256318,2.7336315345563857,3.310422165293325
diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/microbiome/internal_functions.py
new file mode 100644
index 0000000..1afe23c
--- /dev/null
+++ b/src/acore/microbiome/internal_functions.py
@@ -0,0 +1,54 @@
+# preprocessing
+from sklearn.preprocessing import FunctionTransformer
+import numpy as np
+from sklearn.pipeline import make_pipeline
+import pandas as pd
+
+def calc_clr(x):
+    """
+    Calculate the centered log-ratio (CLR) transformation.
+    
+    Parameters
+    ----------
+    x : array-like
+        Input data to transform.
+
+    Returns
+    -------
+    array-like
+        CLR transformed data.
+    """
+    # replace zeros with small 
+    new_x = np.where(x > 0, x, 1e-10)
+    return np.log(new_x) - np.mean(np.log(new_x), axis=0)
+
+
+def coda_clr(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Apply CoDA and CLR transformations to the numeric columns of a DataFrame.
+    Non-numeric columns are retained without transformation.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input DataFrame with numeric and non-numeric columns.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with transformed numeric columns and original non-numeric columns.
+    """
+    # Define the CoDA and CLR transformers
+    coda = FunctionTransformer(
+        lambda x: x / x.sum(axis=0), 
+        feature_names_out="one-to-one"
+    ).set_output(transform="pandas")
+    clr = FunctionTransformer(
+        calc_clr, 
+        feature_names_out="one-to-one"
+    ).set_output(transform="pandas")
+    # Create a pipeline with CoDA and CLR transformations
+    pipe = make_pipeline(coda, clr).set_output(transform="pandas")
+    # Apply the pipeline to numeric columns and concatenate with non-numeric columns
+    transformed = pipe.fit_transform(df.select_dtypes(include='number'))
+    return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)
\ No newline at end of file

From 8ebab69a76582941aa4929777b1dff2051cded95 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 14 Oct 2025 17:06:28 +0200
Subject: [PATCH 12/25] :memo:docs: Update API examples for permutation testing

- saved processed dataset to example_data
- removed data cleaning code from notebook
- added descriptions and permutation explanation in markdown
---
 docs/api_examples/permutation_testing.ipynb   | 471 +++++-------------
 docs/index.rst                                |   1 +
 .../Ju2018_GO0017001_enf_inf_paired.csv       |   2 +-
 3 files changed, 113 insertions(+), 361 deletions(-)

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index 03f82dd..de1b503 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -9,37 +9,56 @@
     "\n",
     "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n",
     "\n",
-    "The samples were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n"
+    "The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n",
+    "\n",
+    "For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. \n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "79d8b537",
-   "metadata": {
-    "vscode": {
-     "languageId": "markdown"
-    }
-   },
+   "id": "dc9f17b2",
+   "metadata": {},
    "source": [
-    "## Downloading the dataset\n",
+    "## Data preparation details\n",
+    "\n",
+    "### Downloading\n",
+    "The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing.\n",
+    "\n",
+    "### Preprocessing of abundances\n",
+    "- To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. \n",
+    "- The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do\n",
     "\n",
-    "The analysed samples can be downloaded via the Mgnify API."
+    "### Preprocessing of the metadata \n",
+    "- the sample metadata needed for this demo (sampling location) were available in their \"sample-desc\" \n",
+    "- the sample-desc for each sample in both INF and EFF were parsed and used for pairing off\n",
+    "\n",
+    "### Subset of data for demo\n",
+    "- For this demo we only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n",
+    "- It's expected that antibiotic catabolic processes to be higher in INF vs EFF\n",
+    "\n",
+    "### Saving the demo dataset\n",
+    "This example subset of data was saved to a CSV, ./example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv. The data dictionary is below:\n",
+    "\n",
+    "| column            | description                                                                                                       | dtype |\n",
+    "|-------------------|-------------------------------------------------------------------------------------------------------------------|-------|\n",
+    "| eff_id            | The run id for the mgnify analysis of the effluent sample.                                                        | str   |\n",
+    "| inf_id            | The run id for the mgnify analysis of the influent sample.                                                        | str   |\n",
+    "| sampling_location | [The ISO 3166-1 alpha-2 code](http://iso.org/obp/ui/#iso:pub:PUB500001:en) for the country where the sample was from. | str   |\n",
+    "| sampling_read     | Replicates?                                                                                                       | str   |\n",
+    "| eff_abundance     | The relative abundance of the GO term for a given effluent sample following preprocessing (i.e., CoDA and CLR)    | float |\n",
+    "| inf_abundance     | The relative abundance of the GO term for a given influent sample following preprocessing (i.e., CoDA and CLR)    | float |\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "We will now proceed with reading in the prepared dataset. "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "8506ee79",
+   "id": "00d8f038",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005058/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv\n",
-      "Processing https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005056/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -61,185 +80,78 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>GO</th>\n",
-       "      <th>description</th>\n",
-       "      <th>category</th>\n",
-       "      <th>ERR2985255</th>\n",
-       "      <th>ERR2985256</th>\n",
-       "      <th>ERR2985257</th>\n",
-       "      <th>ERR2985258</th>\n",
-       "      <th>ERR2985259</th>\n",
-       "      <th>ERR2985260</th>\n",
-       "      <th>ERR2985261</th>\n",
-       "      <th>...</th>\n",
-       "      <th>ERR2985269</th>\n",
-       "      <th>ERR2985270</th>\n",
-       "      <th>ERR2985271</th>\n",
-       "      <th>ERR2985272</th>\n",
-       "      <th>ERR2985273</th>\n",
-       "      <th>ERR2985274</th>\n",
-       "      <th>ERR2985275</th>\n",
-       "      <th>ERR2985276</th>\n",
-       "      <th>ERR2985277</th>\n",
-       "      <th>ERR2985278</th>\n",
+       "      <th>eff_id</th>\n",
+       "      <th>inf_id</th>\n",
+       "      <th>sampling_location</th>\n",
+       "      <th>sampling_read</th>\n",
+       "      <th>eff_abundance</th>\n",
+       "      <th>inf_abundance</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>GO:0000001</td>\n",
-       "      <td>mitochondrion inheritance</td>\n",
-       "      <td>biological process</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2</td>\n",
+       "      <td>ERR2985255</td>\n",
+       "      <td>ERR2814663</td>\n",
+       "      <td>TG</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>3.257283</td>\n",
+       "      <td>4.226819</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>GO:0000002</td>\n",
-       "      <td>mitochondrial genome maintenance</td>\n",
-       "      <td>biological process</td>\n",
-       "      <td>0</td>\n",
-       "      <td>9</td>\n",
-       "      <td>7</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>7</td>\n",
-       "      <td>4</td>\n",
-       "      <td>...</td>\n",
-       "      <td>15</td>\n",
-       "      <td>10</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>11</td>\n",
-       "      <td>1</td>\n",
+       "      <td>ERR2985256</td>\n",
+       "      <td>ERR2814664</td>\n",
+       "      <td>MN</td>\n",
+       "      <td>READ2 Taxonomy ID:256318</td>\n",
+       "      <td>2.572841</td>\n",
+       "      <td>3.847191</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>GO:0000012</td>\n",
-       "      <td>single strand break repair</td>\n",
-       "      <td>biological process</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
+       "      <td>ERR2985257</td>\n",
+       "      <td>ERR2814651</td>\n",
+       "      <td>AH</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>4.298777</td>\n",
+       "      <td>4.086841</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>GO:0000015</td>\n",
-       "      <td>phosphopyruvate hydratase complex</td>\n",
-       "      <td>cellular component</td>\n",
-       "      <td>477</td>\n",
-       "      <td>384</td>\n",
-       "      <td>500</td>\n",
-       "      <td>188</td>\n",
-       "      <td>226</td>\n",
-       "      <td>232</td>\n",
-       "      <td>301</td>\n",
-       "      <td>...</td>\n",
-       "      <td>203</td>\n",
-       "      <td>432</td>\n",
-       "      <td>152</td>\n",
-       "      <td>341</td>\n",
-       "      <td>319</td>\n",
-       "      <td>520</td>\n",
-       "      <td>290</td>\n",
-       "      <td>349</td>\n",
-       "      <td>270</td>\n",
-       "      <td>292</td>\n",
+       "      <td>ERR2985258</td>\n",
+       "      <td>ERR2814667</td>\n",
+       "      <td>TE</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>2.758982</td>\n",
+       "      <td>3.436752</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>GO:0000030</td>\n",
-       "      <td>mannosyltransferase activity</td>\n",
-       "      <td>molecular function</td>\n",
-       "      <td>189</td>\n",
-       "      <td>200</td>\n",
-       "      <td>57</td>\n",
-       "      <td>67</td>\n",
-       "      <td>39</td>\n",
-       "      <td>129</td>\n",
-       "      <td>181</td>\n",
-       "      <td>...</td>\n",
-       "      <td>70</td>\n",
-       "      <td>42</td>\n",
-       "      <td>58</td>\n",
-       "      <td>73</td>\n",
-       "      <td>92</td>\n",
-       "      <td>159</td>\n",
-       "      <td>67</td>\n",
-       "      <td>179</td>\n",
-       "      <td>58</td>\n",
-       "      <td>91</td>\n",
+       "      <td>ERR2985259</td>\n",
+       "      <td>ERR2814660</td>\n",
+       "      <td>FD</td>\n",
+       "      <td>READ1 Taxonomy ID:256318</td>\n",
+       "      <td>3.364675</td>\n",
+       "      <td>3.486673</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>5 rows × 27 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "           GO                        description            category  \\\n",
-       "0  GO:0000001          mitochondrion inheritance  biological process   \n",
-       "1  GO:0000002   mitochondrial genome maintenance  biological process   \n",
-       "2  GO:0000012         single strand break repair  biological process   \n",
-       "3  GO:0000015  phosphopyruvate hydratase complex  cellular component   \n",
-       "4  GO:0000030       mannosyltransferase activity  molecular function   \n",
-       "\n",
-       "   ERR2985255  ERR2985256  ERR2985257  ERR2985258  ERR2985259  ERR2985260  \\\n",
-       "0           0           0           0           0           0           3   \n",
-       "1           0           9           7           0           2           7   \n",
-       "2           0           3           1           1           0           0   \n",
-       "3         477         384         500         188         226         232   \n",
-       "4         189         200          57          67          39         129   \n",
-       "\n",
-       "   ERR2985261  ...  ERR2985269  ERR2985270  ERR2985271  ERR2985272  \\\n",
-       "0           0  ...           1           0           1           1   \n",
-       "1           4  ...          15          10           0           0   \n",
-       "2           0  ...           0           0           1           0   \n",
-       "3         301  ...         203         432         152         341   \n",
-       "4         181  ...          70          42          58          73   \n",
+       "       eff_id      inf_id sampling_location             sampling_read  \\\n",
+       "0  ERR2985255  ERR2814663                TG  READ2 Taxonomy ID:256318   \n",
+       "1  ERR2985256  ERR2814664                MN  READ2 Taxonomy ID:256318   \n",
+       "2  ERR2985257  ERR2814651                AH  READ1 Taxonomy ID:256318   \n",
+       "3  ERR2985258  ERR2814667                TE  READ1 Taxonomy ID:256318   \n",
+       "4  ERR2985259  ERR2814660                FD  READ1 Taxonomy ID:256318   \n",
        "\n",
-       "   ERR2985273  ERR2985274  ERR2985275  ERR2985276  ERR2985277  ERR2985278  \n",
-       "0           1           2           0           0           2           2  \n",
-       "1           0           5           0           4          11           1  \n",
-       "2           0           2           1           2           1           1  \n",
-       "3         319         520         290         349         270         292  \n",
-       "4          92         159          67         179          58          91  \n",
-       "\n",
-       "[5 rows x 27 columns]"
+       "   eff_abundance  inf_abundance  \n",
+       "0       3.257283       4.226819  \n",
+       "1       2.572841       3.847191  \n",
+       "2       4.298777       4.086841  \n",
+       "3       2.758982       3.436752  \n",
+       "4       3.364675       3.486673  "
       ]
      },
      "execution_count": 1,
@@ -248,208 +160,44 @@
     }
    ],
    "source": [
-    "import requests \n",
-    "from io import StringIO\n",
-    "import pandas as pd\n",
-    "\n",
-    "enf_accession = \"MGYS00005058\"\n",
-    "inf_accession = \"MGYS00005056\"\n",
-    "\n",
-    "enf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{enf_accession}/pipelines/4.1/file/ERP112696_GO_abundances_v4.1.tsv'\n",
-    "inf_url = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{inf_accession}/pipelines/4.1/file/ERP111072_GO_abundances_v4.1.tsv'\n",
+    "import pandas as pd \n",
     "\n",
-    "# init dict to hold dataframes\n",
-    "data = {}\n",
-    "for url in [enf_url, inf_url]:\n",
-    "    print(f\"Processing {url}\")\n",
-    "    # download\n",
-    "    response = requests.get(url)\n",
-    "    if response.status_code == 200:\n",
-    "        # treat as file\n",
-    "        file_like = StringIO(response.content.decode('utf-8'))\n",
-    "        # read into pd df\n",
-    "        df = pd.read_csv(file_like, sep='\\t')\n",
-    "        # to the dict\n",
-    "        data[url] = df\n",
-    "    else: \n",
-    "        print(response)\n",
-    "        print(url, \" download skipped\")\n",
-    "\n",
-    "# sanity check\n",
-    "data[enf_url].head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "84364bf5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# preprocessing\n",
-    "from sklearn.preprocessing import FunctionTransformer\n",
-    "import numpy as np\n",
-    "from sklearn.pipeline import make_pipeline\n",
-    "\n",
-    "coda = FunctionTransformer(lambda x: x / x.sum(axis=0), feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n",
-    "\n",
-    "def calc_clr(x):\n",
-    "    # replace zeros with small \n",
-    "    new_x = np.where(x > 0, x, 1e-10)\n",
-    "    return np.log(new_x) - np.mean(np.log(new_x), axis=0)\n",
-    "\n",
-    "clr = FunctionTransformer(calc_clr, feature_names_out=\"one-to-one\").set_output(transform=\"pandas\")\n",
-    "\n",
-    "def coda_clr(df):\n",
-    "    pipe = make_pipeline(coda, clr).set_output(transform=\"pandas\")\n",
-    "    transformed = pipe.fit_transform(df.select_dtypes(include='number'))\n",
-    "    return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)\n",
-    "\n",
-    "clr_enf = coda_clr(data[enf_url])\n",
-    "clr_inf = coda_clr(data[inf_url])"
+    "df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')\n",
+    "# sanity check \n",
+    "df_data.head()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "a9fa25c4",
+   "id": "7cf6c3d0",
    "metadata": {},
    "source": [
-    "For this demo we will only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)\n",
+    "## The permutation test\n",
     "\n",
-    "It's expected that antibiotic catabolic processes to be higher in influent vs efluent"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "0259a24f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "term = 'GO:0017001'\n",
-    "\n",
-    "inf_go = clr_inf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n",
-    "enf_go = clr_enf.query(\"GO == @term\").drop(columns=['GO', 'description', 'category']).T.reset_index()\n",
-    "\n",
-    "inf_go.columns = ['sample', 'abundance']\n",
-    "enf_go.columns = ['sample', 'abundance']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "13469e23",
-   "metadata": {},
-   "source": [
-    "The samples are paired. We need the metadata for the samples which we can also download using the MGnify API."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "a7eb5848",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processing MGYS00005058\n",
-      "Processing MGYS00005056\n"
-     ]
-    }
-   ],
-   "source": [
-    "# getting sampling and analysis metadata\n",
-    "metadata = {}\n",
-    "for accession in [enf_accession, inf_accession]:\n",
-    "    # prep urls\n",
-    "    url_sam = f'https://www.ebi.ac.uk/metagenomics/api/v1/studies/{accession}/samples'\n",
-    "    url_ana = f\"https://www.ebi.ac.uk/metagenomics/api/v1/analyses?study_accession={accession}\"\n",
+    "Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. \n",
     "\n",
-    "    print(f\"Processing {accession}\")\n",
-    "    # download\n",
-    "    sam_response = requests.get(url_sam)\n",
-    "    if sam_response.status_code == 200:\n",
-    "        # json\n",
-    "        df_sam = pd.DataFrame(sam_response.json()['data'])\n",
-    "        # extract needed metadata\n",
-    "        df_sam['desc'] = df_sam['attributes'].apply(lambda x: x['sample-desc'])\n",
-    "        df_sam['sam_source'] = df_sam['desc'].str.split('.', expand=True)[0]\n",
-    "        df_sam['sam_read'] = df_sam['desc'].str.split('.', expand=True)[2]\n",
-    "    else: \n",
-    "        raise Exception(f\"Failed to download {url_sam}\")\n",
-    "        print(sam_response)\n",
-    "    ana_response = requests.get(url_ana)\n",
-    "    if ana_response.status_code == 200:\n",
-    "        # json\n",
-    "        df_ana = pd.DataFrame(ana_response.json()['data'])\n",
-    "        # extract needed metadata\n",
-    "        df_ana['run_id'] = df_ana['relationships'].apply(lambda x: x['run']['data']['id'])\n",
-    "        df_ana['sample_id'] = df_ana['relationships'].apply(lambda x: x['sample']['data']['id'])\n",
-    "    else: \n",
-    "        raise Exception(f\"Failed to download {url_ana}\")\n",
-    "        print(ana_response)\n",
-    "    # merge\n",
-    "    metadata[accession] = df_ana.merge(df_sam[['id', 'desc', 'sam_source', 'sam_read']], left_on='sample_id', right_on='id', how='left')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "487cd4da",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "((24, 13), (24, 13))"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "enf_all = metadata[enf_accession].merge(enf_go, left_on='run_id', right_on='sample', how='inner')\n",
-    "inf_all = metadata[inf_accession].merge(inf_go, left_on='run_id', right_on='sample', how='inner')\n",
-    "# sanity check\n",
-    "enf_all.shape, inf_all.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "66056854",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pairing \n",
-    "df_data = enf_all.merge(\n",
-    "    inf_all,\n",
-    "    on=['sam_source', 'sam_read'], suffixes=('_enf', '_inf')\n",
-    ")[['run_id_enf', 'run_id_inf', 'sam_source', 'sam_read', 'abundance_enf', 'abundance_inf']]\n",
+    "The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. \n",
     "\n",
-    "df_data.to_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv', index=False)"
+    "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 2,
    "id": "5eefb720",
    "metadata": {},
    "outputs": [],
    "source": [
     "from acore.permutation_test import paired_permutation\n",
     "\n",
-    "# optional for reproducibility\n",
+    "# optional choice of random number generatorfor repro\n",
     "import numpy as np\n",
     "rng = np.random.default_rng(12345)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 3,
    "id": "9eabb911",
    "metadata": {},
    "outputs": [
@@ -457,32 +205,35 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'metric': <function ttest_rel at 0x115df23e0>, 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n",
-      "{'metric': <function mean at 0x130d17df0>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n",
-      "{'metric': <function mean at 0x130d17df0>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n"
+      "{'metric': <function ttest_rel at 0x11014d8a0>, 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n",
+      "{'metric': <function mean at 0x1090f7a70>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n",
+      "{'metric': <function mean at 0x1090f7a70>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n"
      ]
     }
    ],
    "source": [
+    "# trying diff metrics to demo functionality also\n",
     "for metric in ['t-statistic', 'mean', np.mean]:\n",
     "    result = paired_permutation(\n",
-    "        df_data['abundance_inf'].to_numpy(),\n",
-    "        df_data['abundance_enf'].to_numpy(), \n",
+    "        df_data['inf_abundance'].to_numpy(),\n",
+    "        df_data['eff_abundance'].to_numpy(), \n",
     "        metric=metric, \n",
     "        n_permutations=10000, \n",
     "        rng=rng\n",
     "    )\n",
-    "\n",
+    "    # verbosity\n",
     "    print(result)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9da7d601",
+   "cell_type": "markdown",
+   "id": "1ad22530",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "## Result\n",
+    "\n",
+    "Based on the permutation tests by test statistic and mean difference, the probability of the observed metrics (t=6.739 and mean diff=0.535) occurring at random would be <0.00001."
+   ]
   }
  ],
  "metadata": {
diff --git a/docs/index.rst b/docs/index.rst
index 35f5751..af0844f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,6 +19,7 @@ Acore documentation
    api_examples/normalization_analysis
    api_examples/ANCOVA_analysis
    api_examples/enrichment_analysis
+   api_examples/permutation_testing
 
 .. toctree::
    :maxdepth: 1
diff --git a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
index 07477c5..ca3c6b6 100644
--- a/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
+++ b/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv
@@ -1,4 +1,4 @@
-run_id_enf,run_id_inf,sam_source,sam_read,abundance_enf,abundance_inf
+eff_id,inf_id,sampling_location,sampling_read,eff_abundance,inf_abundance
 ERR2985255,ERR2814663,TG,READ2 Taxonomy ID:256318,3.2572833167019404,4.226818522502555
 ERR2985256,ERR2814664,MN,READ2 Taxonomy ID:256318,2.5728406558389647,3.8471905353288207
 ERR2985257,ERR2814651,AH,READ1 Taxonomy ID:256318,4.298777376040054,4.08684074171776

From b6d049cab85b3d68053968c1847e0a408289dab9 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 14 Oct 2025 17:13:32 +0200
Subject: [PATCH 13/25] jupytext of the permutation testing demo notebook

---
 docs/api_examples/permutation_testing.py | 98 ++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 docs/api_examples/permutation_testing.py

diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
new file mode 100644
index 0000000..d2bac7f
--- /dev/null
+++ b/docs/api_examples/permutation_testing.py
@@ -0,0 +1,98 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.17.3
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Tutorial: Permutation Testing using `acore`
+#
+# In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).
+#
+# The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).
+#
+# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. 
+#
+
+# %% [markdown]
+# ## Data preparation details
+#
+# ### Downloading
+# The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing.
+#
+# ### Preprocessing of abundances
+# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. 
+# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do
+#
+# ### Preprocessing of the metadata 
+# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" 
+# - the sample-desc for each sample in both INF and EFF were parsed and used for pairing off
+#
+# ### Subset of data for demo
+# - For this demo we only look at [go term GO:0017001](https://www.ebi.ac.uk/QuickGO/term/GO:0017001)
+# - It's expected that antibiotic catabolic processes to be higher in INF vs EFF
+#
+# ### Saving the demo dataset
+# This example subset of data was saved to a CSV, ./example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv. The data dictionary is below:
+#
+# | column            | description                                                                                                       | dtype |
+# |-------------------|-------------------------------------------------------------------------------------------------------------------|-------|
+# | eff_id            | The run id for the mgnify analysis of the effluent sample.                                                        | str   |
+# | inf_id            | The run id for the mgnify analysis of the influent sample.                                                        | str   |
+# | sampling_location | [The ISO 3166-1 alpha-2 code](http://iso.org/obp/ui/#iso:pub:PUB500001:en) for the country where the sample was from. | str   |
+# | sampling_read     | Replicates?                                                                                                       | str   |
+# | eff_abundance     | The relative abundance of the GO term for a given effluent sample following preprocessing (i.e., CoDA and CLR)    | float |
+# | inf_abundance     | The relative abundance of the GO term for a given influent sample following preprocessing (i.e., CoDA and CLR)    | float |
+#
+# -----
+#
+# We will now proceed with reading in the prepared dataset. 
+
+# %%
+import pandas as pd 
+
+df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')
+# sanity check 
+df_data.head()
+
+# %% [markdown]
+# ## The permutation test
+#
+# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. 
+#
+# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. 
+#
+# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. 
+
+# %%
+from acore.permutation_test import paired_permutation
+
+# optional choice of random number generatorfor repro
+import numpy as np
+rng = np.random.default_rng(12345)
+
+# %%
+# trying diff metrics to demo functionality also
+for metric in ['t-statistic', 'mean', np.mean]:
+    result = paired_permutation(
+        df_data['inf_abundance'].to_numpy(),
+        df_data['eff_abundance'].to_numpy(), 
+        metric=metric, 
+        n_permutations=10000, 
+        rng=rng
+    )
+    # verbosity
+    print(result)
+
+# %% [markdown]
+# ## Result
+#
+# Based on the permutation tests by test statistic and mean difference, the probability of the observed metrics (t=6.739 and mean diff=0.535) occurring at random would be <0.00001.

From 74255e360d8ea12d0b276e5f50960d77d9b39e57 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Tue, 14 Oct 2025 17:18:22 +0200
Subject: [PATCH 14/25] :lipstick:style: lint and reformatd code

---
 src/acore/permutation_test/__init__.py        | 53 ++++---------
 .../permutation_test/internal_functions.py    | 15 ++--
 src/acore/permutation_test/jaccard.py         | 77 ++++++++++---------
 3 files changed, 64 insertions(+), 81 deletions(-)

diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index 2a3cb9b..dc8e4a8 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -1,19 +1,14 @@
 import numpy as np
-import pandas as pd
 from scipy.stats import (
-    chi2_contingency, 
+    chi2_contingency,
     ttest_rel,
     ttest_ind,
     f_oneway,
 )
+from .internal_functions import _permute, _contingency_table, _check_degeneracy
 import warnings
-warnings.simplefilter("always", UserWarning)
 
-from .internal_functions import (
-    _permute, 
-    _contingency_table, 
-    _check_degeneracy
-)
+warnings.simplefilter("always", UserWarning)
 
 
 def paired_permutation(
@@ -22,7 +17,7 @@ def paired_permutation(
     metric: str = "t-statistic",
     n_permutations: int = 10000,
     rng: np.random.Generator = np.random.default_rng(seed=12345),
-    **kwargs
+    **kwargs,
 ) -> dict:
     """
     Perform a permutation test for paired samples.
@@ -49,14 +44,14 @@ def paired_permutation(
         - 'metric': Metric function used.
         - 'observed': Observed metric value.
         - 'p_value': Permutation test p-value (np.nan if degenerate).
-    """    # Validate input
+    """  # Validate input
     if not isinstance(cond1, np.ndarray) or not isinstance(cond2, np.ndarray):
         raise TypeError("Input must be numpy arrays.")
     if cond1.shape != cond2.shape:
         raise ValueError("Input arrays must have the same shape.")
 
     # paired differences
-    diff = cond1-cond2
+    diff = cond1 - cond2
     args = [diff]
 
     # compute observed metric
@@ -87,11 +82,13 @@ def paired_permutation(
     for _ in range(n_permutations):
         # randomly flip direction of differences
         permuted_diff = diff * rng.choice([-1, 1], size=diff.shape)
-        new_cond1 = np.where((permuted_diff==diff), cond1, cond2)
-        new_cond2 = np.where((permuted_diff==diff), cond2, cond1)
+        new_cond1 = np.where((permuted_diff == diff), cond1, cond2)
+        new_cond2 = np.where((permuted_diff == diff), cond2, cond1)
         # postcondition check
         if not (permuted_diff == (new_cond1 - new_cond2)).all():
-            raise ArithmeticError("Postcondition failed: Issue with permuted differences")
+            raise ArithmeticError(
+                "Postcondition failed: Issue with permuted differences"
+            )
         # prep args
         if metric == "t-statistic":
             new_args = [new_cond1, new_cond2]
@@ -119,17 +116,13 @@ def paired_permutation(
         # Compute p-value
         p_value = np.mean(permuted_f >= abs_met)
 
-        return {
-            "metric": calculator,
-            "observed": observed_metric,
-            "p_value": p_value
-        }
+        return {"metric": calculator, "observed": observed_metric, "p_value": p_value}
 
 
 def chi2_permutation(
     *groups,
     n_permutations: int = 10000,
-    rng: np.random.Generator = np.random.default_rng(seed=12345)
+    rng: np.random.Generator = np.random.default_rng(seed=12345),
 ) -> dict:
     """
     Perform a permutation test for categorical data using the chi-squared statistic.
@@ -169,10 +162,7 @@ def chi2_permutation(
     # Compute p-value
     p_value = np.mean(permuted_chi2 >= observed_chi2)
 
-    return {
-        "observed": observed_test,
-        "p_value": p_value
-    }
+    return {"observed": observed_test, "p_value": p_value}
 
 
 def indep_permutation(
@@ -181,7 +171,7 @@ def indep_permutation(
     metric: str = "t-statistic",
     n_permutations: int = 10000,
     rng: np.random.Generator = np.random.default_rng(seed=12345),
-    **kwargs
+    **kwargs,
 ) -> dict:
     """
     Perform a permutation test for independent samples.
@@ -236,7 +226,7 @@ def indep_permutation(
         )
 
     # compute observed metric
-    if stat: 
+    if stat:
         observed_metric = calculator(group1, group2, **kwargs)
     else:
         observed_metric = abs(calculator(group1) - calculator(group2))
@@ -258,13 +248,4 @@ def indep_permutation(
     # Compute p-value
     p_value = np.mean(permuted_f >= abs_met)
 
-    return {
-        "metric": calculator,
-        "observed": observed_metric,
-        "p_value": p_value
-    }
-
-
-
-
-
+    return {"metric": calculator, "observed": observed_metric, "p_value": p_value}
diff --git a/src/acore/permutation_test/internal_functions.py b/src/acore/permutation_test/internal_functions.py
index 58b8f46..80fdddd 100644
--- a/src/acore/permutation_test/internal_functions.py
+++ b/src/acore/permutation_test/internal_functions.py
@@ -1,6 +1,7 @@
-import numpy as np 
+import numpy as np
 import pandas as pd
 
+
 def _permute(*groups, rng=np.random.default_rng(seed=12345)):
     """
     Perform a single permutation of the groups.
@@ -31,7 +32,7 @@ def _permute(*groups, rng=np.random.default_rng(seed=12345)):
     return new_groups
 
 
-def _contingency_table(*groups, to_np:bool=True)-> np.array:
+def _contingency_table(*groups, to_np: bool = True) -> np.array:
     """
     Create a contingency table from the provided groups.
 
@@ -52,12 +53,13 @@ def _contingency_table(*groups, to_np:bool=True)-> np.array:
         if not isinstance(group, (list, pd.Series, np.ndarray)):
             raise TypeError("Each group must be a list, pandas Series, or numpy array.")
 
-
     # MAIN FUNCTION
     # Create contingency table
-    table = pd.concat(
-        [pd.Series(group).value_counts() for group in groups], axis=1
-    ).fillna(0).T
+    table = (
+        pd.concat([pd.Series(group).value_counts() for group in groups], axis=1)
+        .fillna(0)
+        .T
+    )
 
     if to_np:
         return table.to_numpy()
@@ -80,4 +82,3 @@ def _check_degeneracy(array: np.ndarray) -> bool:
         True if the conditions are degenerate, False otherwise.
     """
     return np.all(array == array[0])
-
diff --git a/src/acore/permutation_test/jaccard.py b/src/acore/permutation_test/jaccard.py
index 08e52ff..f8ad37c 100644
--- a/src/acore/permutation_test/jaccard.py
+++ b/src/acore/permutation_test/jaccard.py
@@ -1,8 +1,9 @@
-import numpy as np 
+import numpy as np
 import itertools
 from collections.abc import Iterable
 
-def jaccard_similarity(set1:set, set2:set) -> float:
+
+def jaccard_similarity(set1: set, set2: set) -> float:
     """
     Compute the Jaccard similarity between two sets.
 
@@ -15,9 +16,9 @@ def jaccard_similarity(set1:set, set2:set) -> float:
     Returns
     -------
     float
-        Jaccard similarity coefficient, 
-        which is the size of the intersection divided by 
-        the size of the union of the two sets. 
+        Jaccard similarity coefficient,
+        which is the size of the intersection divided by
+        the size of the union of the two sets.
 
     Example
     -------
@@ -29,31 +30,31 @@ def jaccard_similarity(set1:set, set2:set) -> float:
     intersection = len(set1 & set2)
     union = len(set1 | set2)
 
-    if union !=0:
+    if union != 0:
         return intersection / union
     else:
         return 0
 
 
-def avg_jaccard(group: Iterable)-> tuple:
+def avg_jaccard(group: Iterable) -> tuple:
     """
-    Compute the average pairwise Jaccard similarity within a group of graphs.
+     Compute the average pairwise Jaccard similarity within a group of graphs.
 
-    Parameters
-    ----------
-    group : Iterable
-        An iterator of iterable objects, such as a list of sets.
+     Parameters
+     ----------
+     group : Iterable
+         An iterator of iterable objects, such as a list of sets.
 
-   Returns
-    -------
-    tuple
-        A tuple containing the average Jaccard similarity and its standard deviation.
-    
-    Example
-    -------
-    >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}]
-    >>> avg_jaccard(group)
-    (0.3333333333333333, 0.0)
+    Returns
+     -------
+     tuple
+         A tuple containing the average Jaccard similarity and its standard deviation.
+
+     Example
+     -------
+     >>> group = [{1, 2, 3}, {2, 3, 4}, {3, 4, 5}]
+     >>> avg_jaccard(group)
+     (0.3333333333333333, 0.0)
     """
 
     ## PRECONDITIONS
@@ -62,12 +63,13 @@ def avg_jaccard(group: Iterable)-> tuple:
     if len(group) > 0:
         for g in group:
             if not isinstance(g, Iterable):
-                raise TypeError("Each element in the group must be an iterable (set, list, or tuple).")
+                raise TypeError(
+                    "Each element in the group must be an iterable (set, list, or tuple)."
+                )
     elif len(group) == 0:
         raise ValueError("Input group is empty. Cannot compute Jaccard similarity.")
 
-
-    # init the list 
+    # init the list
     scores = []
 
     for g1, g2 in itertools.combinations(group, 2):
@@ -100,42 +102,41 @@ def btwn_jaccard(group1: Iterable, group2: Iterable) -> tuple:
     -------
     tuple
         A tuple containing the average Jaccard similarity and its standard deviation.
-    
+
     Example
     -------
     >>> group1 = [{1, 2, 3}, {2, 3, 4}]
     >>> group2 = [{3, 4, 5}, {4, 5, 6}]
     >>> btwn_jaccard(group1, group2)
     (0.3333333333333333, 0.0)
-    
+
     """
-    
+
     if not isinstance(group1, Iterable) or not isinstance(group2, Iterable):
         raise TypeError("Both inputs must be iterables of iterables.")
     if len(group1) == 0 or len(group2) == 0:
         raise ValueError("Both input groups must be non-empty.")
-    
+
     # init the list
     scores = []
-    
+
     for g1, g2 in itertools.product(group1, group2):
-        
         # check
         if not isinstance(g1, Iterable) or not isinstance(g2, Iterable):
-            raise TypeError("Each element in the groups must be an iterable (set, list, or tuple).")
-        
-        # get sets 
+            raise TypeError(
+                "Each element in the groups must be an iterable (set, list, or tuple)."
+            )
+
+        # get sets
         set1 = set(g1)
         set2 = set(g2)
 
         # compute Jaccard similarity
         scores.append(jaccard_similarity(set1, set2))
-    
+
     if scores:
         # if scores is not empty, return the average
         return np.mean(scores), np.std(scores)
     # if scores is empty (less than 2 in group), return 0
     else:
-        return 0, 0    
-
-
+        return 0, 0

From 23a79ff3c4bb4c5b5632f78a2bad3a3b4ca66418 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Fri, 24 Oct 2025 13:35:51 +0200
Subject: [PATCH 15/25] :bug:fix: update PR module link in contributing guide

---
 CONTRIBUTING.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 10f7421..9af11a3 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -125,7 +125,7 @@ Before you submit a pull request, check that it meets these guidelines:
 3. The pull request should pass the workflows on GitHub.
 
 See for example the PR-Template for a module: 
-`Add module PR template <https://github.com/Multiomics-Analytics-Group/acore/blob/main/.github/PULL_REQUEST_TEMPLATE/module.md>`_.
+`Add module PR template <https://github.com/Multiomics-Analytics-Group/acore/blob/main/.github/workflows/PULL_REQUEST_TEMPLATE/module.md>`_.
 
 
 

From 2194b08d5627fb04068467dc0f47d13b8df8d2f5 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Fri, 24 Oct 2025 14:03:43 +0200
Subject: [PATCH 16/25] :bug: fix: add quotation marks to command

---
 CONTRIBUTING.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 9af11a3..842eaae 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -70,7 +70,7 @@ advanced editor to help you with some of the common steps described below, e.g.
     $ cd acore/
     $ python -m venv .env
     $ source .env/bin/activate
-    $ pip install -e .[dev]
+    $ pip install -e ".[dev]"
 
 If you work on a Windows shell, see the docs for instructions: 
 `How venvs work <https://docs.python.org/3/library/venv.html#how-venvs-work>`_

From 5b0124e3802b5b5154c1f0d1c3b69d5420ac90d3 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:43:50 +0100
Subject: [PATCH 17/25] :sparkles: feat: add pydantic model for perm test
 output

---
 src/acore/types/permutation_test.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 src/acore/types/permutation_test.py

diff --git a/src/acore/types/permutation_test.py b/src/acore/types/permutation_test.py
new file mode 100644
index 0000000..436fc12
--- /dev/null
+++ b/src/acore/types/permutation_test.py
@@ -0,0 +1,9 @@
+# using pydantic since these are just dictionary outputs no df
+from typing import Any, Optional, Callable, Union
+from pydantic import BaseModel, Field
+
+class PermutationResult(BaseModel):
+    metric: Optional[Union[str, Callable]] = Field(default=None, description="Name of the metric used in the permutation test")
+    observed: Any = Field(default=None, description="Observed value of the metric")
+    p_value: float = Field(description="p-value from the permutation test")
+

From 83bbfda98f6f1a11a89df02fcd022a8597008160 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:44:14 +0100
Subject: [PATCH 18/25] :sparkles: feat: add pydantic model for perm test
 output

---
 docs/api_examples/permutation_testing.ipynb | 10 +++---
 src/acore/permutation_test/__init__.py      | 36 +++++++++++++++++----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index de1b503..cb92f75 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -162,7 +162,9 @@
    "source": [
     "import pandas as pd \n",
     "\n",
-    "df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')\n",
+    "df_data = pd.read_csv(\n",
+    "    'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv'\n",
+    ")\n",
     "# sanity check \n",
     "df_data.head()"
    ]
@@ -205,9 +207,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'metric': <function ttest_rel at 0x11014d8a0>, 'observed': TtestResult(statistic=np.float64(6.7389860601792275), pvalue=np.float64(7.122287781830209e-07), df=np.int64(23)), 'p_value': np.float64(0.0)}\n",
-      "{'metric': <function mean at 0x1090f7a70>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n",
-      "{'metric': <function mean at 0x1090f7a70>, 'observed': np.float64(0.5350826397500547), 'p_value': np.float64(0.0)}\n"
+      "{'metric': <function ttest_rel at 0x10fee80e0>, 'observed': (np.float64(6.7389860601792275), np.float64(7.122287781830209e-07)), 'p_value': 0.0}\n",
+      "{'metric': <function mean at 0x10920a370>, 'observed': np.float64(0.5350826397500547), 'p_value': 0.0}\n",
+      "{'metric': <function mean at 0x10920a370>, 'observed': np.float64(0.5350826397500547), 'p_value': 0.0}\n"
      ]
     }
    ],
diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index dc8e4a8..0c0bbd7 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -5,11 +5,17 @@
     ttest_ind,
     f_oneway,
 )
-from .internal_functions import _permute, _contingency_table, _check_degeneracy
+from .internal_functions import (
+    _permute, 
+    _contingency_table, 
+    _check_degeneracy
+)
 import warnings
 
 warnings.simplefilter("always", UserWarning)
 
+from acore.types.permutation_test import PermutationResult
+
 
 def paired_permutation(
     cond1: np.ndarray,
@@ -107,16 +113,23 @@ def paired_permutation(
         )
         warnings.warn(identical_warn)
 
-        return {
+        val_result = PermutationResult.model_validate({
             "metric": calculator,
             "observed": observed_metric,
             "p_value": np.nan,
-        }
+        })
+        
     else:
         # Compute p-value
         p_value = np.mean(permuted_f >= abs_met)
 
-        return {"metric": calculator, "observed": observed_metric, "p_value": p_value}
+        val_result = PermutationResult.model_validate({
+            "metric": calculator,
+            "observed": observed_metric,
+            "p_value": p_value
+        })
+
+    return val_result.model_dump()
 
 
 def chi2_permutation(
@@ -162,7 +175,12 @@ def chi2_permutation(
     # Compute p-value
     p_value = np.mean(permuted_chi2 >= observed_chi2)
 
-    return {"observed": observed_test, "p_value": p_value}
+    val_result = PermutationResult.model_validate({
+        "observed": observed_test,
+        "p_value": p_value
+    })
+
+    return val_result.model_dump(exclude_none=True)
 
 
 def indep_permutation(
@@ -248,4 +266,10 @@ def indep_permutation(
     # Compute p-value
     p_value = np.mean(permuted_f >= abs_met)
 
-    return {"metric": calculator, "observed": observed_metric, "p_value": p_value}
+    val_result = PermutationResult.model_validate({
+        "metric": calculator,
+        "observed": observed_metric,
+        "p_value": p_value
+    })
+
+    return val_result.model_dump()

From e61404904898b5594229d08273128f7f2b2f90f1 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:49:03 +0100
Subject: [PATCH 19/25] :memo: docs:  jupytexted the notebook

---
 docs/api_examples/permutation_testing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
index d2bac7f..f533b66 100644
--- a/docs/api_examples/permutation_testing.py
+++ b/docs/api_examples/permutation_testing.py
@@ -59,7 +59,9 @@
 # %%
 import pandas as pd 
 
-df_data = pd.read_csv('/Users/anglup/GitHub/acore/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv')
+df_data = pd.read_csv(
+    'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv'
+)
 # sanity check 
 df_data.head()
 

From 6bea62d2b19f7c47b6cfe17b99307e8481bbfa83 Mon Sep 17 00:00:00 2001
From: "Angel L. P" <59593766+angelphanth@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:53:09 +0100
Subject: [PATCH 20/25] :lipstick: style: lint and format

---
 docs/api_examples/permutation_testing.py   | 35 ++++++++--------
 src/acore/decomposition/umap.py            |  1 +
 src/acore/enrichment_analysis/__init__.py  |  4 +-
 src/acore/io/uniprot/uniprot.py            |  2 -
 src/acore/microbiome/internal_functions.py | 19 +++++----
 src/acore/multiple_testing/__init__.py     |  2 +-
 src/acore/permutation_test/__init__.py     | 47 +++++++++-------------
 src/acore/types/permutation_test.py        |  6 ++-
 tests/test_permutation_test.py             | 37 ++++++++---------
 9 files changed, 71 insertions(+), 82 deletions(-)

diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
index f533b66..91f5777 100644
--- a/docs/api_examples/permutation_testing.py
+++ b/docs/api_examples/permutation_testing.py
@@ -19,7 +19,7 @@
 #
 # The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).
 #
-# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. 
+# For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios.
 #
 
 # %% [markdown]
@@ -29,11 +29,11 @@
 # The analysed samples were downloaded via the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/docs/). The inffluent (INF) and effluent (EFFF) datasets have paired samples and we also needed to download the sample metadata (also available via Mgnify API) to assign the correct pairing.
 #
 # ### Preprocessing of abundances
-# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances. 
+# - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances.
 # - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do
 #
-# ### Preprocessing of the metadata 
-# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc" 
+# ### Preprocessing of the metadata
+# - the sample metadata needed for this demo (sampling location) were available in their "sample-desc"
 # - the sample-desc for each sample in both INF and EFF were parsed and used for pairing off
 #
 # ### Subset of data for demo
@@ -54,42 +54,43 @@
 #
 # -----
 #
-# We will now proceed with reading in the prepared dataset. 
+# We will now proceed with reading in the prepared dataset.
 
 # %%
-import pandas as pd 
+import pandas as pd
 
 df_data = pd.read_csv(
-    'https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv'
+    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/refs/heads/anglup-learning/example_data/mgnify/Ju2018_GO0017001_enf_inf_paired.csv"
 )
-# sanity check 
+# sanity check
 df_data.head()
 
 # %% [markdown]
 # ## The permutation test
 #
-# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`. 
+# Since these are paird samples we will proceed with paired sample permutation test using `acore.perumutation_test.paired_permutation()`.
 #
-# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. 
+# The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset.
 #
-# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. 
+# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance.
 
 # %%
 from acore.permutation_test import paired_permutation
 
 # optional choice of random number generatorfor repro
 import numpy as np
+
 rng = np.random.default_rng(12345)
 
 # %%
 # trying diff metrics to demo functionality also
-for metric in ['t-statistic', 'mean', np.mean]:
+for metric in ["t-statistic", "mean", np.mean]:
     result = paired_permutation(
-        df_data['inf_abundance'].to_numpy(),
-        df_data['eff_abundance'].to_numpy(), 
-        metric=metric, 
-        n_permutations=10000, 
-        rng=rng
+        df_data["inf_abundance"].to_numpy(),
+        df_data["eff_abundance"].to_numpy(),
+        metric=metric,
+        n_permutations=10000,
+        rng=rng,
     )
     # verbosity
     print(result)
diff --git a/src/acore/decomposition/umap.py b/src/acore/decomposition/umap.py
index 2a05dd7..6d5ab21 100644
--- a/src/acore/decomposition/umap.py
+++ b/src/acore/decomposition/umap.py
@@ -1,4 +1,5 @@
 """Run UMAP on DataFrame and return result."""
+
 import pandas as pd
 import umap
 
diff --git a/src/acore/enrichment_analysis/__init__.py b/src/acore/enrichment_analysis/__init__.py
index b0a09a6..b3ae590 100644
--- a/src/acore/enrichment_analysis/__init__.py
+++ b/src/acore/enrichment_analysis/__init__.py
@@ -552,13 +552,13 @@ def run_ssgsea(
     index.index = name
     df = df.set_index("Name").transpose()
 
-    if not annotation_col in annotation:
+    if annotation_col not in annotation:
         raise ValueError(
             f"Missing Annotation Column: {annotation_col}"
             " as specified by `annotation_col`"
         )
 
-    if not identifier_col in annotation:
+    if identifier_col not in annotation:
         raise ValueError(
             f"Missing Identifier Column: {identifier_col}"
             " as specified by `identifier_col`"
diff --git a/src/acore/io/uniprot/uniprot.py b/src/acore/io/uniprot/uniprot.py
index c9abf05..f359eb1 100644
--- a/src/acore/io/uniprot/uniprot.py
+++ b/src/acore/io/uniprot/uniprot.py
@@ -177,8 +177,6 @@ def get_id_mapping_results_stream(url):
     return decode_results(request, file_format, compressed)
 
 
-
-
 if __name__ == "__main__":
     # id mapping is used to create a link to a query (you can see the json in the browser)
     # UniProtKB is the knowleadgebase integrating all kind of other databases
diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/microbiome/internal_functions.py
index 1afe23c..637bc21 100644
--- a/src/acore/microbiome/internal_functions.py
+++ b/src/acore/microbiome/internal_functions.py
@@ -4,10 +4,11 @@
 from sklearn.pipeline import make_pipeline
 import pandas as pd
 
+
 def calc_clr(x):
     """
     Calculate the centered log-ratio (CLR) transformation.
-    
+
     Parameters
     ----------
     x : array-like
@@ -18,7 +19,7 @@ def calc_clr(x):
     array-like
         CLR transformed data.
     """
-    # replace zeros with small 
+    # replace zeros with small
     new_x = np.where(x > 0, x, 1e-10)
     return np.log(new_x) - np.mean(np.log(new_x), axis=0)
 
@@ -40,15 +41,13 @@ def coda_clr(df: pd.DataFrame) -> pd.DataFrame:
     """
     # Define the CoDA and CLR transformers
     coda = FunctionTransformer(
-        lambda x: x / x.sum(axis=0), 
-        feature_names_out="one-to-one"
-    ).set_output(transform="pandas")
-    clr = FunctionTransformer(
-        calc_clr, 
-        feature_names_out="one-to-one"
+        lambda x: x / x.sum(axis=0), feature_names_out="one-to-one"
     ).set_output(transform="pandas")
+    clr = FunctionTransformer(calc_clr, feature_names_out="one-to-one").set_output(
+        transform="pandas"
+    )
     # Create a pipeline with CoDA and CLR transformations
     pipe = make_pipeline(coda, clr).set_output(transform="pandas")
     # Apply the pipeline to numeric columns and concatenate with non-numeric columns
-    transformed = pipe.fit_transform(df.select_dtypes(include='number'))
-    return pd.concat([df.select_dtypes(include='object'), transformed], axis=1)
\ No newline at end of file
+    transformed = pipe.fit_transform(df.select_dtypes(include="number"))
+    return pd.concat([df.select_dtypes(include="object"), transformed], axis=1)
diff --git a/src/acore/multiple_testing/__init__.py b/src/acore/multiple_testing/__init__.py
index 3ca9316..1949daa 100644
--- a/src/acore/multiple_testing/__init__.py
+++ b/src/acore/multiple_testing/__init__.py
@@ -207,7 +207,7 @@ def correct_pairwise_ttest(df, alpha, correction="fdr_bh"):
 
     required_col = ["group1", "group2", "posthoc pvalue"]
     for _col in required_col:
-        if not _col in df:
+        if _col not in df:
             raise KeyError(f"Did not find '{_col}' in columns of data.")
 
     for comparison in df.groupby(["group1", "group2"]).groups:
diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index 0c0bbd7..b6bc69c 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -5,17 +5,13 @@
     ttest_ind,
     f_oneway,
 )
-from .internal_functions import (
-    _permute, 
-    _contingency_table, 
-    _check_degeneracy
-)
+from .internal_functions import _permute, _contingency_table, _check_degeneracy
 import warnings
 
-warnings.simplefilter("always", UserWarning)
-
 from acore.types.permutation_test import PermutationResult
 
+warnings.simplefilter("always", UserWarning)
+
 
 def paired_permutation(
     cond1: np.ndarray,
@@ -113,21 +109,21 @@ def paired_permutation(
         )
         warnings.warn(identical_warn)
 
-        val_result = PermutationResult.model_validate({
-            "metric": calculator,
-            "observed": observed_metric,
-            "p_value": np.nan,
-        })
-        
+        val_result = PermutationResult.model_validate(
+            {
+                "metric": calculator,
+                "observed": observed_metric,
+                "p_value": np.nan,
+            }
+        )
+
     else:
         # Compute p-value
         p_value = np.mean(permuted_f >= abs_met)
 
-        val_result = PermutationResult.model_validate({
-            "metric": calculator,
-            "observed": observed_metric,
-            "p_value": p_value
-        })
+        val_result = PermutationResult.model_validate(
+            {"metric": calculator, "observed": observed_metric, "p_value": p_value}
+        )
 
     return val_result.model_dump()
 
@@ -175,10 +171,9 @@ def chi2_permutation(
     # Compute p-value
     p_value = np.mean(permuted_chi2 >= observed_chi2)
 
-    val_result = PermutationResult.model_validate({
-        "observed": observed_test,
-        "p_value": p_value
-    })
+    val_result = PermutationResult.model_validate(
+        {"observed": observed_test, "p_value": p_value}
+    )
 
     return val_result.model_dump(exclude_none=True)
 
@@ -266,10 +261,8 @@ def indep_permutation(
     # Compute p-value
     p_value = np.mean(permuted_f >= abs_met)
 
-    val_result = PermutationResult.model_validate({
-        "metric": calculator,
-        "observed": observed_metric,
-        "p_value": p_value
-    })
+    val_result = PermutationResult.model_validate(
+        {"metric": calculator, "observed": observed_metric, "p_value": p_value}
+    )
 
     return val_result.model_dump()
diff --git a/src/acore/types/permutation_test.py b/src/acore/types/permutation_test.py
index 436fc12..42d3831 100644
--- a/src/acore/types/permutation_test.py
+++ b/src/acore/types/permutation_test.py
@@ -2,8 +2,10 @@
 from typing import Any, Optional, Callable, Union
 from pydantic import BaseModel, Field
 
+
 class PermutationResult(BaseModel):
-    metric: Optional[Union[str, Callable]] = Field(default=None, description="Name of the metric used in the permutation test")
+    metric: Optional[Union[str, Callable]] = Field(
+        default=None, description="Name of the metric used in the permutation test"
+    )
     observed: Any = Field(default=None, description="Observed value of the metric")
     p_value: float = Field(description="p-value from the permutation test")
-
diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py
index 2372e31..c755cbc 100644
--- a/tests/test_permutation_test.py
+++ b/tests/test_permutation_test.py
@@ -3,36 +3,36 @@
 import numpy as np
 from scipy.stats import ttest_rel, ttest_ind, mannwhitneyu, wilcoxon
 
+
 @pytest.fixture
 def set_array():
     return np.arange(1, 10)
 
+
 @pytest.fixture
 def random_low():
     return np.random.negative_binomial(10, 0.3, 100)
 
+
 @pytest.fixture
 def random_high():
     return np.random.negative_binomial(20, 0.1, 100)
 
+
 @pytest.fixture
 def random_choice_equal():
     return np.random.choice(
-        ["A", "B", "C", "D"], 
-        size=100, 
-        replace=True,
-        p=[0.25, 0.25, 0.25, 0.25]
+        ["A", "B", "C", "D"], size=100, replace=True, p=[0.25, 0.25, 0.25, 0.25]
     )
 
+
 @pytest.fixture
 def random_choice_unequal():
     return np.random.choice(
-        ["A", "B", "C", "D"], 
-        size=100, 
-        replace=True,
-        p=[0.1, 0.25, 0.05, 0.6]
+        ["A", "B", "C", "D"], size=100, replace=True, p=[0.1, 0.25, 0.05, 0.6]
     )
 
+
 def test_paired_permutation_same(set_array):
     cond1 = set_array
     cond2 = set_array
@@ -53,12 +53,11 @@ def test_paired_permutation_reject(random_low, random_high):
 
 
 def test_paired_permutation_degen(set_array):
-    for stat in ["t-statistic", "mean", "median"]: 
-        assert pt.paired_permutation(
-            set_array, 
-            set_array+30,
-            metric=stat
-        )["p_value"] is np.nan
+    for stat in ["t-statistic", "mean", "median"]:
+        assert (
+            pt.paired_permutation(set_array, set_array + 30, metric=stat)["p_value"]
+            is np.nan
+        )
 
 
 def test_indep_permutation_reject(random_low, random_high):
@@ -71,11 +70,9 @@ def test_indep_permutation_reject(random_low, random_high):
 
 def test_indep_permutation_accept(random_low):
     for stat in ["t-statistic", "anova", "median", "mean"]:
-        assert pt.indep_permutation(
-            random_low,
-            random_low,
-            metric=stat
-        )["p_value"] > 0.05
+        assert (
+            pt.indep_permutation(random_low, random_low, metric=stat)["p_value"] > 0.05
+        )
 
 
 def test_chi2_permutation_accept(random_choice_equal):
@@ -86,5 +83,3 @@ def test_chi2_permutation_accept(random_choice_equal):
 def test_chi2_permutation_reject(random_choice_unequal, random_choice_equal):
     result = pt.chi2_permutation(random_choice_equal, random_choice_unequal)
     assert result["p_value"] < 0.05
-
-

From f27cf2ecce917c65cd03bdc709e7437162755c54 Mon Sep 17 00:00:00 2001
From: "Angel L. P." <59593766+angelphanth@users.noreply.github.com>
Date: Fri, 7 Nov 2025 13:40:56 +0100
Subject: [PATCH 21/25] Update docs/api_examples/permutation_testing.py

Co-authored-by: Henry Webel <heweb@dtu.dk>
---
 docs/api_examples/permutation_testing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
index 91f5777..6a48ef7 100644
--- a/docs/api_examples/permutation_testing.py
+++ b/docs/api_examples/permutation_testing.py
@@ -30,7 +30,7 @@
 #
 # ### Preprocessing of abundances
 # - To account for technical variation due to sequencing technology limitations, we first transform the abundance values so they are relative to the total reads for the sample aka getting relative abundances.
-# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation `acore.microbiome.internal_functions.calc_clr()` to not violate assumptions of any frequentist stats we do
+# - The relative abundances are compositional data (CoDa) so we map them to unconstrained vectors using centred log-ratio transformation [`acore.microbiome.internal_functions.calc_clr`](`acore.microbiome.internal_functions.calc_clr`) to not violate assumptions of any frequentist stats we do
 #
 # ### Preprocessing of the metadata
 # - the sample metadata needed for this demo (sampling location) were available in their "sample-desc"

From 8415f0f6b0b2549576f110d6762f07fd2f1244dd Mon Sep 17 00:00:00 2001
From: "Angel L. P." <59593766+angelphanth@users.noreply.github.com>
Date: Fri, 7 Nov 2025 13:41:14 +0100
Subject: [PATCH 22/25] Update docs/api_examples/permutation_testing.py

Co-authored-by: Henry Webel <heweb@dtu.dk>
---
 docs/api_examples/permutation_testing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
index 6a48ef7..e920df2 100644
--- a/docs/api_examples/permutation_testing.py
+++ b/docs/api_examples/permutation_testing.py
@@ -13,7 +13,7 @@
 # ---
 
 # %% [markdown]
-# # Tutorial: Permutation Testing using `acore`
+# # Permutation Tests
 #
 # In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).
 #

From ccfe00e987876faa6a49788ffbbfb055793aacd6 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Mon, 12 Jan 2026 16:59:21 +0100
Subject: [PATCH 23/25] Apply suggestions from code review

mainly typos

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 docs/api_examples/permutation_testing.ipynb | 6 +++---
 docs/api_examples/permutation_testing.py    | 6 +++---
 src/acore/permutation_test/__init__.py      | 4 ++--
 tests/test_permutation_test.py              | 2 --
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/api_examples/permutation_testing.ipynb b/docs/api_examples/permutation_testing.ipynb
index cb92f75..5471464 100644
--- a/docs/api_examples/permutation_testing.ipynb
+++ b/docs/api_examples/permutation_testing.ipynb
@@ -9,7 +9,7 @@
     "\n",
     "In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).\n",
     "\n",
-    "The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).\n",
+    "The samples in this demo were collected from wastewater treatment plant influent (MGYS00005056) and effluent (MGYS00005058).\n",
     "\n",
     "For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios. \n"
    ]
@@ -180,7 +180,7 @@
     "\n",
     "The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset. \n",
     "\n",
-    "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance. "
+    "If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect size having occurred by chance. "
    ]
   },
   {
@@ -192,7 +192,7 @@
    "source": [
     "from acore.permutation_test import paired_permutation\n",
     "\n",
-    "# optional choice of random number generatorfor repro\n",
+    "# optional choice of random number generator for repro\n",
     "import numpy as np\n",
     "rng = np.random.default_rng(12345)"
    ]
diff --git a/docs/api_examples/permutation_testing.py b/docs/api_examples/permutation_testing.py
index e920df2..e54723c 100644
--- a/docs/api_examples/permutation_testing.py
+++ b/docs/api_examples/permutation_testing.py
@@ -17,7 +17,7 @@
 #
 # In this notebook we will demonstrate how to use acore's permutation testing functions on metagenomics data collected by [Ju and colleagues (2018)](https://doi.org/10.1038/s41396-018-0277-8).
 #
-# The samples in this demo were collected from wastewaster treatment plant inffluent (MGYS00005056) and effluent (MGYS00005058).
+# The samples in this demo were collected from wastewater treatment plant influent (MGYS00005056) and effluent (MGYS00005058).
 #
 # For this demo we look at the GO term abundance tables generated by the Mgnify pipeline. The values in the table are the absolute abundance of selected GO terms for each sample, which we then transform to relative abundances and centred-log ratios.
 #
@@ -72,12 +72,12 @@
 #
 # The permutation test compares the actual observed chosen metric (e.g., t-statistic, mean difference) with metrics calculated when the dataset values are randomly shuffled permutations of the dataset.
 #
-# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect sizze having occurred by chance.
+# If we do 100 permutations of our data (although we should do a bunch more) and only 1 of those permutations falsely showed a larger effect size than the actual observed effect than it suggests there is a 1/100 chance (p value of 0.01) of the observed effect size having occurred by chance.
 
 # %%
 from acore.permutation_test import paired_permutation
 
-# optional choice of random number generatorfor repro
+# optional choice of random number generator for repro
 import numpy as np
 
 rng = np.random.default_rng(12345)
diff --git a/src/acore/permutation_test/__init__.py b/src/acore/permutation_test/__init__.py
index b6bc69c..a88dd5d 100644
--- a/src/acore/permutation_test/__init__.py
+++ b/src/acore/permutation_test/__init__.py
@@ -69,7 +69,7 @@ def paired_permutation(
     else:
         raise ValueError(
             "Invalid metric specified. Acceptable metrics are: "
-            "'t-statistic', 'mean', 'median', or a custom function"
+            "'t-statistic', 'mean', 'median', or a custom function "
             "that takes `cond1-cond2` as input."
         )
 
@@ -234,7 +234,7 @@ def indep_permutation(
     else:
         raise ValueError(
             "Invalid metric specified. Acceptable metrics are: "
-            "'t-statistic', 'mean', 'median', or a custom function"
+            "'t-statistic', 'mean', 'median', or a custom function "
             "that takes each group as input."
         )
 
diff --git a/tests/test_permutation_test.py b/tests/test_permutation_test.py
index c755cbc..ad2db46 100644
--- a/tests/test_permutation_test.py
+++ b/tests/test_permutation_test.py
@@ -47,8 +47,6 @@ def test_paired_permutation_reject(random_low, random_high):
     cond2 = random_high
     for stat in ["t-statistic", "mean", "median"]:
         result = pt.paired_permutation(cond1, cond2, metric=stat)
-        print(result)
-        print(ttest_rel(cond1, cond2))
         assert result["p_value"] < 0.05
 
 

From 0ef5a84951ec10d638b624eda13db10b5f3e2d41 Mon Sep 17 00:00:00 2001
From: Henry Webel <heweb@dtu.dk>
Date: Wed, 14 Jan 2026 13:45:58 +0100
Subject: [PATCH 24/25] Merge branch 'main' into anglup-learning

---
 AUTHORS.md                                    |  10 +
 AUTHORS.rst                                   |  13 -
 CONTRIBUTING.md                               | 130 ++++++
 CONTRIBUTING.rst                              | 138 ------
 HISTORY.md                                    |   5 +
 HISTORY.rst                                   |   8 -
 docs/api_examples/batch_correction.ipynb      | 430 ++++++++++++++++++
 docs/api_examples/batch_correction.py         | 233 ++++++++++
 .../api_examples/normalization_analysis.ipynb |  93 +---
 docs/api_examples/normalization_analysis.py   |  35 +-
 docs/authors.md                               |   4 +
 docs/authors.rst                              |   1 -
 docs/conf.py                                  | 205 +++------
 docs/contributing.md                          |   4 +
 docs/contributing.rst                         |   1 -
 docs/history.md                               |   4 +
 docs/history.rst                              |   1 -
 docs/index.md                                 |  69 +++
 docs/index.rst                                |  63 ---
 .../syntethic_pep_enrichment/1_analysis.py    |   5 +-
 src/acore/batch_correction/__init__.py        |  53 +++
 src/acore/normalization/__init__.py           |  48 --
 tests/test_normalization.py                   |   4 +-
 23 files changed, 1012 insertions(+), 545 deletions(-)
 create mode 100644 AUTHORS.md
 delete mode 100644 AUTHORS.rst
 create mode 100644 CONTRIBUTING.md
 delete mode 100644 CONTRIBUTING.rst
 create mode 100644 HISTORY.md
 delete mode 100644 HISTORY.rst
 create mode 100644 docs/api_examples/batch_correction.ipynb
 create mode 100644 docs/api_examples/batch_correction.py
 create mode 100644 docs/authors.md
 delete mode 100644 docs/authors.rst
 create mode 100644 docs/contributing.md
 delete mode 100644 docs/contributing.rst
 create mode 100644 docs/history.md
 delete mode 100644 docs/history.rst
 create mode 100644 docs/index.md
 delete mode 100644 docs/index.rst
 create mode 100644 src/acore/batch_correction/__init__.py

diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000..f3e0d81
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,10 @@
+# Credits
+
+## Development Lead
+
+- Alberto Santos Delgado <albsad@dtu.dk>
+- Henry Webel <heweb@dtu.dk>
+
+## Contributors
+
+- Sebastián Ayala Ruano <https://github.com/sayalaruano>
diff --git a/AUTHORS.rst b/AUTHORS.rst
deleted file mode 100644
index 5048a97..0000000
--- a/AUTHORS.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-=======
-Credits
-=======
-
-Development Lead
-----------------
-
-* Alberto Santos Delgado <albsad@dtu.dk>
-
-Contributors
-------------
-
-None yet. Why not be the first?
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..fab7d04
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,130 @@
+# Contributing
+
+Contributions are welcome, and they are greatly appreciated! Every little bit
+helps, and credit will always be given.
+
+You can contribute in many ways:
+
+## Report Bugs
+
+Report bugs at
+[github.com/Multiomics-Analytics-Group/acore/issues](https://github.com/Multiomics-Analytics-Group/acore/issues).
+
+If you are reporting a bug, please include:
+
+- Your operating system name and version.
+- Any details about your local setup that might be helpful in troubleshooting.
+- Detailed steps to reproduce the bug.
+
+## Fix Bugs
+
+Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
+wanted" is open to whoever wants to implement it.
+
+## Implement Features
+
+Look through the GitHub issues for features. Anything tagged with "enhancement"
+and "help wanted" is open to whoever wants to implement it.
+
+## Write Documentation
+
+acore could always use more documentation, whether as part of the
+official acore docs, in docstrings, or even on the web in blog posts,
+articles, and such.
+
+## Submit Feedback
+
+The best way to send feedback is to file an issue at
+[github.com/Multiomics-Analytics-Group/acore/issues](https://github.com/Multiomics-Analytics-Group/acore/issues).
+
+If you are proposing a feature:
+
+- Explain in detail how it would work.
+- Keep the scope as narrow as possible, to make it easier to implement.
+- Remember that this is a volunteer-driven project, and that contributions are welcome :)
+
+## Get Started!
+
+Ready to contribute? Here's how to set up `acore` for local development. Consider an
+advanced editor to help you with some of the common steps described below, e.g.
+[VSCode](https://code.visualstudio.com/docs/introvideos/basics).
+
+1. Fork the `acore` repo on GitHub.
+2. Clone your fork locally:
+
+```sh
+git clone https://github.com/Multiomics-Analytics-Group/acore.git
+```
+
+3. Install your local copy into a virtual environment. Assuming you have Python available
+   on your system, this can be done using `venv`. Alternatives are conda environments
+   or uv to create and manage virtual environments.
+
+```sh
+cd acore/
+python -m venv .env
+source .env/bin/activate
+pip install -e ".[dev]"
+```
+
+If you work on Windows, see the docs: https://docs.python.org/3/library/venv.html#how-venvs-work
+
+4. Create a branch for local development:
+
+```sh
+git checkout -b name-of-your-bugfix-or-feature
+```
+
+5. When you're done making changes, check formatting and run tests locally:
+
+```sh
+black .
+ruff check src
+pytest .
+```
+
+Some `ruff` fixes can be applied automatically with `--fix`:
+
+```sh
+ruff check src --fix
+```
+
+6. Commit your changes and push your branch to GitHub:
+
+```sh
+git add .
+git commit -m "Your detailed description of your changes."
+git push origin name-of-your-bugfix-or-feature
+```
+
+7. Submit a pull request through the GitHub website.
+
+## General design principles in the library
+
+- Prefer a single `DataFrame` output type per module (subpackage). For example,
+  the enrichment module should output a `DataFrame` that adheres to a single pandera
+  schema defined under `src/acore/types/enrichment_analysis.py`. Thus there is a 'type'
+  of enrichment analysis results.
+- User-facing functions should have clear names and good docstrings
+  (e.g. `run_analysis`, `run_enrichment_analysis`, `apply_normalization`).
+- Docstrings may follow numpy, google, or classic Sphinx styles.
+
+## Pull Request Guidelines
+
+Before you submit a pull request, check that it meets these guidelines:
+
+1. The pull request should include tests.
+2. If the pull request adds functionality, update the docs. Put your new functionality
+   into a function with a docstring, and add the feature to the list in README.rst.
+3. The pull request should pass the GitHub workflows.
+
+See the PR template example:
+[Add module PR template](https://github.com/Multiomics-Analytics-Group/acore/blob/main/.github/workflows/PULL_REQUEST_TEMPLATE/module.md)
+
+## Deploying
+
+A reminder for maintainers on how to deploy:
+
+- Make sure all changes are committed (including an entry in HISTORY.rst).
+- Create a new [GitHub release](https://github.com/Multiomics-Analytics-Group/acore/releases)
+- The package will be deployed to PyPI if the tests pass.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
deleted file mode 100644
index 842eaae..0000000
--- a/CONTRIBUTING.rst
+++ /dev/null
@@ -1,138 +0,0 @@
-.. highlight:: shell
-
-============
-Contributing
-============
-
-Contributions are welcome, and they are greatly appreciated! Every little bit
-helps, and credit will always be given.
-
-You can contribute in many ways:
-
-Report Bugs
-~~~~~~~~~~~
-
-Report bugs at https://github.com/Multiomics-Analytics-Group/acore/issues.
-
-If you are reporting a bug, please include:
-
-* Your operating system name and version.
-* Any details about your local setup that might be helpful in troubleshooting.
-* Detailed steps to reproduce the bug.
-
-Fix Bugs
-~~~~~~~~
-
-Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
-wanted" is open to whoever wants to implement it.
-
-Implement Features
-~~~~~~~~~~~~~~~~~~
-
-Look through the GitHub issues for features. Anything tagged with "enhancement"
-and "help wanted" is open to whoever wants to implement it.
-
-Write Documentation
-~~~~~~~~~~~~~~~~~~~
-
-acore could always use more documentation, whether as part of the
-official acore docs, in docstrings, or even on the web in blog posts,
-articles, and such.
-
-Submit Feedback
-~~~~~~~~~~~~~~~
-
-The best way to send feedback is to file an issue at https://github.com/Multiomics-Analytics-Group/acore/issues.
-
-If you are proposing a feature:
-
-* Explain in detail how it would work.
-* Keep the scope as narrow as possible, to make it easier to implement.
-* Remember that this is a volunteer-driven project, and that contributions
-  are welcome :)
-
-Get Started!
-------------
-
-Ready to contribute? Here's how to set up `acore` for local development. Consider an 
-advanced editor to help you with some of the common steps described below, e.g. 
-`VSCode <https://code.visualstudio.com/docs/introvideos/basics>`_.
-
-1. Fork the `acore` repo on GitHub.
-2. Clone your fork locally::
-
-    $ git clone https://github.com/Multiomics-Analytics-Group/acore.git
-
-3. Install your local copy into a virtual environment. Assuming you have Python available 
-   on your system, this can be done using `venv`. Alternatives are conda environments
-   or uv to create and manage virtual environments::
-
-    $ cd acore/
-    $ python -m venv .env
-    $ source .env/bin/activate
-    $ pip install -e ".[dev]"
-
-If you work on a Windows shell, see the docs for instructions: 
-`How venvs work <https://docs.python.org/3/library/venv.html#how-venvs-work>`_
-
-4. Create a branch for local development::
-
-    $ git checkout -b name-of-your-bugfix-or-feature
-
-   Now you can make your changes locally.
-
-5. When you're done making changes, check that your changes are formatted and pass ruff 
-   checks and the tests at least in your development environment::
-
-    $ black .
-    $ ruff check src
-    $ pytest .
-
-   Some changes ruff can automatically fix for you, if you pass the `--fix` flag:
-
-   $ ruff check src --fix
-
-6. Commit your changes and push your branch to GitHub::
-
-    $ git add .
-    $ git commit -m "Your detailed description of your changes."
-    $ git push origin name-of-your-bugfix-or-feature
-
-7. Submit a pull request through the GitHub website.
-
-General design principles in the library
-----------------------------------------
-
-- at best only one type of `DataFrame` output per module (subpackage.). For example, 
-  the enrichment module should output a `DataFrame` which adheres to a single pandera
-  schema defined under `src/acore/types/enrichment_analysis.py`. Thus there is a 'type'
-  of enrichment analysis results.
-- User facing functions should have clear names and good docstrings. They can something
-  along `run_analysis` (`run_enrichment_analysis`) or
-  `apply_normalization` (`apply_step` or `apply_method`).
-- docstrings for functions can be numpy, google or the classic Sphinx one.
-
-
-Pull Request Guidelines
------------------------
-
-Before you submit a pull request, check that it meets these guidelines:
-
-1. The pull request should include tests.
-2. If the pull request adds functionality, the docs should be updated. Put
-   your new functionality into a function with a docstring, and add the
-   feature to the list in README.rst.
-3. The pull request should pass the workflows on GitHub.
-
-See for example the PR-Template for a module: 
-`Add module PR template <https://github.com/Multiomics-Analytics-Group/acore/blob/main/.github/workflows/PULL_REQUEST_TEMPLATE/module.md>`_.
-
-
-
-Deploying
----------
-
-A reminder for the maintainers on how to deploy.
-Make sure all your changes are committed (including an entry in HISTORY.rst).
-Then run create a new `GitHub release <https://github.com/Multiomics-Analytics-Group/acore/releases>`_.
-The code will then be deployed to PyPI if the tests pass.
diff --git a/HISTORY.md b/HISTORY.md
new file mode 100644
index 0000000..c8c33b1
--- /dev/null
+++ b/HISTORY.md
@@ -0,0 +1,5 @@
+# History
+
+Please see the Release notes on GitHub:
+
+[Multiomics-Analytics-Group/acore/releases](https://github.com/Multiomics-Analytics-Group/acore/releases)
diff --git a/HISTORY.rst b/HISTORY.rst
deleted file mode 100644
index 85c741f..0000000
--- a/HISTORY.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-=======
-History
-=======
-
-0.1.0 (2023-09-26)
-------------------
-
-* First release on PyPI.
diff --git a/docs/api_examples/batch_correction.ipynb b/docs/api_examples/batch_correction.ipynb
new file mode 100644
index 0000000..6974363
--- /dev/null
+++ b/docs/api_examples/batch_correction.ipynb
@@ -0,0 +1,430 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "27b8d53e",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# Batch correction of samples\n",
+    "\n",
+    "We will explore an Alzheimer dataset where the data was collected in four different sites.\n",
+    "We will see that the sites have a an effect where the data is in principal component space\n",
+    "and in UMAP space. We will then batch correct the data and see how the effect on these plots.\n",
+    "\n",
+    "Refers to the [`acore.batch_correction`](acore.batch_correction) module."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dcc1505c",
+   "metadata": {
+    "tags": [
+     "hide-output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "%pip install acore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "563697dc",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Optional\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import sklearn\n",
+    "import sklearn.impute\n",
+    "import sklearn.preprocessing\n",
+    "import vuecore.decomposition\n",
+    "\n",
+    "import acore.batch_correction\n",
+    "import acore.decomposition\n",
+    "\n",
+    "\n",
+    "def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:\n",
+    "    \"\"\"Fit and plot UMAP embedding with two components with colors defined by meta_column.\"\"\"\n",
+    "    embedding = acore.decomposition.umap.run_umap(\n",
+    "        X_scaled, y, random_state=random_state\n",
+    "    )\n",
+    "    ax = embedding.plot.scatter(\"UMAP 1\", \"UMAP 2\", c=meta_column, cmap=\"Paired\")\n",
+    "    return ax\n",
+    "\n",
+    "\n",
+    "def standard_normalize(X: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"Standard normalize data and keep indices of DataFrame.\"\"\"\n",
+    "    X_scaled = (\n",
+    "        sklearn.preprocessing.StandardScaler()\n",
+    "        .set_output(transform=\"pandas\")\n",
+    "        .fit_transform(X)\n",
+    "    )\n",
+    "    return X_scaled\n",
+    "\n",
+    "\n",
+    "def median_impute(X: pd.DataFrame) -> pd.DataFrame:\n",
+    "    X_imputed = (\n",
+    "        sklearn.impute.SimpleImputer(strategy=\"median\")\n",
+    "        .set_output(transform=\"pandas\")\n",
+    "        .fit_transform(X)\n",
+    "    )\n",
+    "    return X_imputed\n",
+    "\n",
+    "\n",
+    "def run_and_plot_pca(\n",
+    "    X_scaled,\n",
+    "    y,\n",
+    "    meta_column: Optional[str] = None,\n",
+    "    n_components: int = 4,\n",
+    ") -> tuple[pd.DataFrame, plt.Figure]:\n",
+    "    PCs, _ = acore.decomposition.pca.run_pca(X_scaled, n_components=n_components)\n",
+    "    PCs.columns = [s.replace(\"principal component\", \"PC\") for s in PCs.columns]\n",
+    "    fig = vuecore.decomposition.pca_grid(\n",
+    "        PCs=PCs, meta_column=y, n_components=n_components, meta_col_name=meta_column\n",
+    "    )\n",
+    "    return PCs, fig"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60ee225e",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Set some parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a32698b4",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "fname_metadata: str = (\n",
+    "    \"https://raw.githubusercontent.com/RasmussenLab/\"\n",
+    "    \"njab/HEAD/docs/tutorial/data/alzheimer/meta.csv\"  # clincial data\n",
+    ")\n",
+    "fname_omics: str = (\n",
+    "    \"https://raw.githubusercontent.com/RasmussenLab/\"\n",
+    "    \"njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv\"  # omics data\n",
+    ")\n",
+    "METACOL: str = \"_collection site\"  # target column in fname_metadata dataset (binary)\n",
+    "METACOL_LABEL: Optional[str] = \"site\"  # optional: rename target variable\n",
+    "n_features_max: int = 5\n",
+    "freq_cutoff: float = 0.5  # Omics cutoff for sample completeness\n",
+    "VAL_IDS: str = \"\"  #\n",
+    "VAL_IDS_query: str = \"\"\n",
+    "weights: bool = True\n",
+    "FOLDER = \"alzheimer\"\n",
+    "model_name = \"all\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bd7687c",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55b42bc2",
+   "metadata": {},
+   "source": [
+    "### Load proteomics (protein groups) data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "857b0522",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if METACOL_LABEL is None:\n",
+    "    METACOL_LABEL = METACOL\n",
+    "metadata = (\n",
+    "    pd.read_csv(fname_metadata, usecols=[\"Sample ID\", METACOL], index_col=0)\n",
+    "    .convert_dtypes()\n",
+    "    .rename(columns={METACOL: METACOL_LABEL})\n",
+    ")\n",
+    "omics = pd.read_csv(fname_omics, index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ad54348",
+   "metadata": {},
+   "source": [
+    "Data shapes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fedfab62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "omics.shape, metadata.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed9362eb",
+   "metadata": {},
+   "source": [
+    "See how common omics features are and remove feature below choosen frequency cutoff"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fca7328",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ax = omics.notna().sum().sort_values().plot(rot=90)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "954c4682",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "M_before = omics.shape[1]\n",
+    "omics = omics.dropna(thresh=int(len(omics) * freq_cutoff), axis=1)\n",
+    "M_after = omics.shape[1]\n",
+    "msg = (\n",
+    "    f\"Removed {M_before-M_after} features with more \"\n",
+    "    f\"than {freq_cutoff*100}% missing values.\"\n",
+    "    f\"\\nRemaining features: {M_after} (of {M_before})\"\n",
+    ")\n",
+    "print(msg)\n",
+    "# keep a map of all proteins in protein group, but only display first protein\n",
+    "# proteins are unique to protein groups\n",
+    "pg_map = {k: k.split(\";\")[0] for k in omics.columns}\n",
+    "omics = omics.rename(columns=pg_map)\n",
+    "# log2 transform raw intensity data:\n",
+    "omics = np.log2(omics + 1)\n",
+    "ax = (\n",
+    "    omics.notna()\n",
+    "    .sum()\n",
+    "    .sort_values()\n",
+    "    .plot(\n",
+    "        rot=90,\n",
+    "        ylabel=\"Number of samples\",\n",
+    "        xlabel=\"Proteins (ranked by missing values)\",\n",
+    "    )\n",
+    ")\n",
+    "omics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b1ba957f",
+   "metadata": {},
+   "source": [
+    "### Sample metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2ba978a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metadata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2877f00e",
+   "metadata": {},
+   "source": [
+    "Tabulate selected metadata and check for missing values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f0d0364",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "metadata[METACOL_LABEL].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9688dd46",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "target_counts = metadata[METACOL_LABEL].value_counts()\n",
+    "\n",
+    "if target_counts.sum() < len(metadata):\n",
+    "    print(\n",
+    "        \"Target has missing values.\"\n",
+    "        f\" Can only use {target_counts.sum()} of {len(metadata)} samples.\"\n",
+    "    )\n",
+    "    mask = metadata[METACOL_LABEL].notna()\n",
+    "    metadata, omics = metadata.loc[mask], omics.loc[mask]\n",
+    "\n",
+    "if METACOL_LABEL is None:\n",
+    "    METACOL_LABEL = METACOL_LABEL\n",
+    "y = metadata[METACOL_LABEL].astype(\"category\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "713a7b35",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "## Before batch correction\n",
+    "Explore data in PCA and UMAP space before batch correction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "986f7cab",
+   "metadata": {
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "omics_imp = median_impute(omics)\n",
+    "omics_imp_scaled = standard_normalize(omics_imp)\n",
+    "PCs, fig = run_and_plot_pca(omics_imp, y, METACOL_LABEL, n_components=4)\n",
+    "ax = plot_umap(omics_imp, y, METACOL_LABEL)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2bf8ca74",
+   "metadata": {},
+   "source": [
+    "## Combat batch correction\n",
+    "Correct for batch effects in the data using a robust regression approach removing\n",
+    "mean and scale effects out for each provided co-variate by batch.\n",
+    "Assumes normally distributed data.\n",
+    "\n",
+    "> ⚠️ Combat needs imputed data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc107f61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "X = median_impute(omics)\n",
+    "X = acore.batch_correction.combat_batch_correction(\n",
+    "    X.join(y),\n",
+    "    batch_col=\"site\",\n",
+    ")\n",
+    "X"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca46af9e",
+   "metadata": {},
+   "source": [
+    "Plot PCA and UMAP after batch correction on standard normalized data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cdc4afa2",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "PCs, fig = run_and_plot_pca(standard_normalize(X), y, METACOL_LABEL, n_components=4)\n",
+    "ax = plot_umap(X, y, METACOL_LABEL)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9fda9a50",
+   "metadata": {},
+   "source": [
+    "See change by substracting combat corrected data from original data.\n",
+    "- NAs in original data will remain NA below (no imputation done here)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9cecdd8",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "omics - X"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "tags,-all"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/api_examples/batch_correction.py b/docs/api_examples/batch_correction.py
new file mode 100644
index 0000000..8134052
--- /dev/null
+++ b/docs/api_examples/batch_correction.py
@@ -0,0 +1,233 @@
+# ---
+# jupyter:
+#   jupytext:
+#     cell_metadata_filter: tags,-all
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.18.1
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Batch correction of samples
+#
+# We will explore an Alzheimer dataset where the data was collected in four different sites.
+# We will see that the sites have a an effect where the data is in principal component space
+# and in UMAP space. We will then batch correct the data and see how the effect on these plots.
+#
+# Refers to the [`acore.batch_correction`](acore.batch_correction) module.
+
+
+# %% tags=["hide-output"]
+# %pip install acore
+
+# %% tags=["hide-input"]
+from typing import Optional
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import sklearn
+import sklearn.impute
+import sklearn.preprocessing
+import vuecore.decomposition
+
+import acore.batch_correction
+import acore.decomposition
+
+
+def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:
+    """Fit and plot UMAP embedding with two components with colors defined by meta_column."""
+    embedding = acore.decomposition.umap.run_umap(
+        X_scaled, y, random_state=random_state
+    )
+    ax = embedding.plot.scatter("UMAP 1", "UMAP 2", c=meta_column, cmap="Paired")
+    return ax
+
+
+def standard_normalize(X: pd.DataFrame) -> pd.DataFrame:
+    """Standard normalize data and keep indices of DataFrame."""
+    X_scaled = (
+        sklearn.preprocessing.StandardScaler()
+        .set_output(transform="pandas")
+        .fit_transform(X)
+    )
+    return X_scaled
+
+
+def median_impute(X: pd.DataFrame) -> pd.DataFrame:
+    X_imputed = (
+        sklearn.impute.SimpleImputer(strategy="median")
+        .set_output(transform="pandas")
+        .fit_transform(X)
+    )
+    return X_imputed
+
+
+def run_and_plot_pca(
+    X_scaled,
+    y,
+    meta_column: Optional[str] = None,
+    n_components: int = 4,
+) -> tuple[pd.DataFrame, plt.Figure]:
+    PCs, _ = acore.decomposition.pca.run_pca(X_scaled, n_components=n_components)
+    PCs.columns = [s.replace("principal component", "PC") for s in PCs.columns]
+    fig = vuecore.decomposition.pca_grid(
+        PCs=PCs, meta_column=y, n_components=n_components, meta_col_name=meta_column
+    )
+    return PCs, fig
+
+
+# %% [markdown]
+#
+# ## Set some parameters
+
+# %% tags=["parameters"]
+fname_metadata: str = (
+    "https://raw.githubusercontent.com/RasmussenLab/"
+    "njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"  # clincial data
+)
+fname_omics: str = (
+    "https://raw.githubusercontent.com/RasmussenLab/"
+    "njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv"  # omics data
+)
+METACOL: str = "_collection site"  # target column in fname_metadata dataset (binary)
+METACOL_LABEL: Optional[str] = "site"  # optional: rename target variable
+n_features_max: int = 5
+freq_cutoff: float = 0.5  # Omics cutoff for sample completeness
+VAL_IDS: str = ""  #
+VAL_IDS_query: str = ""
+weights: bool = True
+FOLDER = "alzheimer"
+model_name = "all"
+
+# %% [markdown]
+# ## Setup
+
+# %% [markdown]
+# ### Load proteomics (protein groups) data
+
+# %%
+if METACOL_LABEL is None:
+    METACOL_LABEL = METACOL
+metadata = (
+    pd.read_csv(fname_metadata, usecols=["Sample ID", METACOL], index_col=0)
+    .convert_dtypes()
+    .rename(columns={METACOL: METACOL_LABEL})
+)
+omics = pd.read_csv(fname_omics, index_col=0)
+
+# %% [markdown]
+# Data shapes
+
+# %%
+omics.shape, metadata.shape
+
+# %% [markdown]
+# See how common omics features are and remove feature below choosen frequency cutoff
+
+# %%
+ax = omics.notna().sum().sort_values().plot(rot=90)
+
+# %% tags=["hide-input"]
+M_before = omics.shape[1]
+omics = omics.dropna(thresh=int(len(omics) * freq_cutoff), axis=1)
+M_after = omics.shape[1]
+msg = (
+    f"Removed {M_before-M_after} features with more "
+    f"than {freq_cutoff*100}% missing values."
+    f"\nRemaining features: {M_after} (of {M_before})"
+)
+print(msg)
+# keep a map of all proteins in protein group, but only display first protein
+# proteins are unique to protein groups
+pg_map = {k: k.split(";")[0] for k in omics.columns}
+omics = omics.rename(columns=pg_map)
+# log2 transform raw intensity data:
+omics = np.log2(omics + 1)
+ax = (
+    omics.notna()
+    .sum()
+    .sort_values()
+    .plot(
+        rot=90,
+        ylabel="Number of samples",
+        xlabel="Proteins (ranked by missing values)",
+    )
+)
+omics
+
+# %% [markdown]
+# ### Sample metadata
+
+# %%
+metadata
+
+# %% [markdown]
+# Tabulate selected metadata and check for missing values
+
+# %% tags=["hide-input"]
+metadata[METACOL_LABEL].value_counts(dropna=False)
+
+# %% tags=["hide-input"]
+target_counts = metadata[METACOL_LABEL].value_counts()
+
+if target_counts.sum() < len(metadata):
+    print(
+        "Target has missing values."
+        f" Can only use {target_counts.sum()} of {len(metadata)} samples."
+    )
+    mask = metadata[METACOL_LABEL].notna()
+    metadata, omics = metadata.loc[mask], omics.loc[mask]
+
+if METACOL_LABEL is None:
+    METACOL_LABEL = METACOL_LABEL
+y = metadata[METACOL_LABEL].astype("category")
+
+# %% [markdown]
+# ## Before batch correction
+# Explore data in PCA and UMAP space before batch correction
+
+
+# %% tags=["hide-input"]
+omics_imp = median_impute(omics)
+omics_imp_scaled = standard_normalize(omics_imp)
+PCs, fig = run_and_plot_pca(omics_imp, y, METACOL_LABEL, n_components=4)
+ax = plot_umap(omics_imp, y, METACOL_LABEL)
+
+
+# %% [markdown]
+# ## Combat batch correction
+# Correct for batch effects in the data using a robust regression approach removing
+# mean and scale effects out for each provided co-variate by batch.
+# Assumes normally distributed data.
+#
+# > ⚠️ Combat needs imputed data
+
+# %%
+# %%time
+X = median_impute(omics)
+X = acore.batch_correction.combat_batch_correction(
+    X.join(y),
+    batch_col="site",
+)
+X
+
+# %% [markdown]
+# Plot PCA and UMAP after batch correction on standard normalized data
+
+# %% tags=["hide-input"]
+PCs, fig = run_and_plot_pca(standard_normalize(X), y, METACOL_LABEL, n_components=4)
+ax = plot_umap(X, y, METACOL_LABEL)
+
+# %% [markdown]
+# See change by substracting combat corrected data from original data.
+# - NAs in original data will remain NA below (no imputation done here)
+
+# %% tags=["hide-input"]
+omics - X
diff --git a/docs/api_examples/normalization_analysis.ipynb b/docs/api_examples/normalization_analysis.ipynb
index 84b7360..04e8ceb 100644
--- a/docs/api_examples/normalization_analysis.ipynb
+++ b/docs/api_examples/normalization_analysis.ipynb
@@ -5,13 +5,13 @@
    "id": "e31da646",
    "metadata": {},
    "source": [
-    "# Normalization of samples in a dataset example\n",
+    "# Normalization of samples\n",
     "\n",
     "We will explore an Alzheimer dataset where the data was collected in four different sites.\n",
     "We will see that the sites have a an effect where the data is in principal component space\n",
     "and in UMAP space. We will then normalize the data and see how the effect on these plots.\n",
     "\n",
-    "Refers to the `acore.normalization` module."
+    "Refers to the [`acore.normalization`](acore.normalization) module."
    ]
   },
   {
@@ -63,7 +63,6 @@
     "\n",
     "import acore.decomposition\n",
     "import acore.normalization\n",
-    "import acore.sklearn\n",
     "\n",
     "\n",
     "def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:\n",
@@ -470,92 +469,6 @@
      "iopub.status.idle": "2024-10-15T08:04:45.540528Z",
      "shell.execute_reply": "2024-10-15T08:04:45.539880Z"
     },
-    "tags": [
-     "hide-input"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "omics"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8e972d1e",
-   "metadata": {},
-   "source": [
-    "## Combat normalization\n",
-    "Correct for batch effects in the data using a robust regression approach normalizing\n",
-    "mean and scale effetcs out for each feature by batch. Assumes normally distributed data.\n",
-    "\n",
-    "> ⚠️ Combat needs imputed data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c3a7d04d",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-10-15T08:04:45.611270Z",
-     "iopub.status.busy": "2024-10-15T08:04:45.611049Z",
-     "iopub.status.idle": "2024-10-15T08:04:45.634947Z",
-     "shell.execute_reply": "2024-10-15T08:04:45.634525Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "%%time\n",
-    "X = median_impute(omics)\n",
-    "X = acore.normalization.combat_batch_correction(\n",
-    "    X.join(y),\n",
-    "    batch_col=\"site\",\n",
-    ")\n",
-    "X"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aff9589b",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-10-15T08:04:45.667463Z",
-     "iopub.status.busy": "2024-10-15T08:04:45.667004Z",
-     "iopub.status.idle": "2024-10-15T08:04:46.730589Z",
-     "shell.execute_reply": "2024-10-15T08:04:46.730346Z"
-    },
-    "tags": [
-     "hide-input"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "omics_imp = median_impute(X)\n",
-    "omics_imp_scaled = standard_normalize(omics_imp)\n",
-    "PCs, fig = run_and_plot_pca(omics_imp_scaled, y, METACOL_LABEL, n_components=4)\n",
-    "ax = plot_umap(omics_imp_scaled, y, METACOL_LABEL)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "57ffebe1",
-   "metadata": {},
-   "source": [
-    "See change by substracting combat normalized data from original data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02d6773f",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-10-15T08:04:46.789884Z",
-     "iopub.status.busy": "2024-10-15T08:04:46.789752Z",
-     "iopub.status.idle": "2024-10-15T08:04:46.796400Z",
-     "shell.execute_reply": "2024-10-15T08:04:46.796180Z"
-    },
     "lines_to_next_cell": 2,
     "tags": [
      "hide-input"
@@ -563,7 +476,7 @@
    },
    "outputs": [],
    "source": [
-    "omics - X"
+    "omics"
    ]
   },
   {
diff --git a/docs/api_examples/normalization_analysis.py b/docs/api_examples/normalization_analysis.py
index 424dac8..7729b7c 100644
--- a/docs/api_examples/normalization_analysis.py
+++ b/docs/api_examples/normalization_analysis.py
@@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.17.2
+#       jupytext_version: 1.18.1
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
@@ -14,13 +14,13 @@
 # ---
 
 # %% [markdown]
-# # Normalization of samples in a dataset example
+# # Normalization of samples
 #
 # We will explore an Alzheimer dataset where the data was collected in four different sites.
 # We will see that the sites have a an effect where the data is in principal component space
 # and in UMAP space. We will then normalize the data and see how the effect on these plots.
 #
-# Refers to the `acore.normalization` module.
+# Refers to the [`acore.normalization`](acore.normalization) module.
 
 # %% tags=["hide-output"]
 # %pip install acore
@@ -38,7 +38,6 @@
 
 import acore.decomposition
 import acore.normalization
-import acore.sklearn
 
 
 def plot_umap(X_scaled, y, meta_column, random_state=42) -> plt.Axes:
@@ -225,34 +224,6 @@ def run_and_plot_pca(
 # %% tags=["hide-input"]
 omics
 
-# %% [markdown]
-# ## Combat normalization
-# Correct for batch effects in the data using a robust regression approach normalizing
-# mean and scale effetcs out for each feature by batch. Assumes normally distributed data.
-#
-# > ⚠️ Combat needs imputed data
-
-# %%
-# %%time
-X = median_impute(omics)
-X = acore.normalization.combat_batch_correction(
-    X.join(y),
-    batch_col="site",
-)
-X
-
-# %% tags=["hide-input"]
-omics_imp = median_impute(X)
-omics_imp_scaled = standard_normalize(omics_imp)
-PCs, fig = run_and_plot_pca(omics_imp_scaled, y, METACOL_LABEL, n_components=4)
-ax = plot_umap(omics_imp_scaled, y, METACOL_LABEL)
-
-# %% [markdown]
-# See change by substracting combat normalized data from original data.
-
-# %% tags=["hide-input"]
-omics - X
-
 
 # %% [markdown]
 # ## Median normalization
diff --git a/docs/authors.md b/docs/authors.md
new file mode 100644
index 0000000..fa8dfb5
--- /dev/null
+++ b/docs/authors.md
@@ -0,0 +1,4 @@
+```{include} ../AUTHORS.md
+:relative-docs: docs
+:relative-images:
+```
diff --git a/docs/authors.rst b/docs/authors.rst
deleted file mode 100644
index e122f91..0000000
--- a/docs/authors.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../AUTHORS.rst
diff --git a/docs/conf.py b/docs/conf.py
index f40d78e..332a4f8 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,7 +1,26 @@
 import os
+from importlib import metadata
 from pathlib import Path
 
-import acore
+# -- Project information -----------------------------------------------------
+
+# General information about the project.
+project = "acore"
+copyright = "2026, Multi-omics Network Analytics Group"
+author = "Multi-omics Network Analytics Group, Sebastián Ayala Ruano, Henry Webel, Alberto Santos"
+PACKAGE_VERSION = metadata.version("acore")
+version = PACKAGE_VERSION
+release = PACKAGE_VERSION
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
+# -- General configuration ---------------------------------------------------
+
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -14,7 +33,6 @@
     # "sphinx.ext.doctest",
     "sphinx.ext.intersphinx",
     # "sphinx.ext.autosummary",
-    # "sphinx.ext.todo",
     # "sphinx.ext.mathjax",
     # "sphinx.ext.coverage",
     # "sphinx.ext.imgmath",
@@ -28,12 +46,52 @@
 myst_enable_extensions = [
     # "strikethrough",
     "dollarmath",
-    # "amsmath",
+    "amsmath",
+    "colon_fence",
 ]
 
+#  https://myst-nb.readthedocs.io/en/latest/computation/execute.html
+nb_execution_mode = "auto"
+
+# https://myst-nb.readthedocs.io/en/latest/configuration.html
+# Execution
+nb_execution_raise_on_error = True
+# Rendering
+nb_merge_streams = True
+# maximum execution time per cell in seconds
+nb_execution_timeout = 120
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
 
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = [
+    "_build",
+    "Thumbs.db",
+    ".DS_Store",
+    "jupyter_execute",
+    "conf.py",
+    "fetch_files.py",
+]
+
+
+# Intersphinx options
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "pandera": ("https://pandera.readthedocs.io/en/stable/", None),
+    "pydantic": ("https://docs.pydantic.dev/latest", None),
+    # "scikit-learn": ("https://scikit-learn.org/stable/", None),
+    # "matplotlib": ("https://matplotlib.org/stable/", None),
+}
+
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
 add_module_names = False
@@ -54,62 +112,8 @@
 ## Generate autodoc stubs with summaries from code
 autosummary_generate = True
 
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = ".rst"
-
-# The master toctree document.
-master_doc = "index"
-
-# General information about the project.
-project = "acore"
-copyright = "2023, Multi-omics Network Analytics Group"
-author = "Multi-omics Network Analytics Group"
-
-# The version info for the project you're documenting, acts as replacement
-# for |version| and |release|, also used in various other places throughout
-# the built documents.
-#
-# The short X.Y version.
-version = acore.__version__
-# The full version, including alpha/beta/rc tags.
-release = acore.__version__
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = "en"
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = [
-    "_build",
-    "Thumbs.db",
-    ".DS_Store",
-    "jupyter_execute",
-    "conf.py",
-    "fetch_files.py",
-]
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
 # -- General configuration ---------------------------------------------------
 
-#  https://myst-nb.readthedocs.io/en/latest/computation/execute.html
-nb_execution_mode = "auto"
-nb_execution_timeout = (
-    90  # increase limit of default 30 seconds max execution time per cell
-)
-
 myst_enable_extensions = ["dollarmath", "amsmath", "colon_fence"]
 
 # Plolty support through require javascript library
@@ -120,30 +124,6 @@
     "https://cdn.plot.ly/plotly-3.0.1.min.js",
 ]
 
-# https://myst-nb.readthedocs.io/en/latest/configuration.html
-# Execution
-nb_execution_raise_on_error = True
-# Rendering
-nb_merge_streams = True
-# maximum execution time per cell in seconds
-nb_execution_timeout = 120
-
-# https://myst-nb.readthedocs.io/en/latest/authoring/custom-formats.html#write-custom-formats
-# nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]}
-# ! better use jupytext directly to sync the notebook formats -> allows to download ipynb
-# ! and see start in colab button
-
-
-# Intersphinx options
-intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
-    "numpy": ("https://numpy.org/doc/stable/", None),
-    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
-    "pandera": ("https://pandera.readthedocs.io/en/stable/", None),
-    # "scikit-learn": ("https://scikit-learn.org/stable/", None),
-    # "matplotlib": ("https://matplotlib.org/stable/", None),
-}
-
 
 # -- Options for HTML output -------------------------------------------------
 
@@ -177,69 +157,6 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 # html_static_path = ['_static']
 
-
-# -- Options for HTMLHelp output ---------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = "acoredoc"
-
-
-# -- Options for LaTeX output ------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass
-# [howto, manual, or own class]).
-latex_documents = [
-    (
-        master_doc,
-        "acore.tex",
-        "Analytics Core Documentation",
-        "Multi-omics Network Analytics Group",
-        "manual",
-    ),
-]
-
-
-# -- Options for manual page output ------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "acore", "Analytics Core Documentation", [author], 1)]
-
-
-# -- Options for Texinfo output ----------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (
-        master_doc,
-        "acore",
-        "Analytics Core Documentation",
-        author,
-        "acore",
-        "Oneline description of project.",
-        "Miscellaneous",
-    ),
-]
-
-
 # -- Setup for sphinx-apidoc -------------------------------------------------
 
 # sphinx-apidoc needs to be called manually if Sphinx is running there.
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 0000000..10d8391
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,4 @@
+```{include} ../CONTRIBUTING.md
+:relative-docs: docs
+:relative-images:
+```
diff --git a/docs/contributing.rst b/docs/contributing.rst
deleted file mode 100644
index e582053..0000000
--- a/docs/contributing.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../CONTRIBUTING.rst
diff --git a/docs/history.md b/docs/history.md
new file mode 100644
index 0000000..26f1255
--- /dev/null
+++ b/docs/history.md
@@ -0,0 +1,4 @@
+```{include} ../HISTORY.md
+:relative-docs: docs
+:relative-images:
+```
diff --git a/docs/history.rst b/docs/history.rst
deleted file mode 100644
index 2506499..0000000
--- a/docs/history.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../HISTORY.rst
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..ab97f42
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,69 @@
+# Acore Documentation
+
+```{include} ./sections_readme/home_page.md
+:caption: VueGen
+:relative-docs: docs
+:relative-images:
+:start-line: 0
+:end-line: 40
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Overview
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: API usage examples
+:hidden:
+
+api_examples/normalization_analysis
+api_examples/batch_correction
+api_examples/exploratory_analysis
+api_examples/ANCOVA_analysis
+api_examples/enrichment_analysis
+api_examples/permutation_testing
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Reanalysis
+:hidden:
+
+PXD040621 <downloaded/PXD040621_analysis>
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Example dataset (preprocessing)
+:hidden:
+
+api_examples/Download_PRIDE_data
+api_examples/ovarian_cancer
+```
+
+```{toctree}
+:maxdepth: 2
+:caption: Modules
+:hidden:
+
+reference/acore
+```
+
+## Indices and tables
+
+- [General Index](genindex)
+- [Module Index](modindex)
+- [Search Page](search)
+
+```{toctree}
+:maxdepth: 1
+:caption: MISC:
+:hidden:
+
+contributing
+authors
+history
+README.md
+```
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index af0844f..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,63 +0,0 @@
-Acore documentation
-=================================
-
-.. include:: sections_readme/home_page.md
-   :parser: myst_parser.sphinx_
-   :start-line: 0
-   :end-line: 40
-
-.. only supported in myst markdown for include...
-.. :relative-docs: docs
-.. :relative-images:
-
-.. toctree::
-   :maxdepth: 1
-   :caption: API usage examples
-   :hidden:
-
-   api_examples/exploratory_analysis
-   api_examples/normalization_analysis
-   api_examples/ANCOVA_analysis
-   api_examples/enrichment_analysis
-   api_examples/permutation_testing
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Reanalysis
-   :hidden:
-
-   PXD040621 <downloaded/PXD040621_analysis>
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Example dataset (preprocessing)
-   :hidden:
-
-   api_examples/Download_PRIDE_data
-   api_examples/ovarian_cancer
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Modules
-   :hidden:
-
-   reference/acore
-
-Indices and tables
-------------------
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
-
-.. toctree::
-   :maxdepth: 1
-   :caption: MISC:
-   :hidden:
-
-   contributing
-   authors
-   history
-   README.md
-
-
diff --git a/example_data/syntethic_pep_enrichment/1_analysis.py b/example_data/syntethic_pep_enrichment/1_analysis.py
index e61a792..cdd997c 100644
--- a/example_data/syntethic_pep_enrichment/1_analysis.py
+++ b/example_data/syntethic_pep_enrichment/1_analysis.py
@@ -8,6 +8,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from scipy.stats import fisher_exact, hypergeom
 
 import acore.enrichment_analysis as ea
 
@@ -57,8 +58,6 @@
 # playing with the components of single test to understand
 # fisher exact test and how it works:
 
-# %%
-from scipy.stats import fisher_exact
 
 # %%
 num_in_foreground = 4
@@ -87,8 +86,6 @@
 # - foreground population and number of peptides in foreground are important
 
 # %%
-from scipy.stats import hypergeom
-
 [M, n, N] = [background_pop, foreground_pop, num_in_background + num_in_foreground]
 rv = hypergeom(M, n, N)
 x = np.arange(0, n + 1)
diff --git a/src/acore/batch_correction/__init__.py b/src/acore/batch_correction/__init__.py
new file mode 100644
index 0000000..8af5447
--- /dev/null
+++ b/src/acore/batch_correction/__init__.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pandas as pd
+
+# PyPI pycombat as described in the docstring was never used:
+# https://pypi.org/project/pycombat/
+# https://github.com/epigenelabs/inmoose is the update from combat.pycombat on PyPI
+# from combat.pycombat import pycombat
+from inmoose.pycombat import pycombat_norm
+
+__all__ = ["combat_batch_correction"]
+
+
+def combat_batch_correction(
+    data: pd.DataFrame,
+    batch_col: str,
+    # index_cols: list[str],
+) -> pd.DataFrame:
+    """
+    This function corrects processed data for batch effects. For more information visit:
+    https://github.com/epigenelabs/inmoose
+
+    :param data: pandas.DataFrame with samples as rows and protein identifiers as columns.
+    :param batch_col: column with the batch identifiers
+    :return: pandas.DataFrame with samples as rows and protein identifiers as columns.
+    Example::
+        result = combat_batch_correction(
+                    data,
+                    batch_col="batch",
+                    index_cols=["subject", "sample", "group"],
+                )
+
+    """
+    # :param index_cols: list of columns that don't need to be corrected (i.e group)
+    df_corrected = pd.DataFrame()
+    # index_cols = [c for c in index_cols if c != batch_col]
+    # data = data.set_index(index_cols)  # ? should this not be provided directly as data
+    df = data.drop(batch_col, axis=1)
+    df_numeric = df.select_dtypes("number")
+    num_batches = len(data[batch_col].unique())
+    if df_numeric.empty:
+        raise ValueError("No numeric columns found in data.")
+    if not num_batches > 1:
+        raise ValueError("Only one batch found in data.")
+    info_cols = df.columns.difference(df_numeric.columns)
+    df_corrected = pd.DataFrame(
+        pycombat_norm(df_numeric.T, data[batch_col]).T,
+        index=df.index,
+    )
+    df_corrected = df_corrected.join(df[info_cols])
+    # df_corrected = df_corrected  # .reset_index()  # ? would also not reset index here
+
+    return df_corrected
diff --git a/src/acore/normalization/__init__.py b/src/acore/normalization/__init__.py
index a24f5b3..4c05d14 100644
--- a/src/acore/normalization/__init__.py
+++ b/src/acore/normalization/__init__.py
@@ -10,12 +10,6 @@
 
 import pandas as pd
 
-# PyPI pycombat as described in the docstring was never used:
-# https://pypi.org/project/pycombat/
-# https://github.com/epigenelabs/inmoose is the update from combat.pycombat on PyPI
-# from combat.pycombat import pycombat
-from inmoose.pycombat import pycombat_norm
-
 from .strategies import (
     linear_normalization,
     median_normalization,
@@ -28,48 +22,6 @@
 __all__ = ["combat_batch_correction", "normalize_data", "normalize_data_per_group"]
 
 
-def combat_batch_correction(
-    data: pd.DataFrame,
-    batch_col: str,
-    # index_cols: list[str],
-) -> pd.DataFrame:
-    """
-    This function corrects processed data for batch effects. For more information visit:
-    https://github.com/epigenelabs/inmoose
-
-    :param data: pandas.DataFrame with samples as rows and protein identifiers as columns.
-    :param batch_col: column with the batch identifiers
-    :return: pandas.DataFrame with samples as rows and protein identifiers as columns.
-    Example::
-        result = combat_batch_correction(
-                    data,
-                    batch_col="batch",
-                    index_cols=["subject", "sample", "group"],
-                )
-
-    """
-    # :param index_cols: list of columns that don't need to be corrected (i.e group)
-    df_corrected = pd.DataFrame()
-    # index_cols = [c for c in index_cols if c != batch_col]
-    # data = data.set_index(index_cols)  # ? should this not be provided directly as data
-    df = data.drop(batch_col, axis=1)
-    df_numeric = df.select_dtypes("number")
-    num_batches = len(data[batch_col].unique())
-    if df_numeric.empty:
-        raise ValueError("No numeric columns found in data.")
-    if not num_batches > 1:
-        raise ValueError("Only one batch found in data.")
-    info_cols = df.columns.difference(df_numeric.columns)
-    df_corrected = pd.DataFrame(
-        pycombat_norm(df_numeric.T, data[batch_col]).T,
-        index=df.index,
-    )
-    df_corrected = df_corrected.join(df[info_cols])
-    # df_corrected = df_corrected  # .reset_index()  # ? would also not reset index here
-
-    return df_corrected
-
-
 def normalize_data_per_group(
     data: pd.DataFrame,
     group: str | int | list[str | int],
diff --git a/tests/test_normalization.py b/tests/test_normalization.py
index 57f5fca..e04f48f 100644
--- a/tests/test_normalization.py
+++ b/tests/test_normalization.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from acore import normalization as normalization
+from acore import batch_correction, normalization
 
 
 def test_combat_batch_correction():
@@ -21,7 +21,7 @@ def test_combat_batch_correction():
         3: {"a": 3.3051631491010953, "b": 4.671866609626733, "c": 9.362703235564519},
         4: {"a": 2.537904319175784, "b": 3.589114860839033, "c": 6.885375054723476},
     }
-    actual = normalization.combat_batch_correction(data, "batch").to_dict(
+    actual = batch_correction.combat_batch_correction(data, "batch").to_dict(
         orient="index"
     )
     assert actual == expected

From 7f74321624624f6881c07639891130a667115eae Mon Sep 17 00:00:00 2001
From: angelphanth <angel.phanth@hotmail.com>
Date: Thu, 12 Mar 2026 09:31:58 +0000
Subject: [PATCH 25/25] :truck: renamed file and moved to 'transform' dir

---
 .../internal_functions.py => transform/compositional.py}          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/acore/{microbiome/internal_functions.py => transform/compositional.py} (100%)

diff --git a/src/acore/microbiome/internal_functions.py b/src/acore/transform/compositional.py
similarity index 100%
rename from src/acore/microbiome/internal_functions.py
rename to src/acore/transform/compositional.py