From a60e879658d84f810db97f49077d85b32941824e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 17:14:15 +0200
Subject: [PATCH 1/8] Refactor ClusterNode, fix the bug when two nodes had the
 same stds, extend code to support multiple children

---
 tests/test_bahc.py                            |  11 ++
 unsupervised_bias_detection/cluster/_bahc.py  | 109 +++++++++---------
 .../cluster/_cluster_node.py                  |  17 ++-
 .../cluster/_kmeans.py                        |   7 +-
 .../cluster/_kmodes.py                        |   7 +-
 5 files changed, 83 insertions(+), 68 deletions(-)

diff --git a/tests/test_bahc.py b/tests/test_bahc.py
index 739963d..74619c3 100644
--- a/tests/test_bahc.py
+++ b/tests/test_bahc.py
@@ -27,6 +27,17 @@ def test_labels():
     # Checks that cluster sizes are at least bahc_min_cluster_size
 
 
+def test_constant_metric():
+    # Checks that there is only one cluster with a score of 0 if the metric is constant
+    rng = np.random.RandomState(12)
+    X = rng.rand(20, 10)
+    y = np.full(20, rng.rand())
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    assert bahc.n_clusters_ == 1
+    assert bahc.scores_[0] == 0
+
+
 def test_scores():
     # Checks that scores are computed correctly
     rng = np.random.RandomState(12)
diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index bd3ba2a..ecfeb1b 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -10,7 +10,8 @@
 
 
 class BiasAwareHierarchicalClustering(BaseEstimator, ClusterMixin):
-    """TODO: Add docstring.
+    """
+    TODO: Add docstring.
 
     References
     ----------
@@ -65,19 +66,18 @@ def fit(self, X, y):
         # We assign all samples a label of zero
         labels = np.zeros(n_samples, dtype=np.uint32)
         leaves = []
-        scores = []
-        label = 0
-        root = ClusterNode(label)
+        std = np.std(y)
+        score = 0
+        root = ClusterNode(std, score)
         self.cluster_tree_ = root
         # The entire dataset has a discrimination score of zero
-        score = 0
-        heap = [(None, root, score)]
+        heap = [root]
         for _ in range(self.bahc_max_iter):
             if not heap:
                 # If the heap is empty we stop iterating
                 break
             # Take the cluster with the highest standard deviation of metric y
-            _, node, score = heapq.heappop(heap)
+            node = heapq.heappop(heap)
             label = node.label
             cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
@@ -85,55 +85,58 @@ def fit(self, X, y):
             clustering_model = self.clustering_cls(**self.clustering_params)
             cluster_labels = clustering_model.fit_predict(cluster)
 
-            # TODO: Generalize for more than 2 clusters
-            # Can do this by checking clustering_model.n_clusters_ (if it exists)
-            # or by checking the number of unique values in cluster_labels
-            indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
-            indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
-            if (
-                len(indices0) >= self.bahc_min_cluster_size
-                and len(indices1) >= self.bahc_min_cluster_size
-            ):
-                # We calculate the discrimination scores using formula (1) in [1]
-                # TODO: Move y[indices0] and y[indices1] into separate variables
-                # to avoid recomputing them
-                # Maybe create a function to compute the score
-                mask0 = np.ones(n_samples, dtype=bool)
-                mask0[indices0] = False
-                score0 = np.mean(y[mask0]) - np.mean(y[indices0])
-                mask1 = np.ones(n_samples, dtype=bool)
-                mask1[indices1] = False
-                score1 = np.mean(y[mask1]) - np.mean(y[indices1])
-                if max(score0, score1) >= score:
-                    std0 = np.std(y[indices0])
-                    node0 = ClusterNode(label)
-                    # heapq implements min-heap
-                    # so we have to negate std before pushing
-                    heapq.heappush(heap, (-std0, node0, score0))
-                    std1 = np.std(y[indices1])
-                    node1 = ClusterNode(self.n_clusters_)
-                    heapq.heappush(heap, (-std1, node1, score1))
-                    labels[indices1] = self.n_clusters_
-                    # TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1
-                    self.n_clusters_ += 1
-                    children = [node0, node1]
-                    node.split(clustering_model, children)
+            if hasattr(clustering_model, "n_clusters_"):
+                n_children = clustering_model.n_clusters_
+            else:
+                n_children = len(np.unique(cluster_labels))
+            
+            # We first check if all child clusters meet the minimum size requirement
+            valid_split = True
+            children_indices = []
+            for i in range(n_children):
+                child_indices = cluster_indices[np.nonzero(cluster_labels == i)[0]]
+                if len(child_indices) >= self.bahc_min_cluster_size:
+                    children_indices.append(child_indices)
                 else:
                     leaves.append(node)
-                    scores.append(score)
-            else:
-                leaves.append(node)
-                scores.append(score)
-        if heap:
-            # TODO: Check if this can be made more efficient
-            leaves.extend((node for _, node, _ in heap))
-            scores = np.concatenate([scores, [score for _, _, score in heap]])
-        else:
-            scores = np.array(scores)
-
+                    valid_split = False
+                    break
+            
+            # If all children clusters are of sufficient size, we check if the score of each child cluster is greater than or equal to the current score
+            if valid_split:
+                child_scores = []
+                for child_indices in children_indices:
+                    cluster_metric = y[child_indices]
+                    complement_mask = np.ones(n_samples, dtype=bool)
+                    complement_mask[child_indices] = False
+                    complement_metric = y[complement_mask]
+                    child_score = np.mean(complement_metric) - np.mean(cluster_metric)
+                    if child_score >= score:
+                        child_scores.append(child_score)
+                    else:
+                        leaves.append(node)
+                        valid_split = False
+                        break
+            
+            # If the split is valid, we create the children nodes and split the current node
+            if valid_split:
+                children = []
+                for i in range(n_children):
+                    child_indices = children_indices[i]
+                    child_std = np.std(y[child_indices])
+                    child_score = child_scores[i]
+                    # heapq implements min-heap
+                    # so we have to negate std before pushing
+                    child_node = ClusterNode(-child_std, child_score)
+                    children.append(child_node)
+                node.split(clustering_model, children)
+                self.n_clusters_ += n_children - 1
+        
+        leaves.extend(heap)
+        leaf_scores = np.array([leaf.score for leaf in leaves])
         # We sort clusters by decreasing scores
-        sorted_indices = np.argsort(-scores)
-        self.scores_ = scores[sorted_indices]
+        sorted_indices = np.argsort(-leaf_scores)
+        self.scores_ = leaf_scores[sorted_indices]
         leaf_labels = np.array([leaf.label for leaf in leaves])
         leaf_labels = leaf_labels[sorted_indices]
         label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py
index 2901c9b..9bd032c 100644
--- a/unsupervised_bias_detection/cluster/_cluster_node.py
+++ b/unsupervised_bias_detection/cluster/_cluster_node.py
@@ -1,8 +1,11 @@
+import itertools
 from sklearn.base import ClusterMixin
 from typing import Self
 
 class ClusterNode:
-    def __init__(self, label: int):
+    _id_counter = itertools.count()
+
+    def __init__(self, neg_std: float, score: float):
         """
         Initialize a node in the cluster tree.
         
@@ -11,7 +14,12 @@ def __init__(self, label: int):
         label : int
             The cluster label for this node (required as all nodes start as leaves)
         """
-        self.label = label
+        self.id = next(self._id_counter)
+        self.neg_std = neg_std
+        self.score = score
+        # The label is set to the id when the node is a leaf
+        # and is set to None when the node is split
+        self.label = self.id
         self.clustering_model = None
         self.children = []
     
@@ -19,6 +27,9 @@ def __init__(self, label: int):
     def is_leaf(self):
         return len(self.children) == 0
     
+    def __lt__(self, other: Self):
+        return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.id < other.id)
+    
     def split(self, clustering_model: ClusterMixin, children: list[Self]):
         """
         Split this node by setting its clustering model and adding children.
@@ -31,7 +42,7 @@ def split(self, clustering_model: ClusterMixin, children: list[Self]):
             The clustering model used to split this node
         children : list of ClusterNode
             The child nodes resulting from the split
-        """   
+        """
         self.label = None
         self.clustering_model = clustering_model
         self.children = children
diff --git a/unsupervised_bias_detection/cluster/_kmeans.py b/unsupervised_bias_detection/cluster/_kmeans.py
index e3b4aac..5906be0 100644
--- a/unsupervised_bias_detection/cluster/_kmeans.py
+++ b/unsupervised_bias_detection/cluster/_kmeans.py
@@ -48,12 +48,7 @@ def __init__(
         bahc_min_cluster_size,
         **kmeans_params,
     ):
-        # TODO: Remove this once we have a better way to handle the number of clusters
-        if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2:
-            raise ValueError(
-                f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}."
-            )
-        else:
+        if "n_clusters" not in kmeans_params:
             kmeans_params["n_clusters"] = 2
 
         if "n_init" not in kmeans_params:
diff --git a/unsupervised_bias_detection/cluster/_kmodes.py b/unsupervised_bias_detection/cluster/_kmodes.py
index 2fc84fa..82cbf44 100644
--- a/unsupervised_bias_detection/cluster/_kmodes.py
+++ b/unsupervised_bias_detection/cluster/_kmodes.py
@@ -43,12 +43,7 @@ class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin):
     """
 
     def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params):
-        # TODO: Remove this once we have a better way to handle the number of clusters
-        if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2:
-            raise ValueError(
-                f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}."
-            )
-        else:
+        if "n_clusters" not in kmodes_params:
             kmodes_params["n_clusters"] = 2
 
         self.bahc_max_iter = bahc_max_iter

From 686249f418852c49d55c49463c035ce5b3070270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 17:48:18 +0200
Subject: [PATCH 2/8] Add data and helper functions

---
 HBAC_scan/helper_functions.py                 | 492 ++++++++++++++++++
 .../german_dataset.py                         | 162 ++++++
 .../helper_functions.py                       | 149 ++++++
 data/GermanCredit_dataset/german_dataset.py   | 162 ++++++
 tests/test_bahc.py                            |  36 +-
 unsupervised_bias_detection/cluster/_bahc.py  |   6 +-
 .../cluster/_cluster_node.py                  |  10 +-
 7 files changed, 994 insertions(+), 23 deletions(-)
 create mode 100644 HBAC_scan/helper_functions.py
 create mode 100644 classifiers/Loan_approval_classifier/german_dataset.py
 create mode 100644 classifiers/Loan_approval_classifier/helper_functions.py
 create mode 100644 data/GermanCredit_dataset/german_dataset.py

diff --git a/HBAC_scan/helper_functions.py b/HBAC_scan/helper_functions.py
new file mode 100644
index 0000000..8e92025
--- /dev/null
+++ b/HBAC_scan/helper_functions.py
@@ -0,0 +1,492 @@
+import random
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import pingouin as pg
+import scipy.stats as stats
+
+# matplotlib
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from matplotlib.lines import Line2D
+from matplotlib import collections as mc
+
+# sklearn
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+
+def init_GermanCredit_dataset(
+    raw_data,
+    features,
+    with_errors=True,
+    just_features=True,
+    scale_features=True,
+    with_classes=True,
+):
+    """Initializing dataset: scaling features, adding new columns which are required for HBAC"""
+
+    new_data = raw_data.copy(deep=True)
+
+    to_scale = new_data.drop(
+        ["predicted_class", "true_class", "errors", "FP_errors", "FN_errors"], axis=1
+    ).columns
+    new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])
+
+    new_data["clusters"] = 0
+    new_data["new_clusters"] = -1
+    return new_data
+
+
+def init_dataset(raw_data, features):
+    """Initializing dataset: scaling features, adding new columns which are required for HBAC"""
+
+    # copy dataframe
+    new_data = raw_data.copy(deep=True)
+
+    # only scale features
+    to_scale = new_data.drop(
+        ["predicted_class", "true_class", "errors", "FP_errors", "FN_errors"], axis=1
+    ).columns
+    new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])
+
+    # initialize clustering parameters
+    new_data["clusters"] = 0
+    new_data["new_clusters"] = -1
+
+    return new_data
+
+
+def bias(results, metric):
+    """Return accuracy, FP rate or FN rate of dataframe"""
+
+    if metric == "Accuracy":
+        correct = results.loc[results["errors"] == 0]
+        acc = len(correct) / len(results)
+        return acc
+    if metric == "FP":
+        FPs = results.loc[
+            (results["predicted_class"] == 1) & (results["true_class"] == 0)
+        ]
+        Ns = results.loc[(results["true_class"] == 0)]
+        if Ns.shape[0] != 0:
+            FP_rate = len(FPs) / len(Ns)
+            return 1 - FP_rate
+        else:
+            return 1
+    if metric == "FN":
+        FNs = results.loc[
+            (results["predicted_class"] == 0) & (results["true_class"] == 1)
+        ]
+        Ps = results.loc[(results["true_class"] == 1)]
+        if Ps.shape[0] != 0:
+            FN_rate = len(FNs) / len(Ps)
+            return 1 - FN_rate
+        else:
+            return 1
+
+
+def bias_acc(data, metric, cluster_id, cluster_col):
+    """Bias := bias metric of the selected cluster - bias metric of the remaining clusters"""
+    cluster_x = data.loc[data[cluster_col] == cluster_id]
+    if len(cluster_x) == 0:
+        print("This is an empty cluster", cluster_id)
+    remaining_clusters = data.loc[data[cluster_col] != cluster_id]
+    if len(remaining_clusters) == 0:
+        print("This cluster is the entire dataset", cluster_id)
+    return bias(cluster_x, metric) - bias(remaining_clusters, metric)
+
+
+def get_max_bias(fulldata, metric, function=bias_acc):
+    """Return the most negative bias among the groups labelled in fulldata["new_clusters"]."""
+    max_bias = float("inf")  # was -999999, which the `<` test below could never beat
+    for cluster_number in fulldata["new_clusters"].unique():
+        current_bias = function(fulldata, metric, cluster_number, "new_clusters")
+        if current_bias < max_bias:
+            print("current bias: ", current_bias)
+            print("max abs bias: ", max_bias)
+            max_bias = current_bias
+    return max_bias
+
+
+def get_max_bias_cluster(fulldata, metric, function=bias_acc):
+    """Identifies cluster linked to the highest bias of the newly introduced clusters"""
+    max_bias = 100
+    min_bias = -100
+    best_cluster = -2
+    for cluster_number in fulldata["clusters"].unique():
+        current_bias = function(fulldata, metric, cluster_number, "clusters")
+        print(f"{cluster_number} has bias {current_bias}")
+
+        # Accuracy
+        if metric == "Accuracy":
+            if current_bias < max_bias:
+                max_bias = current_bias
+                best_cluster = cluster_number
+
+        # FP/FN
+        if metric == "FP" or metric == "FN":
+            if current_bias > min_bias:
+                min_bias = current_bias
+                best_cluster = cluster_number
+
+    return best_cluster
+
+
+def get_min_cluster_size(data):
+    """Size of smallest new cluster"""
+    min_cluster_size = len(data)
+    for i in data["new_clusters"].unique():
+        # exclude the cluster -1 from being seen as a cluster, since it contains outliers
+        if i == -1:
+            continue
+        size = len(data.loc[data["new_clusters"] == i])
+        if size < min_cluster_size:
+            min_cluster_size = size
+    return min_cluster_size
+
+
+def get_next_cluster(data, metric):
+    """Identifies cluster number with the highest variance. The variance is calculated based on the error metric of each cluster. The cluster with the highest variance will be selected as splitting cluster"""
+    n_cluster = max(data["clusters"])
+    highest_variance = -1
+    cluster_number = 0
+
+    for i in data["clusters"].unique():
+        if i == -1:
+            continue
+        cluster_i = data.loc[data["clusters"] == i]
+        if metric == "Accuracy":
+            variance_cluster = np.var(cluster_i["errors"])
+        if metric == "FP":
+            variance_cluster = np.var(cluster_i["FP_errors"])
+        if metric == "FN":
+            variance_cluster = np.var(cluster_i["FN_errors"])
+
+        if variance_cluster > highest_variance:
+            highest_variance = variance_cluster
+            cluster_number = i
+
+    return cluster_number
+
+
+def calculate_variance(data, metric):
+    """Return the variance of the per-cluster bias values over data["clusters"]."""
+    # One value per cluster label: bias_acc of that cluster against all other
+    # rows. The overall bias was previously recomputed per iteration but unused.
+    cluster_biases = [
+        bias_acc(data, metric, cluster_id, "clusters")
+        for cluster_id in data["clusters"].unique()
+    ]
+    return np.var(cluster_biases)
+
+
+def get_random_cluster(clusters):
+    """Return a random cluster number in 0 .. (number of distinct labels - 1)."""
+    # random.randint includes both endpoints, so the former upper bound of
+    # len(clusters.unique()) could name a cluster that no row belongs to.
+    n_labels = len(clusters.unique())
+    return random.randrange(max(n_labels, 1))  # max() keeps 0 for empty input
+
+
+def HBAC_bias_scan(
+    df, metric, split_cluster_size, acc_cluster_size, clustering_paramaters
+):
+    """Iteratively k-means-split clusters of df in place, keeping splits that pass the bias/size test."""
+    iterations_max = 20
+    x = 0  # initial cluster number
+    initial_bias = 0
+    variance_list = []
+    average_bias = bias(df, metric)
+    minimal_splittable_cluster_size = split_cluster_size
+    minimal_acceptable_cluster_size = acc_cluster_size
+    print(f"bias {metric} is: ", average_bias)
+
+    for i in range(1, iterations_max):
+        if i != 1:
+            # calculate variance for cluster
+            variance_list.append(calculate_variance(df, metric))
+
+        df["new_clusters"] = -1
+        candidate_cluster = df.loc[df["clusters"] == x].copy()  # own copy: a column is set below
+
+        if len(candidate_cluster) < minimal_splittable_cluster_size:
+            x = get_random_cluster(df["clusters"])
+            continue
+
+        # k-means clustering
+        kmeans_algo = KMeans(**clustering_paramaters).fit(
+            candidate_cluster.drop(
+                [
+                    "clusters",
+                    "new_clusters",
+                    "predicted_class",
+                    "true_class",
+                    "errors",
+                    "FP_errors",
+                    "FN_errors",
+                ],
+                axis=1,
+            )
+        )
+
+        candidate_cluster["new_clusters"] = pd.DataFrame(
+            kmeans_algo.predict(
+                candidate_cluster.drop(
+                    [
+                        "clusters",
+                        "new_clusters",
+                        "predicted_class",
+                        "true_class",
+                        "errors",
+                        "FP_errors",
+                        "FN_errors",
+                    ],
+                    axis=1,
+                )
+            ),
+            index=candidate_cluster.index,
+        )
+        df["new_clusters"] = candidate_cluster["new_clusters"].combine_first(
+            df["new_clusters"]
+        )
+
+        # find discriminated clusters
+        max_bias = get_max_bias(df, metric)
+        min_new_size = get_min_cluster_size(df)
+
+        if (max_bias <= initial_bias) & (
+            min_new_size > minimal_acceptable_cluster_size
+        ):
+            # Add new cluster (one .loc write; chained indexing may assign to a copy)
+            n_cluster = max(df["clusters"])
+            df.loc[df["new_clusters"] == 1, "clusters"] = n_cluster + 1
+
+            x = get_next_cluster(df, metric)
+            initial_bias = max_bias
+        else:
+            x = get_random_cluster(df["clusters"])
+
+    print("done")
+    return df
+
+
+def stat_df(df, discriminated_cluster, not_discriminated):
+
+    # finding difference
+    difference = (discriminated_cluster.mean()) - (not_discriminated.mean())
+    diff_dict = difference.to_dict()
+
+    # unscaling the discriminated cluster
+    unscaled_discriminated = df.loc[discriminated_cluster.index, :]
+
+    # unscaled other data
+    unscaled_remaining = df.drop(discriminated_cluster.index)
+
+    # statistical testing
+    welch_dict = {}
+    CI_dict_left = {}
+    CI_dict_right = {}
+
+    features = [
+        col
+        for col in df.columns.tolist()
+        if col
+        not in [
+            "tweet_id1",
+            "scaled_errors",
+            "predicted_class",
+            "true_class",
+            "errors",
+            "FP_errors",
+            "FN_errors",
+            "clusters",
+            "new_clusters",
+        ]
+    ]
+
+    for i in features:
+        welch_i = stats.ttest_ind(
+            unscaled_discriminated[i], unscaled_remaining[i], equal_var=False
+        )
+        res = pg.ttest(unscaled_discriminated[i], unscaled_remaining[i], paired=False)
+
+        # attach to dictionary
+        welch_dict[i] = welch_i.pvalue
+        CI_dict_left[i] = res["CI95%"][0][0]
+        CI_dict_right[i] = res["CI95%"][0][1]
+
+    # store results in dataframe
+    pd.set_option("display.float_format", lambda x: "%.5f" % x)
+    cluster_analysis_df = pd.DataFrame(
+        [diff_dict, welch_dict, CI_dict_left, CI_dict_right]
+    ).T
+    cluster_analysis_df.columns = ["difference", "p-value", "[0.025", "0.975]"]
+    cluster_analysis_df = cluster_analysis_df.sort_values("p-value", ascending=[True])
+    n_rows = cluster_analysis_df.shape[0]
+
+    # Get errors; (coef - lower bound of conf interval)
+    cluster_analysis_df["errors"] = (
+        cluster_analysis_df["difference"] - cluster_analysis_df["[0.025"]
+    )
+    cluster_analysis_df = cluster_analysis_df.iloc[0:n_rows,]
+    cluster_analysis_df["num"] = [int(i) for i in np.linspace(n_rows - 1, 0, n_rows)]
+
+    cluster_analysis_df = cluster_analysis_df.reset_index()
+
+    return cluster_analysis_df
+
+
+def CI_plot(df, x_lim, feat_ls):
+    """
+    Takes in results of Welch's t-test and returns a plot of
+    the coefficients with 95% confidence intervals.
+    """
+    n_rows = df.shape[0]
+
+    # line segments
+    lines_sign = []
+    lines_non_sign = []
+    index_ls = []
+    i = n_rows
+    for feat in feat_ls:
+        k = df[df["index"] == feat].index[0]
+        p_value = df.iloc[k, 2]
+        if p_value <= 0.05:
+            sub_ls_sign = []
+            sub_ls_sign.append((df.iloc[k, 3], i))
+            sub_ls_sign.append((df.iloc[k, 4], i))
+            lines_sign.append(sub_ls_sign)
+            index_ls.append((i, k))
+            i -= 1
+        else:
+            sub_ls_non_sign = []
+            sub_ls_non_sign.append((df.iloc[k, 3], i))
+            sub_ls_non_sign.append((df.iloc[k, 4], i))
+            lines_non_sign.append(sub_ls_non_sign)
+            index_ls.append((i, k))
+            i -= 1
+
+    fig, ax = plt.subplots(figsize=(10, 7))
+
+    # Line to define zero on the x-axis
+    ax.axvline(x=0, linestyle="--", color="black", linewidth=1)
+
+    # line segments significant
+    lc = mc.LineCollection(lines_sign, colors="steelblue", linewidths=10, alpha=0.75)
+    ax.add_collection(lc)
+    ax.autoscale()
+
+    # line segments non-significant
+    lc = mc.LineCollection(
+        lines_non_sign, colors="steelblue", linewidths=10, alpha=0.25
+    )
+    ax.add_collection(lc)
+    ax.autoscale()
+
+    # title and axes
+    plt.title("Cluster difference 95% confidence interval", fontsize=24)
+
+    # font size axes
+    ax.tick_params(axis="both", which="major", labelsize=16)
+
+    # x-axis
+    ax.set_xlabel("Difference in means", fontsize=22)
+    ax.set_xlim(x_lim)
+    xlims = ax.get_xlim()
+
+    # annotate x-axis
+    ax.annotate(
+        "Cluster mean lower than\nrest of (standardized) dataset",
+        xy=(xlims[0], -0.1),
+        xytext=(xlims[0], -0.5),
+        ha="center",
+        annotation_clip=False,
+        fontsize=14,
+        style="italic",
+    )
+    ax.annotate(
+        "Cluster mean higher than\nrest of (standardized) dataset",
+        xy=(xlims[1], -0.1),
+        xytext=(xlims[1], -0.5),
+        ha="center",
+        annotation_clip=False,
+        fontsize=14,
+        style="italic",
+    )
+
+    # y-axis
+    columns = feat_ls
+    ax.set_yticklabels([""] + columns[::-1])
+
+    # scatter plot
+    idx_ls = [i for (i, k) in index_ls]
+    scatter_ls = [df.iloc[k, 1] for (i, k) in index_ls]
+    ax.scatter(
+        y=idx_ls,
+        marker="o",
+        s=250,
+        edgecolors="none",
+        linewidth=2,
+        x=scatter_ls,
+        color="steelblue",
+    )
+
+    # legend
+    legend_elements = [
+        Line2D([0], [0], color="steelblue", alpha=0.75, lw=10, label="Significant"),
+        Line2D([0], [0], color="steelblue", alpha=0.25, lw=10, label="Not significant"),
+    ]
+    ax.legend(handles=legend_elements, loc="best", fontsize=16)
+
+    return plt.show()
+
+
+def pca_plot(data):
+    """PCA dimensionality reduction to display identified clusters as scatterplot."""
+
+    pca_features = data.drop(
+        [
+            "predicted_class",
+            "true_class",
+            "errors",
+            "FP_errors",
+            "FN_errors",
+            "clusters",
+            "new_clusters",
+        ],
+        axis=1,
+    )
+    other_features = data[
+        [
+            "predicted_class",
+            "true_class",
+            "errors",
+            "FP_errors",
+            "FN_errors",
+            "clusters",
+            "new_clusters",
+        ]
+    ]
+
+    df = pd.DataFrame(pca_features)
+    pca = pd.DataFrame(PCA(n_components=2).fit_transform(df), index=df.index)
+    temp_dataset = pca.join(other_features, how="left")
+    temp_dataset.rename(columns={0: "PCA - 1st"}, inplace=True)
+    temp_dataset.rename(columns={1: "PCA - 2nd"}, inplace=True)
+
+    scatterplot = sns.scatterplot(
+        data=temp_dataset,
+        x="PCA - 1st",
+        y="PCA - 2nd",
+        hue="clusters",
+        size="errors",
+        sizes=(150, 30),
+        palette="Set1",
+    )
+    scatterplot.set_title("HBAC bias scan (k-means) on AI classifier")
+    lgd = scatterplot.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), ncol=1)
+    plt.show()
+
+
+#     plt.savefig('./test.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
diff --git a/classifiers/Loan_approval_classifier/german_dataset.py b/classifiers/Loan_approval_classifier/german_dataset.py
new file mode 100644
index 0000000..8811704
--- /dev/null
+++ b/classifiers/Loan_approval_classifier/german_dataset.py
@@ -0,0 +1,162 @@
+import os
+import pandas as pd
+from aif360.datasets import StandardDataset
+
+
+default_mappings = {
+    "label_maps": [{0: "Good Credit", 1: "Bad Credit"}],
+    "protected_attribute_maps": [
+        {1.0: "Male", 0.0: "Female"},
+        {1.0: "Old", 0.0: "Young"},
+    ],
+}
+
+
+def default_preprocessing(df):
+    """Adds a derived sex attribute based on personal_status."""
+    # TODO: ignores the value of privileged_classes for 'sex'
+    status_map = {
+        "A91": "male",
+        "A93": "male",
+        "A94": "male",
+        "A92": "female",
+        "A95": "female",
+    }
+    df["sex"] = df["personal_status"].replace(status_map)
+
+    return df
+
+
+class GermanDataset(StandardDataset):
+    """German credit Dataset.
+
+    See :file:`aif360/data/raw/german/README.md`.
+    """
+
+    def __init__(
+        self,
+        label_name="credit",
+        favorable_classes=[0],
+        protected_attribute_names=[],
+        privileged_classes=[],
+        instance_weights_name=None,
+        categorical_features=[
+            "status",
+            "credit_history",
+            "purpose",
+            "savings",
+            "employment",
+            "other_debtors",
+            "property",
+            "installment_plans",
+            "housing",
+            "skill_level",
+            "telephone",
+            "foreign_worker",
+        ],
+        features_to_keep=[],
+        features_to_drop=["personal_status"],
+        na_values=[],
+        custom_preprocessing=default_preprocessing,
+        metadata=default_mappings,
+    ):
+        """See :obj:`StandardDataset` for a description of the arguments.
+
+        By default, this code converts the 'age' attribute to a binary value
+        where privileged is `age > 25` and unprivileged is `age <= 25` as
+        proposed by Kamiran and Calders [1]_.
+
+        References:
+            .. [1] F. Kamiran and T. Calders, "Classifying without
+               discriminating," 2nd International Conference on Computer,
+               Control and Communication, 2009.
+
+        Examples:
+            In some cases, it may be useful to keep track of a mapping from
+            `float -> str` for protected attributes and/or labels. If our use
+            case differs from the default, we can modify the mapping stored in
+            `metadata`:
+
+            >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'}
+            >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]
+            >>> gd = GermanDataset(protected_attribute_names=['sex'],
+            ... privileged_classes=[['male']], metadata={'label_map': label_map,
+            ... 'protected_attribute_maps': protected_attribute_maps})
+
+            Now this information will stay attached to the dataset and can be
+            used for more descriptive visualizations.
+        """
+
+        # change path
+        filepath = "../../data/GermanCredit_dataset/german.data"
+
+        # as given by german.doc
+        column_names = [
+            "status",
+            "month",
+            "credit_history",
+            "purpose",
+            "credit_amount",
+            "savings",
+            "employment",
+            "investment_as_income_percentage",
+            "personal_status",
+            "other_debtors",
+            "residence_since",
+            "property",
+            "age",
+            "installment_plans",
+            "housing",
+            "number_of_credits",
+            "skill_level",
+            "people_liable_for",
+            "telephone",
+            "foreign_worker",
+            "credit",
+        ]
+        try:
+            df = pd.read_csv(
+                filepath, sep=" ", header=None, names=column_names, na_values=na_values
+            )
+        except IOError as err:
+            print("IOError: {}".format(err))
+            print("To use this class, please download the following files:")
+            print(
+                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
+            )
+            print(
+                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
+            )
+            print("\nand place them, as-is, in the folder:")
+            print(
+                "\n\t{}\n".format(
+                    os.path.abspath(
+                        os.path.join(
+                            os.path.abspath(__file__),
+                            "..",
+                            "..",
+                            "data",
+                            "raw",
+                            "german",
+                        )
+                    )
+                )
+            )
+            import sys
+
+            sys.exit(1)
+
+        super(GermanDataset, self).__init__(
+            df=df,
+            label_name=label_name,
+            favorable_classes=favorable_classes,
+            protected_attribute_names=protected_attribute_names,
+            privileged_classes=privileged_classes,
+            instance_weights_name=instance_weights_name,
+            categorical_features=categorical_features,
+            features_to_keep=features_to_keep,
+            features_to_drop=features_to_drop,
+            na_values=na_values,
+            custom_preprocessing=custom_preprocessing,
+            metadata=metadata,
+        )
diff --git a/classifiers/Loan_approval_classifier/helper_functions.py b/classifiers/Loan_approval_classifier/helper_functions.py
new file mode 100644
index 0000000..392c3ae
--- /dev/null
+++ b/classifiers/Loan_approval_classifier/helper_functions.py
@@ -0,0 +1,149 @@
+import os
+import pandas as pd
+from aif360.datasets import StandardDataset
+
+
+def default_preprocessing(df):
+    # default: 1, no default: 0
+    df["credit"] = df["credit"].replace({1.0: 0, 2.0: 1})
+
+    # sex
+    # male: 0, female: 1
+    status_map = {"A91": 0, "A93": 0, "A94": 0, "A92": 1, "A95": 1}
+    df["sex"] = df["personal_status"].replace(status_map)
+
+    return df
+
+
+class GermanDataset(StandardDataset):
+    """German credit Dataset.
+    See :file:`aif360/data/raw/german/README.md`.
+    """
+
+    def __init__(
+        self,
+        label_name="credit",
+        favorable_classes=[1],
+        protected_attribute_names=["sex", "age"],
+        privileged_classes=[],
+        instance_weights_name=None,
+        categorical_features=[
+            "status",
+            "credit_history",
+            "purpose",
+            "savings",
+            "employment",
+            "other_debtors",
+            "property",
+            "installment_plans",
+            "housing",
+            "skill_level",
+            "telephone",
+            "foreign_worker",
+        ],
+        features_to_keep=[],
+        features_to_drop=["personal_status"],
+        na_values=[],
+        custom_preprocessing=default_preprocessing,
+        metadata=None,
+    ):
+        """See :obj:`StandardDataset` for a description of the arguments.
+        By default, this code converts the 'age' attribute to a binary value
+        where privileged is `age > 25` and unprivileged is `age <= 25` as
+        proposed by Kamiran and Calders [1]_.
+        References:
+            .. [1] F. Kamiran and T. Calders, "Classifying without
+               discriminating," 2nd International Conference on Computer,
+               Control and Communication, 2009.
+        Examples:
+            In some cases, it may be useful to keep track of a mapping from
+            `float -> str` for protected attributes and/or labels. If our use
+            case differs from the default, we can modify the mapping stored in
+            `metadata`:
+            >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'}
+            >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]
+            >>> gd = GermanDataset(protected_attribute_names=['sex'],
+            ... privileged_classes=[['male']], metadata={'label_map': label_map,
+            ... 'protected_attribute_maps': protected_attribute_maps})
+            Now this information will stay attached to the dataset and can be
+            used for more descriptive visualizations.
+        """
+
+        filepath = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "..",
+            "..",
+            "data",
+            "GermanCredit_dataset",
+            "german.data",
+        )
+        # as given by german.doc
+        column_names = [
+            "status",
+            "month",
+            "credit_history",
+            "purpose",
+            "credit_amount",
+            "savings",
+            "employment",
+            "investment_as_income_percentage",
+            "personal_status",
+            "other_debtors",
+            "residence_since",
+            "property",
+            "age",
+            "installment_plans",
+            "housing",
+            "number_of_credits",
+            "skill_level",
+            "people_liable_for",
+            "telephone",
+            "foreign_worker",
+            "credit",
+        ]
+        try:
+            df = pd.read_csv(
+                filepath, sep=" ", header=None, names=column_names, na_values=na_values
+            )
+        except IOError as err:
+            print("IOError: {}".format(err))
+            print("To use this class, please download the following files:")
+            print(
+                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
+            )
+            print(
+                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
+            )
+            print("\nand place them, as-is, in the folder:")
+            print(
+                "\n\t{}\n".format(
+                    os.path.abspath(
+                        os.path.join(
+                            os.path.abspath(__file__),
+                            "..",
+                            "..",
+                            "data",
+                            "raw",
+                            "german",
+                        )
+                    )
+                )
+            )
+            import sys
+
+            sys.exit(1)
+
+        super(GermanDataset, self).__init__(
+            df=df,
+            label_name=label_name,
+            favorable_classes=favorable_classes,
+            protected_attribute_names=protected_attribute_names,
+            privileged_classes=privileged_classes,
+            instance_weights_name=instance_weights_name,
+            categorical_features=categorical_features,
+            features_to_keep=features_to_keep,
+            features_to_drop=features_to_drop,
+            na_values=na_values,
+            custom_preprocessing=custom_preprocessing,
+            metadata=metadata,
+        )
diff --git a/data/GermanCredit_dataset/german_dataset.py b/data/GermanCredit_dataset/german_dataset.py
new file mode 100644
index 0000000..c904f08
--- /dev/null
+++ b/data/GermanCredit_dataset/german_dataset.py
@@ -0,0 +1,162 @@
+import os
+import pandas as pd
+from aif360.datasets import StandardDataset
+
+
+default_mappings = {
+    "label_maps": [{0: "Good Credit", 1: "Bad Credit"}],
+    "protected_attribute_maps": [
+        {1.0: "Male", 0.0: "Female"},
+        {1.0: "Old", 0.0: "Young"},
+    ],
+}
+
+
+def default_preprocessing(df):
+    """Adds a derived sex attribute based on personal_status."""
+    # TODO: ignores the value of privileged_classes for 'sex'
+    status_map = {
+        "A91": "male",
+        "A93": "male",
+        "A94": "male",
+        "A92": "female",
+        "A95": "female",
+    }
+    df["sex"] = df["personal_status"].replace(status_map)
+
+    return df
+
+
+class GermanDataset(StandardDataset):
+    """German credit Dataset.
+
+    See :file:`aif360/data/raw/german/README.md`.
+    """
+
+    def __init__(
+        self,
+        label_name="credit",
+        favorable_classes=[0],
+        protected_attribute_names=[],
+        privileged_classes=[],
+        instance_weights_name=None,
+        categorical_features=[
+            "status",
+            "credit_history",
+            "purpose",
+            "savings",
+            "employment",
+            "other_debtors",
+            "property",
+            "installment_plans",
+            "housing",
+            "skill_level",
+            "telephone",
+            "foreign_worker",
+        ],
+        features_to_keep=[],
+        features_to_drop=["personal_status"],
+        na_values=[],
+        custom_preprocessing=default_preprocessing,
+        metadata=default_mappings,
+    ):
+        """See :obj:`StandardDataset` for a description of the arguments.
+
+        By default, this code converts the 'age' attribute to a binary value
+        where privileged is `age > 25` and unprivileged is `age <= 25` as
+        proposed by Kamiran and Calders [1]_.
+
+        References:
+            .. [1] F. Kamiran and T. Calders, "Classifying without
+               discriminating," 2nd International Conference on Computer,
+               Control and Communication, 2009.
+
+        Examples:
+            In some cases, it may be useful to keep track of a mapping from
+            `float -> str` for protected attributes and/or labels. If our use
+            case differs from the default, we can modify the mapping stored in
+            `metadata`:
+
+            >>> label_map = {1.0: 'Good Credit', 0.0: 'Bad Credit'}
+            >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]
+            >>> gd = GermanDataset(protected_attribute_names=['sex'],
+            ... privileged_classes=[['male']], metadata={'label_map': label_map,
+            ... 'protected_attribute_maps': protected_attribute_maps})
+
+            Now this information will stay attached to the dataset and can be
+            used for more descriptive visualizations.
+        """
+
+        # NOTE: path is relative to the current working directory; adjust as needed
+        filepath = "./german.data"
+
+        # as given by german.doc
+        column_names = [
+            "status",
+            "month",
+            "credit_history",
+            "purpose",
+            "credit_amount",
+            "savings",
+            "employment",
+            "investment_as_income_percentage",
+            "personal_status",
+            "other_debtors",
+            "residence_since",
+            "property",
+            "age",
+            "installment_plans",
+            "housing",
+            "number_of_credits",
+            "skill_level",
+            "people_liable_for",
+            "telephone",
+            "foreign_worker",
+            "credit",
+        ]
+        try:
+            df = pd.read_csv(
+                filepath, sep=" ", header=None, names=column_names, na_values=na_values
+            )
+        except IOError as err:
+            print("IOError: {}".format(err))
+            print("To use this class, please download the following files:")
+            print(
+                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
+            )
+            print(
+                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
+            )
+            print("\nand place them, as-is, in the folder:")
+            print(
+                "\n\t{}\n".format(
+                    os.path.abspath(
+                        os.path.join(
+                            os.path.abspath(__file__),
+                            "..",
+                            "..",
+                            "data",
+                            "raw",
+                            "german",
+                        )
+                    )
+                )
+            )
+            import sys
+
+            sys.exit(1)
+
+        super(GermanDataset, self).__init__(
+            df=df,
+            label_name=label_name,
+            favorable_classes=favorable_classes,
+            protected_attribute_names=protected_attribute_names,
+            privileged_classes=privileged_classes,
+            instance_weights_name=instance_weights_name,
+            categorical_features=categorical_features,
+            features_to_keep=features_to_keep,
+            features_to_drop=features_to_drop,
+            na_values=na_values,
+            custom_preprocessing=custom_preprocessing,
+            metadata=metadata,
+        )
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
index 74619c3..131d6ea 100644
--- a/tests/test_bahc.py
+++ b/tests/test_bahc.py
@@ -5,8 +5,8 @@
 def test_shapes():
     # Checks that labels and scores have the right shapes
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = rng.rand(20)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert len(bahc.labels_) == len(X)
@@ -16,22 +16,28 @@ def test_shapes():
 def test_labels():
     # Checks that label values are between 0 and n_clusters
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = rng.rand(20)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
 
 
-# def test_cluster_sizes():
+def test_cluster_sizes():
     # Checks that cluster sizes are at least bahc_min_cluster_size
+    rng = np.random.RandomState(12)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=20)
+    bahc.fit(X, y)
+    assert np.all(np.bincount(bahc.labels_) >= bahc.bahc_min_cluster_size)
 
 
 def test_constant_metric():
     # Checks that there is only one cluster with a score of 0 if the metric is constant
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = np.full(20, rng.rand())
+    X = rng.randn(100, 10)
+    y = np.full(100, rng.randn())
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert bahc.n_clusters_ == 1
@@ -41,14 +47,14 @@ def test_constant_metric():
 def test_scores():
     # Checks that scores are computed correctly
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = rng.rand(20)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     # TODO: Check this!!!
     for i in range(bahc.n_clusters_):
-        cluster_indices = np.arange(20)[bahc.labels_ == i]
-        complement_indices = np.arange(20)[bahc.labels_ != i]
+        cluster_indices = np.arange(100)[bahc.labels_ == i]
+        complement_indices = np.arange(100)[bahc.labels_ != i]
         score = np.mean(y[complement_indices]) - np.mean(y[cluster_indices])
         assert bahc.scores_[i] == score
 
@@ -56,8 +62,8 @@ def test_scores():
 def test_scores_are_sorted():
     # Checks that scores are sorted in descending order
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = rng.rand(20)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])
@@ -66,8 +72,8 @@ def test_scores_are_sorted():
 def test_predict():
     # Checks that predict returns the same labels as fit
     rng = np.random.RandomState(12)
-    X = rng.rand(20, 10)
-    y = rng.rand(20)
+    X = rng.randn(100, 10)
+    y = rng.randn(100)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.array_equal(bahc.predict(X), bahc.labels_)
diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index ecfeb1b..0ffdaeb 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -68,8 +68,9 @@ def fit(self, X, y):
         leaves = []
         std = np.std(y)
         score = 0
-        root = ClusterNode(std, score)
+        root = ClusterNode(0, std, score)
         self.cluster_tree_ = root
+        n_nodes = 1
         # The entire dataset has a discrimination score of zero
         heap = [root]
         for _ in range(self.bahc_max_iter):
@@ -127,8 +128,9 @@ def fit(self, X, y):
                     child_score = child_scores[i]
                     # heapq implements min-heap
                     # so we have to negate std before pushing
-                    child_node = ClusterNode(-child_std, child_score)
+                    child_node = ClusterNode(n_nodes, -child_std, child_score)
                     children.append(child_node)
+                    n_nodes += 1
                 node.split(clustering_model, children)
                 self.n_clusters_ += n_children - 1
         
diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py
index 9bd032c..9433f53 100644
--- a/unsupervised_bias_detection/cluster/_cluster_node.py
+++ b/unsupervised_bias_detection/cluster/_cluster_node.py
@@ -3,9 +3,7 @@
 from typing import Self
 
 class ClusterNode:
-    _id_counter = itertools.count()
-
-    def __init__(self, neg_std: float, score: float):
+    def __init__(self, node_id: int, neg_std: float, score: float):
         """
         Initialize a node in the cluster tree.
         
@@ -14,12 +12,12 @@ def __init__(self, neg_std: float, score: float):
         label : int
             The cluster label for this node (required as all nodes start as leaves)
         """
-        self.id = next(self._id_counter)
+        self.node_id = node_id
         self.neg_std = neg_std
         self.score = score
         # The label is set to the id when the node is a leaf
         # and is set to None when the node is split
-        self.label = self.id
+        self.label = node_id
         self.clustering_model = None
         self.children = []
     
@@ -28,7 +26,7 @@ def is_leaf(self):
         return len(self.children) == 0
     
     def __lt__(self, other: Self):
-        return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.id < other.id)
+        return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.node_id < other.node_id)
     
     def split(self, clustering_model: ClusterMixin, children: list[Self]):
         """

From f58dd9bc81c43c9e1f7a16d4b83a270846710a14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 18:17:12 +0200
Subject: [PATCH 3/8] Fix bug related to node.score

---
 unsupervised_bias_detection/cluster/_bahc.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index 0ffdaeb..b67de28 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -68,7 +68,7 @@ def fit(self, X, y):
         leaves = []
         std = np.std(y)
         score = 0
-        root = ClusterNode(0, std, score)
+        root = ClusterNode(0, -std, score)
         self.cluster_tree_ = root
         n_nodes = 1
         # The entire dataset has a discrimination score of zero
@@ -80,11 +80,12 @@ def fit(self, X, y):
             # Take the cluster with the highest standard deviation of metric y
             node = heapq.heappop(heap)
             label = node.label
+            score = node.score
             cluster_indices = np.nonzero(labels == label)[0]
-            cluster = X[cluster_indices]
+            X_cluster = X[cluster_indices]
 
             clustering_model = self.clustering_cls(**self.clustering_params)
-            cluster_labels = clustering_model.fit_predict(cluster)
+            cluster_labels = clustering_model.fit_predict(X_cluster)
 
             if hasattr(clustering_model, "n_clusters_"):
                 n_children = clustering_model.n_clusters_
@@ -102,16 +103,16 @@ def fit(self, X, y):
                     leaves.append(node)
                     valid_split = False
                     break
-            
+                        
             # If all children clusters are of sufficient size, we check if the score of each child cluster is greater than or equal to the current score
             if valid_split:
                 child_scores = []
                 for child_indices in children_indices:
-                    cluster_metric = y[child_indices]
+                    y_cluster = y[child_indices]
                     complement_mask = np.ones(n_samples, dtype=bool)
                     complement_mask[child_indices] = False
-                    complement_metric = y[complement_mask]
-                    child_score = np.mean(complement_metric) - np.mean(cluster_metric)
+                    y_complement = y[complement_mask]
+                    child_score = np.mean(y_complement) - np.mean(y_cluster)
                     if child_score >= score:
                         child_scores.append(child_score)
                     else:

From 364585ed69ad0aff58500c4c575a0342892ed1cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 18:26:43 +0200
Subject: [PATCH 4/8] Revert tests to a sample of 20

---
 tests/test_bahc.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/test_bahc.py b/tests/test_bahc.py
index 131d6ea..1e353d3 100644
--- a/tests/test_bahc.py
+++ b/tests/test_bahc.py
@@ -5,8 +5,8 @@
 def test_shapes():
     # Checks that labels and scores have the right shapes
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert len(bahc.labels_) == len(X)
@@ -16,8 +16,8 @@ def test_shapes():
 def test_labels():
     # Checks that label values are between 0 and n_clusters
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
@@ -26,8 +26,8 @@ def test_labels():
 def test_cluster_sizes():
     # Checks that cluster sizes are at least bahc_min_cluster_size
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=20)
     bahc.fit(X, y)
     assert np.all(np.bincount(bahc.labels_) >= bahc.bahc_min_cluster_size)
@@ -36,8 +36,8 @@ def test_cluster_sizes():
 def test_constant_metric():
     # Checks that there is only one cluster with a score of 0 if the metric is constant
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = np.full(100, rng.randn())
+    X = rng.rand(20, 10)
+    y = np.full(20, rng.rand())
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert bahc.n_clusters_ == 1
@@ -47,14 +47,14 @@ def test_constant_metric():
 def test_scores():
     # Checks that scores are computed correctly
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     # TODO: Check this!!!
     for i in range(bahc.n_clusters_):
-        cluster_indices = np.arange(100)[bahc.labels_ == i]
-        complement_indices = np.arange(100)[bahc.labels_ != i]
+        cluster_indices = np.arange(20)[bahc.labels_ == i]
+        complement_indices = np.arange(20)[bahc.labels_ != i]
         score = np.mean(y[complement_indices]) - np.mean(y[cluster_indices])
         assert bahc.scores_[i] == score
 
@@ -62,8 +62,8 @@ def test_scores():
 def test_scores_are_sorted():
     # Checks that scores are sorted in descending order
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])
@@ -72,8 +72,8 @@ def test_scores_are_sorted():
 def test_predict():
     # Checks that predict returns the same labels as fit
     rng = np.random.RandomState(12)
-    X = rng.randn(100, 10)
-    y = rng.randn(100)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.array_equal(bahc.predict(X), bahc.labels_)

From f2534019aa1c1c78305b28437e197b8491d704b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 18:27:25 +0200
Subject: [PATCH 5/8] Fix test_cluster_sizes

---
 tests/test_bahc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_bahc.py b/tests/test_bahc.py
index 1e353d3..187fa39 100644
--- a/tests/test_bahc.py
+++ b/tests/test_bahc.py
@@ -28,7 +28,7 @@ def test_cluster_sizes():
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
-    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=20)
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=5)
     bahc.fit(X, y)
     assert np.all(np.bincount(bahc.labels_) >= bahc.bahc_min_cluster_size)
 

From f43eb580d682251f947b9cd7aed0dc08ad2675b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 23:28:25 +0200
Subject: [PATCH 6/8] Fix bug related to labels

---
 unsupervised_bias_detection/cluster/_bahc.py  | 48 +++++++++++--------
 .../cluster/_cluster_node.py                  | 10 ++--
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index b67de28..4d2d69d 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -61,17 +61,16 @@ def fit(self, X, y):
             order="C",
         )
         n_samples, _ = X.shape
-        # We start with all samples being in a single cluster
+        # We start with all samples being in a single cluster with label 0
         self.n_clusters_ = 1
-        # We assign all samples a label of zero
         labels = np.zeros(n_samples, dtype=np.uint32)
         leaves = []
+        label = 0
         std = np.std(y)
+        # The entire dataset has a discrimination score of zero
         score = 0
-        root = ClusterNode(0, -std, score)
+        root = ClusterNode(label, -std, score)
         self.cluster_tree_ = root
-        n_nodes = 1
-        # The entire dataset has a discrimination score of zero
         heap = [root]
         for _ in range(self.bahc_max_iter):
             if not heap:
@@ -100,12 +99,12 @@ def fit(self, X, y):
                 if len(child_indices) >= self.bahc_min_cluster_size:
                     children_indices.append(child_indices)
                 else:
-                    leaves.append(node)
                     valid_split = False
                     break
                         
-            # If all children clusters are of sufficient size, we check if the score of each child cluster is greater than or equal to the current score
+            # If all child clusters are of sufficient size, we check whether the score of any child cluster is greater than or equal to the current score
             if valid_split:
+                valid_split = False
                 child_scores = []
                 for child_indices in children_indices:
                     y_cluster = y[child_indices]
@@ -114,26 +113,33 @@ def fit(self, X, y):
                     y_complement = y[complement_mask]
                     child_score = np.mean(y_complement) - np.mean(y_cluster)
                     if child_score >= score:
-                        child_scores.append(child_score)
-                    else:
-                        leaves.append(node)
-                        valid_split = False
-                        break
+                        valid_split = True
+                    child_scores.append(child_score)
             
             # If the split is valid, we create the children nodes and split the current node
+            # Otherwise, we add the current node to the leaves
             if valid_split:
-                children = []
-                for i in range(n_children):
+                # TODO: Simplify this; the first child reuses the parent's label, the remaining children get new labels
+                # heapq implements a min-heap, so std is negated to pop the cluster with the highest std first
+                first_child_indices = children_indices[0]
+                first_child_std = np.std(y[first_child_indices])
+                first_child_score = child_scores[0]
+                first_child = ClusterNode(label, -first_child_std, first_child_score)
+                heapq.heappush(heap, first_child)
+                labels[first_child_indices] = label
+                children = [first_child]
+                for i in range(1, n_children):
                     child_indices = children_indices[i]
                     child_std = np.std(y[child_indices])
                     child_score = child_scores[i]
-                    # heapq implements min-heap
-                    # so we have to negate std before pushing
-                    child_node = ClusterNode(n_nodes, -child_std, child_score)
+                    child_node = ClusterNode(self.n_clusters_, -child_std, child_score)
+                    heapq.heappush(heap, child_node)
+                    labels[child_indices] = self.n_clusters_
                     children.append(child_node)
-                    n_nodes += 1
+                    self.n_clusters_ += 1
                 node.split(clustering_model, children)
-                self.n_clusters_ += n_children - 1
+            else:
+                leaves.append(node)
         
         leaves.extend(heap)
         leaf_scores = np.array([leaf.score for leaf in leaves])
@@ -145,8 +151,8 @@ def fit(self, X, y):
         label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
         label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32)
         self.labels_ = label_mapping[labels]
-        for leaf in leaves:
-            leaf.label = label_mapping[leaf.label]
+        for i in range(self.n_clusters_):
+            leaves[i].label = leaf_labels[i]
         return self
     
     def predict(self, X):
diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py
index 9433f53..d4d0398 100644
--- a/unsupervised_bias_detection/cluster/_cluster_node.py
+++ b/unsupervised_bias_detection/cluster/_cluster_node.py
@@ -3,7 +3,7 @@
 from typing import Self
 
 class ClusterNode:
-    def __init__(self, node_id: int, neg_std: float, score: float):
+    def __init__(self, label: int, neg_std: float, score: float):
         """
         Initialize a node in the cluster tree.
         
@@ -12,12 +12,9 @@ def __init__(self, node_id: int, neg_std: float, score: float):
         label : int
             The cluster label for this node (required as all nodes start as leaves)
         """
-        self.node_id = node_id
+        self.label = label
         self.neg_std = neg_std
         self.score = score
-        # The label is set to the id when the node is a leaf
-        # and is set to None when the node is split
-        self.label = node_id
         self.clustering_model = None
         self.children = []
     
@@ -26,7 +23,8 @@ def is_leaf(self):
         return len(self.children) == 0
     
     def __lt__(self, other: Self):
-        return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.node_id < other.node_id)
+        # TODO: Break ties on score before falling back to label
+        return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.label < other.label)
     
     def split(self, clustering_model: ClusterMixin, children: list[Self]):
         """

From a7ac50c8e9685a09b0c3dc833f085a283cb710f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Thu, 15 May 2025 23:31:08 +0200
Subject: [PATCH 7/8] Fix leaf labels bug

---
 unsupervised_bias_detection/cluster/_bahc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index 4d2d69d..278aa32 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -151,8 +151,8 @@ def fit(self, X, y):
         label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
         label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32)
         self.labels_ = label_mapping[labels]
-        for i in range(self.n_clusters_):
-            leaves[i].label = leaf_labels[i]
+        for leaf in leaves:
+            leaf.label = label_mapping[leaf.label]
         return self
     
     def predict(self, X):

From c052f1acf884a5bc87399f3ee44125f0df2ed54d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= <krstopro@MacBook-Pro.local>
Date: Fri, 16 May 2025 00:37:45 +0200
Subject: [PATCH 8/8] Add margin

---
 unsupervised_bias_detection/cluster/_bahc.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
index 278aa32..45f2059 100644
--- a/unsupervised_bias_detection/cluster/_bahc.py
+++ b/unsupervised_bias_detection/cluster/_bahc.py
@@ -29,11 +29,13 @@ def __init__(
         clustering_cls: Type[ClusterMixin],
         bahc_max_iter: int,
         bahc_min_cluster_size: int,
+        margin: float = 1e-5,
         **clustering_params: Any,
     ):
         self.clustering_cls = clustering_cls
         self.bahc_max_iter = bahc_max_iter
         self.bahc_min_cluster_size = bahc_min_cluster_size
+        self.margin = margin
         self.clustering_params = clustering_params
 
     def fit(self, X, y):
@@ -112,7 +114,7 @@ def fit(self, X, y):
                     complement_mask[child_indices] = False
                     y_complement = y[complement_mask]
                     child_score = np.mean(y_complement) - np.mean(y_cluster)
-                    if child_score >= score:
+                    if child_score >= score + self.margin:
                         valid_split = True
                     child_scores.append(child_score)