# Columns describing the classifier outcome; they are never scaled or clustered on.
_OUTCOME_COLUMNS = [
    "predicted_class",
    "true_class",
    "errors",
    "FP_errors",
    "FN_errors",
]
# Bookkeeping columns added by init_dataset and maintained by HBAC_bias_scan.
_CLUSTER_COLUMNS = ["clusters", "new_clusters"]
# Per-row error indicator column used for each supported metric.
_ERROR_COLUMN_BY_METRIC = {
    "Accuracy": "errors",
    "FP": "FP_errors",
    "FN": "FN_errors",
}


def init_GermanCredit_dataset(
    raw_data,
    features,
    with_errors=True,
    just_features=True,
    scale_features=True,
    with_classes=True,
):
    """Prepare the German Credit frame for HBAC; thin wrapper around init_dataset.

    The four boolean flags are accepted for call compatibility only; they have
    never influenced the result and are still ignored.
    """
    return init_dataset(raw_data, features)


def init_dataset(raw_data, features):
    """Return a scaled deep copy of ``raw_data`` with the HBAC cluster columns added.

    Every column except the classifier-outcome columns is replaced by the
    standardised values of the same column taken from ``features``. All rows
    start in cluster 0 (``clusters``) with no pending split (``new_clusters``
    set to -1). ``raw_data`` itself is not modified.
    """
    new_data = raw_data.copy(deep=True)

    # Only feature columns are scaled; the outcome columns keep their raw values.
    to_scale = new_data.drop(_OUTCOME_COLUMNS, axis=1).columns
    new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])

    new_data["clusters"] = 0
    new_data["new_clusters"] = -1
    return new_data


def bias(results, metric):
    """Return the performance score of ``results`` for ``metric`` (higher is better).

    * ``"Accuracy"``: share of rows whose ``errors`` value is 0.
    * ``"FP"``: 1 - false-positive rate (predicted 1 among rows with true class 0).
    * ``"FN"``: 1 - false-negative rate (predicted 0 among rows with true class 1).

    When the relevant denominator is empty the score is 1, so an empty frame no
    longer raises ZeroDivisionError for ``"Accuracy"``.

    Raises:
        ValueError: if ``metric`` is not one of the three supported names
            (previously ``None`` was returned silently).
    """
    if metric == "Accuracy":
        n_total = len(results)
        if n_total == 0:
            return 1
        return int((results["errors"] == 0).sum()) / n_total

    if metric == "FP":
        wrong_prediction, reference_class = 1, 0
    elif metric == "FN":
        wrong_prediction, reference_class = 0, 1
    else:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'Accuracy', 'FP' or 'FN'."
        )

    in_reference = results["true_class"] == reference_class
    n_reference = int(in_reference.sum())
    if n_reference == 0:
        return 1
    is_wrong = in_reference & (results["predicted_class"] == wrong_prediction)
    return 1 - int(is_wrong.sum()) / n_reference


def bias_acc(data, metric, cluster_id, cluster_col):
    """Return ``bias(cluster) - bias(all other rows)`` for one cluster.

    A negative value means the selected cluster scores worse than the rest of
    the data. A diagnostic line is printed when either side is empty.
    """
    in_cluster = data[cluster_col] == cluster_id
    cluster_x = data.loc[in_cluster]
    if len(cluster_x) == 0:
        print("This is an empty cluster", cluster_id)
    remaining_clusters = data.loc[~in_cluster]
    if len(remaining_clusters) == 0:
        print("This cluster is the entire dataset", cluster_id)
    return bias(cluster_x, metric) - bias(remaining_clusters, metric)


def get_max_bias(fulldata, metric, function=bias_acc):
    """Return the most negative bias among the newly introduced clusters.

    Rows labelled -1 in ``new_clusters`` are not part of the pending split and
    are skipped. Returns 0.0 when there is no new cluster at all.

    The previous implementation started from -999999 and only replaced it with
    smaller values, so it always returned -999999.
    """
    biases = [
        function(fulldata, metric, cluster_number, "new_clusters")
        for cluster_number in fulldata["new_clusters"].unique()
        if cluster_number != -1
    ]
    return min(biases, default=0.0)


def get_max_bias_cluster(fulldata, metric, function=bias_acc):
    """Return the label in ``clusters`` with the most extreme bias.

    For ``"Accuracy"`` this is the cluster with the lowest bias; for ``"FP"``
    and ``"FN"`` it is the cluster with the highest bias. -2 is returned when
    no cluster qualifies (including for an unknown metric). The bias of every
    cluster is printed.
    """
    # NOTE(review): bias() returns 1 - rate for FP/FN, so the highest bias is
    # the cluster with the *lowest* error rate; confirm this direction is intended.
    lowest_bias = 100
    highest_bias = -100
    best_cluster = -2
    pick_lowest = metric == "Accuracy"
    pick_highest = metric in ("FP", "FN")

    for cluster_number in fulldata["clusters"].unique():
        current_bias = function(fulldata, metric, cluster_number, "clusters")
        print(f"{cluster_number} has bias {current_bias}")

        if pick_lowest and current_bias < lowest_bias:
            lowest_bias = current_bias
            best_cluster = cluster_number
        elif pick_highest and current_bias > highest_bias:
            highest_bias = current_bias
            best_cluster = cluster_number

    return best_cluster


def get_min_cluster_size(data):
    """Return the size of the smallest new cluster (label -1 is ignored).

    When there is no new cluster the number of rows in ``data`` is returned.
    """
    labels = data["new_clusters"]
    sizes = [int((labels == label).sum()) for label in labels.unique() if label != -1]
    return min(sizes, default=len(data))


def get_next_cluster(data, metric):
    """Return the cluster whose per-row error indicator has the highest variance.

    The indicator column is ``errors``, ``FP_errors`` or ``FN_errors`` depending
    on ``metric``. Label -1 is skipped; 0 is returned when no cluster qualifies.

    Raises:
        ValueError: if ``metric`` is not supported (previously this surfaced as
            an unbound local variable).
    """
    try:
        error_column = _ERROR_COLUMN_BY_METRIC[metric]
    except KeyError:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'Accuracy', 'FP' or 'FN'."
        ) from None

    highest_variance = -1
    cluster_number = 0
    for label in data["clusters"].unique():
        if label == -1:
            continue
        variance_cluster = np.var(data.loc[data["clusters"] == label, error_column])
        if variance_cluster > highest_variance:
            highest_variance = variance_cluster
            cluster_number = label

    return cluster_number


def calculate_variance(data, metric):
    """Return the variance of the per-cluster biases over ``clusters``."""
    cluster_biases = [
        bias_acc(data, metric, label, "clusters")
        for label in data["clusters"].unique()
    ]
    return np.var(cluster_biases)


def get_random_cluster(clusters):
    """Return a randomly chosen existing cluster label other than -1.

    ``clusters`` is the column of cluster labels. The previous implementation
    drew from ``random.randint(0, n_unique)``, whose upper bound is inclusive,
    so it could return a label that does not exist. Returns 0 when there is no
    eligible label.
    """
    candidates = [label for label in clusters.unique() if label != -1]
    if not candidates:
        return 0
    return random.choice(candidates)


def HBAC_bias_scan(
    df, metric, split_cluster_size, acc_cluster_size, clustering_paramaters
):
    """Run the hierarchical bias-aware clustering scan with k-means.

    Starting from cluster 0, the current candidate cluster is split with
    ``KMeans(**clustering_paramaters)``. The split is kept when the most
    negative bias of the new clusters is not higher than the one accepted
    before and every new cluster has more than ``acc_cluster_size`` rows.
    Candidates with fewer than ``split_cluster_size`` rows are not split.

    ``df`` must have been prepared by ``init_dataset``; it is modified in
    place (``clusters`` / ``new_clusters`` columns) and also returned.
    """
    iterations_max = 20
    x = 0  # label of the cluster to try to split next
    initial_bias = 0
    non_feature_columns = _CLUSTER_COLUMNS + _OUTCOME_COLUMNS
    print(f"bias {metric} is: ", bias(df, metric))

    for _ in range(1, iterations_max):
        df["new_clusters"] = -1
        in_candidate = df["clusters"] == x
        candidate_cluster = df.loc[in_candidate]

        if len(candidate_cluster) < split_cluster_size:
            x = get_random_cluster(df["clusters"])
            continue

        # k-means on the feature columns of the candidate cluster only
        candidate_features = candidate_cluster.drop(non_feature_columns, axis=1)
        kmeans_algo = KMeans(**clustering_paramaters).fit(candidate_features)
        new_labels = kmeans_algo.predict(candidate_features)
        # Write through .loc on the full frame: assigning into the slice or via
        # chained indexing is not guaranteed to reach df.
        df.loc[in_candidate, "new_clusters"] = new_labels

        max_bias = get_max_bias(df, metric)
        min_new_size = get_min_cluster_size(df)

        if max_bias <= initial_bias and min_new_size > acc_cluster_size:
            # Label 0 keeps the parent's cluster id; every other new label is
            # promoted to a fresh id (for 2 clusters: label 1 -> max + 1).
            n_cluster = max(df["clusters"])
            promoted = sorted(label for label in set(new_labels) if label >= 1)
            for offset, label in enumerate(promoted, start=1):
                df.loc[df["new_clusters"] == label, "clusters"] = n_cluster + offset

            x = get_next_cluster(df, metric)
            initial_bias = max_bias
        else:
            x = get_random_cluster(df["clusters"])

    print("done")
    return df


def stat_df(df, discriminated_cluster, not_discriminated):
    """Compare the discriminated cluster with the remaining rows per feature.

    ``difference`` is the difference in column means between
    ``discriminated_cluster`` and ``not_discriminated``. For every feature
    column of ``df`` the rows at ``discriminated_cluster.index`` are tested
    against all other rows of ``df`` (Welch's t-test p-value from scipy, 95%
    confidence interval from pingouin).

    Returns a frame sorted by p-value with columns ``index`` (feature name),
    ``difference``, ``p-value``, ``[0.025``, ``0.975]``, ``errors``
    (difference minus lower bound) and ``num`` (descending row counter).

    Side effect: sets the global pandas ``display.float_format`` option.
    """
    difference = discriminated_cluster.mean() - not_discriminated.mean()
    diff_dict = difference.to_dict()

    unscaled_discriminated = df.loc[discriminated_cluster.index, :]
    unscaled_remaining = df.drop(discriminated_cluster.index)

    excluded = {"tweet_id1", "scaled_errors", *_OUTCOME_COLUMNS, *_CLUSTER_COLUMNS}
    features = [col for col in df.columns.tolist() if col not in excluded]

    welch_dict = {}
    CI_dict_left = {}
    CI_dict_right = {}
    for feature in features:
        welch = stats.ttest_ind(
            unscaled_discriminated[feature], unscaled_remaining[feature], equal_var=False
        )
        res = pg.ttest(
            unscaled_discriminated[feature], unscaled_remaining[feature], paired=False
        )
        # The result row is label-indexed, so take the first row by position.
        lower, upper = res["CI95%"].iloc[0]

        welch_dict[feature] = welch.pvalue
        CI_dict_left[feature] = lower
        CI_dict_right[feature] = upper

    pd.set_option("display.float_format", lambda x: "%.5f" % x)
    cluster_analysis_df = pd.DataFrame(
        [diff_dict, welch_dict, CI_dict_left, CI_dict_right]
    ).T
    cluster_analysis_df.columns = ["difference", "p-value", "[0.025", "0.975]"]
    cluster_analysis_df = cluster_analysis_df.sort_values("p-value", ascending=True)
    n_rows = cluster_analysis_df.shape[0]

    # Length of the lower error bar: coefficient minus lower confidence bound.
    cluster_analysis_df["errors"] = (
        cluster_analysis_df["difference"] - cluster_analysis_df["[0.025"]
    )
    # Exact integer countdown n_rows-1 .. 0 (no float truncation).
    cluster_analysis_df["num"] = list(range(n_rows - 1, -1, -1))

    return cluster_analysis_df.reset_index()


def CI_plot(df, x_lim, feat_ls):
    """Plot the difference in means with its 95% confidence interval per feature.

    ``df`` is the frame returned by ``stat_df``; ``feat_ls`` lists the feature
    names to draw, top to bottom; ``x_lim`` is passed to ``ax.set_xlim``.
    Intervals with p-value <= 0.05 are drawn opaque, the others faded.
    The figure is shown with ``plt.show()``.

    Raises:
        IndexError: if a name in ``feat_ls`` is not present in ``df["index"]``.
    """
    n_rows = df.shape[0]

    lines_sign = []
    lines_non_sign = []
    y_positions = []
    differences = []
    for offset, feat in enumerate(feat_ls):
        y = n_rows - offset
        k = df[df["index"] == feat].index[0]
        segment = [(df.loc[k, "[0.025"], y), (df.loc[k, "0.975]"], y)]
        if df.loc[k, "p-value"] <= 0.05:
            lines_sign.append(segment)
        else:
            lines_non_sign.append(segment)
        y_positions.append(y)
        differences.append(df.loc[k, "difference"])

    fig, ax = plt.subplots(figsize=(10, 7))

    # Line to define zero on the x-axis
    ax.axvline(x=0, linestyle="--", color="black", linewidth=1)

    # significant intervals opaque, non-significant ones faded
    for segments, alpha in ((lines_sign, 0.75), (lines_non_sign, 0.25)):
        ax.add_collection(
            mc.LineCollection(segments, colors="steelblue", linewidths=10, alpha=alpha)
        )
        ax.autoscale()

    plt.title("Cluster difference 95% confidence interval", fontsize=24)
    ax.tick_params(axis="both", which="major", labelsize=16)

    # x-axis
    ax.set_xlabel("Difference in means", fontsize=22)
    ax.set_xlim(x_lim)
    xlims = ax.get_xlim()

    # annotate both ends of the x-axis
    for x_pos, direction in ((xlims[0], "lower"), (xlims[1], "higher")):
        ax.annotate(
            f"Cluster mean {direction} than\nrest of (standardized) dataset",
            xy=(x_pos, -0.1),
            xytext=(x_pos, -0.5),
            ha="center",
            annotation_clip=False,
            fontsize=14,
            style="italic",
        )

    # y-axis: pin each label to the y position its interval was drawn at
    ax.set_yticks(y_positions)
    ax.set_yticklabels(list(feat_ls))

    # point estimates
    ax.scatter(
        y=y_positions,
        marker="o",
        s=250,
        edgecolors="none",
        linewidth=2,
        x=differences,
        color="steelblue",
    )

    legend_elements = [
        Line2D([0], [0], color="steelblue", alpha=0.75, lw=10, label="Significant"),
        Line2D([0], [0], color="steelblue", alpha=0.25, lw=10, label="Not significant"),
    ]
    ax.legend(handles=legend_elements, loc="best", fontsize=16)

    return plt.show()


def pca_plot(data):
    """Show the clusters as a scatter plot of the first two principal components.

    PCA is fitted on every column except the outcome and cluster columns.
    Points are coloured by ``clusters`` and sized by ``errors``.
    """
    bookkeeping = _OUTCOME_COLUMNS + _CLUSTER_COLUMNS
    pca_features = data.drop(bookkeeping, axis=1)

    components = pd.DataFrame(
        PCA(n_components=2).fit_transform(pca_features), index=pca_features.index
    )
    temp_dataset = components.join(data[bookkeeping], how="left").rename(
        columns={0: "PCA - 1st", 1: "PCA - 2nd"}
    )

    scatterplot = sns.scatterplot(
        data=temp_dataset,
        x="PCA - 1st",
        y="PCA - 2nd",
        hue="clusters",
        size="errors",
        sizes=(150, 30),
        palette="Set1",
    )
    scatterplot.set_title("HBAC bias scan (k-means) on AI classifier")
    scatterplot.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), ncol=1)
    plt.show()
default_mappings = {
    "label_maps": [{0: "Good Credit", 1: "Bad Credit"}],
    "protected_attribute_maps": [
        {1.0: "Male", 0.0: "Female"},
        {1.0: "Old", 0.0: "Young"},
    ],
}

# Column names of german.data in file order, as given by german.doc.
_GERMAN_COLUMN_NAMES = [
    "status",
    "month",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings",
    "employment",
    "investment_as_income_percentage",
    "personal_status",
    "other_debtors",
    "residence_since",
    "property",
    "age",
    "installment_plans",
    "housing",
    "number_of_credits",
    "skill_level",
    "people_liable_for",
    "telephone",
    "foreign_worker",
    "credit",
]

# Default value of the ``categorical_features`` argument.
_GERMAN_CATEGORICAL_FEATURES = (
    "status",
    "credit_history",
    "purpose",
    "savings",
    "employment",
    "other_debtors",
    "property",
    "installment_plans",
    "housing",
    "skill_level",
    "telephone",
    "foreign_worker",
)


def default_preprocessing(df):
    """Add a derived ``sex`` column ("male"/"female") based on ``personal_status``.

    The frame is modified in place and returned. Codes that are not in the
    mapping are copied unchanged.
    """
    # TODO: ignores the value of privileged_classes for 'sex'
    status_map = {
        "A91": "male",
        "A93": "male",
        "A94": "male",
        "A92": "female",
        "A95": "female",
    }
    df["sex"] = df["personal_status"].replace(status_map)

    return df


class GermanDataset(StandardDataset):
    """German credit Dataset.

    Reads ``data/GermanCredit_dataset/german.data`` (located relative to this
    source file) and hands the frame to :obj:`StandardDataset`.
    """

    def __init__(
        self,
        label_name="credit",
        favorable_classes=None,
        protected_attribute_names=None,
        privileged_classes=None,
        instance_weights_name=None,
        categorical_features=None,
        features_to_keep=None,
        features_to_drop=None,
        na_values=None,
        custom_preprocessing=default_preprocessing,
        metadata=default_mappings,
    ):
        """See :obj:`StandardDataset` for a description of the arguments.

        Arguments left at ``None`` fall back to the historical defaults:
        ``favorable_classes=[0]``, no protected attributes, no privileged
        classes, the twelve categorical columns of the data set,
        ``features_to_keep=[]``, ``features_to_drop=["personal_status"]`` and
        ``na_values=[]``. A fresh list is created on every call so instances
        never share a default.

        ``metadata`` defaults to the module-level ``default_mappings``
        (keys ``label_maps`` and ``protected_attribute_maps``). To attach a
        different mapping, pass a dict with the same keys, e.g.::

            GermanDataset(
                protected_attribute_names=["sex"],
                privileged_classes=[["male"]],
                metadata={
                    "label_maps": [{0: "Good Credit", 1: "Bad Credit"}],
                    "protected_attribute_maps": [{1.0: "Male", 0.0: "Female"}],
                },
            )

        If the data file cannot be read, download instructions are printed and
        the process exits with status 1.
        """
        # NOTE(review): the default preprocessing does not remap the `credit`
        # column; if the raw file codes it as 1/2 (as the sibling loader in
        # this repository assumes), favorable_classes=[0] matches no row -
        # confirm against the data file.
        if favorable_classes is None:
            favorable_classes = [0]
        if protected_attribute_names is None:
            protected_attribute_names = []
        if privileged_classes is None:
            privileged_classes = []
        if categorical_features is None:
            categorical_features = list(_GERMAN_CATEGORICAL_FEATURES)
        if features_to_keep is None:
            features_to_keep = []
        if features_to_drop is None:
            features_to_drop = ["personal_status"]
        if na_values is None:
            na_values = []

        # Resolve relative to this file so loading does not depend on the
        # current working directory.
        filepath = os.path.normpath(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "..",
                "..",
                "data",
                "GermanCredit_dataset",
                "german.data",
            )
        )

        try:
            df = pd.read_csv(
                filepath,
                sep=" ",
                header=None,
                names=_GERMAN_COLUMN_NAMES,
                na_values=na_values,
            )
        except IOError as err:
            print("IOError: {}".format(err))
            print("To use this class, please download the following files:")
            print(
                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
            )
            print(
                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
            )
            print("\nand place them, as-is, in the folder:")
            # Point at the folder the file is actually read from.
            print("\n\t{}\n".format(os.path.dirname(filepath)))
            import sys

            sys.exit(1)

        super().__init__(
            df=df,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata,
        )
# Column names of german.data in file order, as given by german.doc.
_GERMAN_COLUMN_NAMES = [
    "status",
    "month",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings",
    "employment",
    "investment_as_income_percentage",
    "personal_status",
    "other_debtors",
    "residence_since",
    "property",
    "age",
    "installment_plans",
    "housing",
    "number_of_credits",
    "skill_level",
    "people_liable_for",
    "telephone",
    "foreign_worker",
    "credit",
]

# Default value of the ``categorical_features`` argument.
_GERMAN_CATEGORICAL_FEATURES = (
    "status",
    "credit_history",
    "purpose",
    "savings",
    "employment",
    "other_debtors",
    "property",
    "installment_plans",
    "housing",
    "skill_level",
    "telephone",
    "foreign_worker",
)


def default_preprocessing(df):
    """Recode the label and add a numeric ``sex`` column; returns the same frame.

    * ``credit``: 1.0 -> 0 (no default), 2.0 -> 1 (default).
    * ``sex``: derived from ``personal_status``; male codes -> 0, female
      codes -> 1. Codes that are not in the mapping are copied unchanged.
    """
    # default: 1, no default: 0
    df["credit"] = df["credit"].replace({1.0: 0, 2.0: 1})

    # male: 0, female: 1
    status_map = {"A91": 0, "A93": 0, "A94": 0, "A92": 1, "A95": 1}
    df["sex"] = df["personal_status"].replace(status_map)

    return df


class GermanDataset(StandardDataset):
    """German credit Dataset.

    Reads ``data/GermanCredit_dataset/german.data`` (located relative to this
    source file) and hands the frame to :obj:`StandardDataset`.
    """

    def __init__(
        self,
        label_name="credit",
        favorable_classes=None,
        protected_attribute_names=None,
        privileged_classes=None,
        instance_weights_name=None,
        categorical_features=None,
        features_to_keep=None,
        features_to_drop=None,
        na_values=None,
        custom_preprocessing=default_preprocessing,
        metadata=None,
    ):
        """See :obj:`StandardDataset` for a description of the arguments.

        Arguments left at ``None`` fall back to the historical defaults:
        ``favorable_classes=[1]``, ``protected_attribute_names=["sex", "age"]``,
        ``privileged_classes=[]``, the twelve categorical columns of the data
        set, ``features_to_keep=[]``, ``features_to_drop=["personal_status"]``
        and ``na_values=[]``. A fresh list is created on every call so
        instances never share a default. ``metadata`` is forwarded unchanged.

        If the data file cannot be read, download instructions are printed and
        the process exits with status 1.
        """
        if favorable_classes is None:
            favorable_classes = [1]
        if protected_attribute_names is None:
            protected_attribute_names = ["sex", "age"]
        if privileged_classes is None:
            privileged_classes = []
        if categorical_features is None:
            categorical_features = list(_GERMAN_CATEGORICAL_FEATURES)
        if features_to_keep is None:
            features_to_keep = []
        if features_to_drop is None:
            features_to_drop = ["personal_status"]
        if na_values is None:
            na_values = []

        filepath = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "..",
            "data",
            "GermanCredit_dataset",
            "german.data",
        )

        try:
            df = pd.read_csv(
                filepath,
                sep=" ",
                header=None,
                names=_GERMAN_COLUMN_NAMES,
                na_values=na_values,
            )
        except IOError as err:
            print("IOError: {}".format(err))
            print("To use this class, please download the following files:")
            print(
                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
            )
            print(
                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
            )
            print("\nand place them, as-is, in the folder:")
            # Point at the folder the file is actually read from.
            print("\n\t{}\n".format(os.path.abspath(os.path.dirname(filepath))))
            import sys

            sys.exit(1)

        super().__init__(
            df=df,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata,
        )
default_mappings = {
    "label_maps": [{0: "Good Credit", 1: "Bad Credit"}],
    "protected_attribute_maps": [
        {1.0: "Male", 0.0: "Female"},
        {1.0: "Old", 0.0: "Young"},
    ],
}

# Column names of german.data in file order, as given by german.doc.
_GERMAN_COLUMN_NAMES = [
    "status",
    "month",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings",
    "employment",
    "investment_as_income_percentage",
    "personal_status",
    "other_debtors",
    "residence_since",
    "property",
    "age",
    "installment_plans",
    "housing",
    "number_of_credits",
    "skill_level",
    "people_liable_for",
    "telephone",
    "foreign_worker",
    "credit",
]

# Default value of the ``categorical_features`` argument.
_GERMAN_CATEGORICAL_FEATURES = (
    "status",
    "credit_history",
    "purpose",
    "savings",
    "employment",
    "other_debtors",
    "property",
    "installment_plans",
    "housing",
    "skill_level",
    "telephone",
    "foreign_worker",
)


def default_preprocessing(df):
    """Add a derived ``sex`` column ("male"/"female") based on ``personal_status``.

    The frame is modified in place and returned. Codes that are not in the
    mapping are copied unchanged.
    """
    # TODO: ignores the value of privileged_classes for 'sex'
    status_map = {
        "A91": "male",
        "A93": "male",
        "A94": "male",
        "A92": "female",
        "A95": "female",
    }
    df["sex"] = df["personal_status"].replace(status_map)

    return df


class GermanDataset(StandardDataset):
    """German credit Dataset.

    Reads ``german.data`` from the directory that contains this source file
    and hands the frame to :obj:`StandardDataset`.
    """

    def __init__(
        self,
        label_name="credit",
        favorable_classes=None,
        protected_attribute_names=None,
        privileged_classes=None,
        instance_weights_name=None,
        categorical_features=None,
        features_to_keep=None,
        features_to_drop=None,
        na_values=None,
        custom_preprocessing=default_preprocessing,
        metadata=default_mappings,
    ):
        """See :obj:`StandardDataset` for a description of the arguments.

        Arguments left at ``None`` fall back to the historical defaults:
        ``favorable_classes=[0]``, no protected attributes, no privileged
        classes, the twelve categorical columns of the data set,
        ``features_to_keep=[]``, ``features_to_drop=["personal_status"]`` and
        ``na_values=[]``. A fresh list is created on every call so instances
        never share a default.

        ``metadata`` defaults to the module-level ``default_mappings``
        (keys ``label_maps`` and ``protected_attribute_maps``); pass a dict
        with the same keys to attach a different mapping.

        If the data file cannot be read, download instructions are printed and
        the process exits with status 1.
        """
        # NOTE(review): the default preprocessing does not remap the `credit`
        # column; if the raw file codes it as 1/2, favorable_classes=[0]
        # matches no row - confirm against the data file.
        if favorable_classes is None:
            favorable_classes = [0]
        if protected_attribute_names is None:
            protected_attribute_names = []
        if privileged_classes is None:
            privileged_classes = []
        if categorical_features is None:
            categorical_features = list(_GERMAN_CATEGORICAL_FEATURES)
        if features_to_keep is None:
            features_to_keep = []
        if features_to_drop is None:
            features_to_drop = ["personal_status"]
        if na_values is None:
            na_values = []

        # The data file lives next to this module; resolving it from __file__
        # makes loading independent of the current working directory.
        data_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(data_dir, "german.data")

        try:
            df = pd.read_csv(
                filepath,
                sep=" ",
                header=None,
                names=_GERMAN_COLUMN_NAMES,
                na_values=na_values,
            )
        except IOError as err:
            print("IOError: {}".format(err))
            print("To use this class, please download the following files:")
            print(
                "\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
            )
            print(
                "\thttps://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc"
            )
            print("\nand place them, as-is, in the folder:")
            # Point at the folder the file is actually read from.
            print("\n\t{}\n".format(data_dir))
            import sys

            sys.exit(1)

        super().__init__(
            df=df,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata,
        )
References ---------- @@ -28,11 +29,13 @@ def __init__( clustering_cls: Type[ClusterMixin], bahc_max_iter: int, bahc_min_cluster_size: int, + margin: float = 1e-5, **clustering_params: Any, ): self.clustering_cls = clustering_cls self.bahc_max_iter = bahc_max_iter self.bahc_min_cluster_size = bahc_min_cluster_size + self.margin = margin self.clustering_params = clustering_params def fit(self, X, y): @@ -60,80 +63,91 @@ def fit(self, X, y): order="C", ) n_samples, _ = X.shape - # We start with all samples being in a single cluster + # We start with all samples being in a single cluster with label 0 self.n_clusters_ = 1 - # We assign all samples a label of zero labels = np.zeros(n_samples, dtype=np.uint32) leaves = [] - scores = [] label = 0 - root = ClusterNode(label) - self.cluster_tree_ = root + std = np.std(y) # The entire dataset has a discrimination score of zero score = 0 - heap = [(None, root, score)] + root = ClusterNode(label, -std, score) + self.cluster_tree_ = root + heap = [root] for _ in range(self.bahc_max_iter): if not heap: # If the heap is empty we stop iterating break # Take the cluster with the highest standard deviation of metric y - _, node, score = heapq.heappop(heap) + node = heapq.heappop(heap) label = node.label + score = node.score cluster_indices = np.nonzero(labels == label)[0] - cluster = X[cluster_indices] + X_cluster = X[cluster_indices] clustering_model = self.clustering_cls(**self.clustering_params) - cluster_labels = clustering_model.fit_predict(cluster) + cluster_labels = clustering_model.fit_predict(X_cluster) - # TODO: Generalize for more than 2 clusters - # Can do this by checking clustering_model.n_clusters_ (if it exists) - # or by checking the number of unique values in cluster_labels - indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]] - indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]] - if ( - len(indices0) >= self.bahc_min_cluster_size - and len(indices1) >= self.bahc_min_cluster_size - ): - # We 
calculate the discrimination scores using formula (1) in [1] - # TODO: Move y[indices0] and y[indices1] into separate variables - # to avoid recomputing them - # Maybe create a function to compute the score - mask0 = np.ones(n_samples, dtype=bool) - mask0[indices0] = False - score0 = np.mean(y[mask0]) - np.mean(y[indices0]) - mask1 = np.ones(n_samples, dtype=bool) - mask1[indices1] = False - score1 = np.mean(y[mask1]) - np.mean(y[indices1]) - if max(score0, score1) >= score: - std0 = np.std(y[indices0]) - node0 = ClusterNode(label) - # heapq implements min-heap - # so we have to negate std before pushing - heapq.heappush(heap, (-std0, node0, score0)) - std1 = np.std(y[indices1]) - node1 = ClusterNode(self.n_clusters_) - heapq.heappush(heap, (-std1, node1, score1)) - labels[indices1] = self.n_clusters_ - # TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1 - self.n_clusters_ += 1 - children = [node0, node1] - node.split(clustering_model, children) + if hasattr(clustering_model, "n_clusters_"): + n_children = clustering_model.n_clusters_ + else: + n_children = len(np.unique(cluster_labels)) + + # We first check if all child clusters meet the minimum size requirement + valid_split = True + children_indices = [] + for i in range(n_children): + child_indices = cluster_indices[np.nonzero(cluster_labels == i)[0]] + if len(child_indices) >= self.bahc_min_cluster_size: + children_indices.append(child_indices) else: - leaves.append(node) - scores.append(score) + valid_split = False + break + + # If all children clusters are of sufficient size, we check if the score of any child cluster is greater than or equal to the current score + if valid_split: + valid_split = False + child_scores = [] + for child_indices in children_indices: + y_cluster = y[child_indices] + complement_mask = np.ones(n_samples, dtype=bool) + complement_mask[child_indices] = False + y_complement = y[complement_mask] + child_score = np.mean(y_complement) - np.mean(y_cluster) + if child_score >= 
score + self.margin: + valid_split = True + child_scores.append(child_score) + + # If the split is valid, we create the children nodes and split the current node + # Otherwise, we add the current node to the leaves + if valid_split: + # TODO: Make this nicer! + # TODO: Maybe explain why we negate std before pushing to heap + first_child_indices = children_indices[0] + first_child_std = np.std(y[first_child_indices]) + first_child_score = child_scores[0] + first_child = ClusterNode(label, -first_child_std, first_child_score) + heapq.heappush(heap, first_child) + labels[first_child_indices] = label + children = [first_child] + for i in range(1, n_children): + child_indices = children_indices[i] + child_std = np.std(y[child_indices]) + child_score = child_scores[i] + child_node = ClusterNode(self.n_clusters_, -child_std, child_score) + heapq.heappush(heap, child_node) + labels[child_indices] = self.n_clusters_ + children.append(child_node) + self.n_clusters_ += 1 + node.split(clustering_model, children) else: leaves.append(node) - scores.append(score) - if heap: - # TODO: Check if this can be made more efficient - leaves.extend((node for _, node, _ in heap)) - scores = np.concatenate([scores, [score for _, _, score in heap]]) - else: - scores = np.array(scores) - + + leaves.extend(heap) + leaf_scores = np.array([leaf.score for leaf in leaves]) # We sort clusters by decreasing scores - sorted_indices = np.argsort(-scores) - self.scores_ = scores[sorted_indices] + sorted_indices = np.argsort(-leaf_scores) + self.scores_ = leaf_scores[sorted_indices] leaf_labels = np.array([leaf.label for leaf in leaves]) leaf_labels = leaf_labels[sorted_indices] label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32) diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py index 2901c9b..d4d0398 100644 --- a/unsupervised_bias_detection/cluster/_cluster_node.py +++ b/unsupervised_bias_detection/cluster/_cluster_node.py 
@@ -1,8 +1,9 @@ +import itertools from sklearn.base import ClusterMixin from typing import Self class ClusterNode: - def __init__(self, label: int): + def __init__(self, label: int, neg_std: float, score: float): """ Initialize a node in the cluster tree. @@ -12,6 +13,8 @@ def __init__(self, label: int): The cluster label for this node (required as all nodes start as leaves) """ self.label = label + self.neg_std = neg_std + self.score = score self.clustering_model = None self.children = [] @@ -19,6 +22,10 @@ def __init__(self, label: int): def is_leaf(self): return len(self.children) == 0 + def __lt__(self, other: Self): + # TODO: Use score before label + return self.neg_std < other.neg_std or (self.neg_std == other.neg_std and self.label < other.label) + def split(self, clustering_model: ClusterMixin, children: list[Self]): """ Split this node by setting its clustering model and adding children. @@ -31,7 +38,7 @@ def split(self, clustering_model: ClusterMixin, children: list[Self]): The clustering model used to split this node children : list of ClusterNode The child nodes resulting from the split - """ + """ self.label = None self.clustering_model = clustering_model self.children = children diff --git a/unsupervised_bias_detection/cluster/_kmeans.py b/unsupervised_bias_detection/cluster/_kmeans.py index e3b4aac..5906be0 100644 --- a/unsupervised_bias_detection/cluster/_kmeans.py +++ b/unsupervised_bias_detection/cluster/_kmeans.py @@ -48,12 +48,7 @@ def __init__( bahc_min_cluster_size, **kmeans_params, ): - # TODO: Remove this once we have a better way to handle the number of clusters - if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2: - raise ValueError( - f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}." 
- ) - else: + if "n_clusters" not in kmeans_params: kmeans_params["n_clusters"] = 2 if "n_init" not in kmeans_params: diff --git a/unsupervised_bias_detection/cluster/_kmodes.py b/unsupervised_bias_detection/cluster/_kmodes.py index 2fc84fa..82cbf44 100644 --- a/unsupervised_bias_detection/cluster/_kmodes.py +++ b/unsupervised_bias_detection/cluster/_kmodes.py @@ -43,12 +43,7 @@ class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin): """ def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params): - # TODO: Remove this once we have a better way to handle the number of clusters - if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2: - raise ValueError( - f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}." - ) - else: + if "n_clusters" not in kmodes_params: kmodes_params["n_clusters"] = 2 self.bahc_max_iter = bahc_max_iter