diff --git a/config_utils.py b/config_utils.py new file mode 100644 index 0000000..b5bc053 --- /dev/null +++ b/config_utils.py @@ -0,0 +1,50 @@ +import yaml +import copy +from typing import Any, Dict, Optional +import logging + +logger = logging.getLogger(__name__) + + +def load_yaml(path: str) -> Dict[str, Any]: + """Load a YAML file and return its contents as a dictionary.""" + with open(path, 'r') as f: + return yaml.safe_load(f) + + +def deep_merge(base: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]: + """ + Recursively merge `overrides` into `base`. + Values in `overrides` take precedence. Returns a new dict. + + Example: + base = {"a": 1, "b": {"c": 2, "d": 3}} + overrides = {"b": {"c": 99}, "e": 5} + result = {"a": 1, "b": {"c": 99, "d": 3}, "e": 5} + """ + result = copy.deepcopy(base) + for key, value in overrides.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = deep_merge(result[key], value) + else: + result[key] = copy.deepcopy(value) + return result + + +def resolve_layer_config(base_config_path: str, overrides: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Load a layer's base YAML config file and apply any experiment-level overrides. + + Args: + base_config_path: Path to the layer's own config YAML. + overrides: Dictionary of keys to override from the experiment config. + + Returns: + The merged configuration dictionary. 
+ """ + base_config = load_yaml(base_config_path) + if overrides: + merged = deep_merge(base_config, overrides) + logger.info(f"Applied {len(overrides)} override(s) to {base_config_path}") + return merged + return base_config \ No newline at end of file diff --git a/data_layer/data_module.py b/data_layer/data_object.py similarity index 95% rename from data_layer/data_module.py rename to data_layer/data_object.py index 8a919ee..860b483 100644 --- a/data_layer/data_module.py +++ b/data_layer/data_object.py @@ -3,9 +3,9 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import normalize import yaml -from typing import Dict, Tuple, List, Any +from typing import Dict, Optional, Tuple, List, Any -class DataModule: +class DataObject: """ A unified data ingestion and preprocessing pipeline for algorithmic recourse tasks. @@ -16,8 +16,8 @@ class DataModule: NOTE: this module will essentially take the place of the existing data module and dataset classes, and all the functionality in the loadData method will be transferred here as member functions. - The "get_preprocessing()" acts like a controller that, based on confgs, will call appropriate util - funtions. (think large if else block). + The "get_preprocessing()" acts like a controller that, based on configs, will call appropriate util + funtions. (think large if-else block). The attributes and util member methods can be expanded on a method need bases. @@ -32,19 +32,28 @@ class DataModule: metadata (Dict[str, Any]): Generated bounds, constraints, and structural info for features. """ - def __init__(self, data_path: str, config_path: str): + def __init__(self, data_path: str, config_path: str = None, config_override: Optional[Dict[str, Any]] = None): """ - Initializes the DataModule by loading the raw data and configuration. + Initializes the DataObject by loading the raw data and configuration. Args: data_path (str): The file path to the raw CSV dataset. 
config_path (str): The file path to the YAML configuration file. + config_override (Optional[Dict[str, Any]]): Optional dictionary of config overrides. """ self._metadata = {} self._raw_df = pd.read_csv(data_path) self._processed_df = self._raw_df.copy() # This will be transformed in place through the preprocessing pipeline. - with open(config_path, 'r') as file: - self._config = yaml.safe_load(file) + + if config_path is not None: + with open(config_path, 'r') as file: + self._config = yaml.safe_load(file) + else: + self._config = {} + + # If a pre-merged config is given, use it entirely (it already contains overrides) + if config_override is not None: + self._config = config_override # drop columns not defined in the config columns_to_drop = [col for col in self._raw_df.columns if col not in self._config['features'].keys()] diff --git a/evaluation_layer/distances.py b/evaluation_layer/distances.py index ecaf9fa..b2fb65f 100644 --- a/evaluation_layer/distances.py +++ b/evaluation_layer/distances.py @@ -3,9 +3,10 @@ import numpy as np import pandas as pd -from evaluation_layer.evaluation_module import EvaluationModule +from evaluation_layer.evaluation_factory import register_evaluation +from evaluation_layer.evaluation_object import EvaluationObject from evaluation_layer.utils import remove_nans -from data_layer.data_module import DataModule +from data_layer.data_object import DataObject def l0_distance(delta: np.ndarray) -> List[float]: @@ -146,13 +147,13 @@ def _get_distances( return [[d1[i], d2[i], d3[i], d4[i]] for i in range(len(d1))] - -class Distance(EvaluationModule): +@register_evaluation("Distance") +class Distance(EvaluationObject): """ Calculates the L0, L1, L2, and L-infty distance measures. 
""" - def __init__(self, data: DataModule, hyperparameters: dict = None): + def __init__(self, data: DataObject, hyperparameters: dict = None): super().__init__(data, hyperparameters) self.columns = ["L0_distance", "L1_distance", "L2_distance", "Linf_distance"] diff --git a/evaluation_layer/evaluation_factory.py b/evaluation_layer/evaluation_factory.py new file mode 100644 index 0000000..fc71506 --- /dev/null +++ b/evaluation_layer/evaluation_factory.py @@ -0,0 +1,37 @@ +from data_layer.data_object import DataObject +from evaluation_layer.evaluation_object import EvaluationObject +from typing import Dict, Any, List, Optional + +_EVAL_REGISTRY = {} + + +def register_evaluation(name: str): + """Decorator to register an evaluation metric class by name.""" + def decorator(cls): + _EVAL_REGISTRY[name] = cls + return cls + return decorator + + +def create_evaluations(metrics_config: List[Dict[str, Any]], + data: DataObject) -> List[EvaluationObject]: + """ + Instantiate all requested evaluation modules from the experiment config. + + Args: + metrics_config: List of dicts, each with "name" and optional "hyperparameters". + data: The DataObject instance. + + Returns: + List of EvaluationObject instances. + """ + evaluations = [] + for metric in metrics_config: + name = metric["name"] + hyperparams = metric.get("hyperparameters", None) + if name not in _EVAL_REGISTRY: + raise ValueError( + f"Evaluation '{name}' is not registered. 
Available: {list(_EVAL_REGISTRY.keys())}" + ) + evaluations.append(_EVAL_REGISTRY[name](data, hyperparams)) + return evaluations \ No newline at end of file diff --git a/evaluation_layer/evaluation_module.py b/evaluation_layer/evaluation_object.py similarity index 68% rename from evaluation_layer/evaluation_module.py rename to evaluation_layer/evaluation_object.py index 14c15d3..57fe797 100644 --- a/evaluation_layer/evaluation_module.py +++ b/evaluation_layer/evaluation_object.py @@ -1,16 +1,16 @@ from abc import ABC, abstractmethod import pandas as pd -from data_layer.data_module import DataModule +from data_layer.data_object import DataObject -class EvaluationModule(ABC): - def __init__(self, data: DataModule, hyperparameters: dict = None): +class EvaluationObject(ABC): + def __init__(self, data: DataObject, hyperparameters: dict = None): """ Parameters ---------- - model: - Classification model. (optional) + data: DataObject + The data object containing the processed data and metadata. hyperparameters: Dictionary with hyperparameters, could be used to pass other things. (optional) """ diff --git a/evaluation_layer/utils.py b/evaluation_layer/utils.py index c6aa994..881fea5 100644 --- a/evaluation_layer/utils.py +++ b/evaluation_layer/utils.py @@ -2,13 +2,13 @@ import pandas as pd import numpy as np -from data_layer.data_module import DataModule -from model_layer.model_module import ModelModule +from data_layer.data_object import DataObject +from model_layer.model_object import ModelObject import logging -def check_counterfactuals(model: ModelModule, - data: DataModule, +def check_counterfactuals(model: ModelObject, + data: DataObject, counterfactuals: pd.DataFrame, factual_indices: pd.Index) -> pd.DataFrame: """ @@ -19,7 +19,7 @@ def check_counterfactuals(model: ModelModule, Parameters ---------- - model: ModelModule + model: ModelObject The model module containing the trained model and its configuration. 
counterfactuals: pd.DataFrame The generated counterfactuals to be checked. diff --git a/experiment.py b/experiment.py index 116585d..cb0829e 100644 --- a/experiment.py +++ b/experiment.py @@ -1,55 +1,154 @@ -# generic example of a full end to end run of the repo -from data_layer.data_module import DataModule -from evaluation_layer.distances import Distance -from model_layer.model_module import ModelModule -from method_layer.ROAR.method import ROAR -import numpy as np +""" +usage example: python -m experiment --config_path experiment/experiment_config.yml +""" + +import argparse import pandas as pd +import numpy as np +import logging -if __name__ == "__main__": +from config_utils import load_yaml, resolve_layer_config +from data_layer.data_object import DataObject +from model_layer.model_object import ModelObject +from method_layer.method_factory import create_method +from evaluation_layer.evaluation_factory import create_evaluations + +# Force registration of all methods and evaluations +import method_layer.ROAR.method # noqa: F401 +import evaluation_layer.distances # noqa: F401 + +_DATA_RAW_PATH = { + "german": "data_layer/raw_csv/german.csv", + # add more datasets and their raw data paths here +} + +_DATA_CONFIG_PATHS = { + "german": "data_layer/config_files/data_config_german.yml", + # add more datasets and their config paths here +} + +_MODEL_CONFIG_PATHS = { + "mlp": "model_layer/model_config_mlp.yml", + # add more model types and their config paths here +} + +_METHOD_CONFIG_PATHS = { + "ROAR": "method_layer/ROAR/library/method_config.yml", + # add more method types and their config paths here +} - data_module = DataModule( - data_path="data_layer/raw_csv/german.csv", - config_path="data_layer/config_files/data_config_german.yml") + +def setup_logging(name: str): + level = getattr(logging, name.upper(), logging.INFO) + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s" + ) + + +def select_factuals(model: 
ModelObject, data: DataObject, X_test, config) -> pd.DataFrame: + num_factuals = config.get("num_factuals", 5) + factual_selection = config.get("factual_selection", "negative_class") + + if factual_selection == "negative_class": + prediction = model.predict(X_test) + neg_indices = np.where(prediction == 0)[0] # returns the indices + selected = X_test[neg_indices][:num_factuals] + elif factual_selection == "all": + prediction = model.predict(X_test) + neg_indices = np.where(prediction == 0)[0] # returns the indices + selected = X_test[neg_indices] + else: + raise ValueError(f"Unknown factual selection method {factual_selection}") - print("here is the processed data:") - print(data_module.get_processed_data().head()) + return pd.DataFrame(selected, columns=data.get_feature_names(expanded=True)) + + +def run_experiment(config_path: str): + # load the top level experiment yaml + + exp_config = load_yaml(config_path) + experiment = exp_config["experiment"] - model_module = ModelModule( - config_path="model_layer/model_config_mlp.yml", - data_module=data_module + setup_logging(experiment.get("logger", "INFO")) + + logger = logging.getLogger("experiment") + + logger.info(f"Running experiment {experiment['name']}") + + # ---------- Data layer loading and config merging ----------- + data_section = exp_config["data"] + data_config_merged = resolve_layer_config( + _DATA_CONFIG_PATHS[data_section["name"]], + data_section.get("overrides") + ) + + data_object = DataObject( + data_path=_DATA_RAW_PATH[data_section["name"]], + config_override=data_config_merged + ) + + logger.info("Data layer loaded and configured.") + + # ---------- Model layer loading and config merging ----------- + model_section = exp_config["model"] + model_config_merged = resolve_layer_config( + _MODEL_CONFIG_PATHS[model_section["name"]], + model_section.get("overrides") ) - # get model accuracy - train_accuracy = model_module.get_train_accuracy() - print(f"Model training accuracy: {train_accuracy}") - 
accuracy = model_module.get_test_accuracy() - print(f"Model test accuracy: {accuracy}") + model_object = ModelObject( + data_object=data_object, + config_override=model_config_merged + ) + + logger.info(f"Train accuracy: {model_object.get_train_accuracy():.4f}") + logger.info(f"Test accuracy: {model_object.get_test_accuracy():.4f}") + + # ---------- Select factuals for counterfactual generation ----------- + X_test, y_test = model_object.get_test_data() + factuals = select_factuals(model_object, data_object, X_test, experiment) + logger.info(f"Selected {len(factuals)} factual instances.") + + # ---------- Method layer loading and config merging ----------- + method_section = exp_config["method"] + method_config_merged = resolve_layer_config( + _METHOD_CONFIG_PATHS[method_section["name"]], + method_section.get("overrides") + ) - # test to see if ROAR method runs without error - method = ROAR(data_module, model_module) + method_object = create_method( + name=method_section["name"], + model=model_object, + data=data_object, + config_override=method_config_merged + ) - # get some factuals to generate counterfactuals for - X_test, y_test = model_module.get_test_data() + counterfactuals = method_object.get_counterfactuals(factuals) + logger.info(f"Generated counterfactuals for {len(counterfactuals)} factual instances.") - # get the first 5 rows of the processed test data as factuals - # specifically, we can the ones predicted as the negative class (label 0) - predictions = model_module.predict(X_test) - negative_class_indices = np.where(predictions == 0)[0] + # ---------- Evaluation layer loading and config merging ----------- + evaluation_section = exp_config["evaluation"] + evaluations = create_evaluations( + metrics_config=evaluation_section["metrics"], + data=data_object + ) - factuals = pd.DataFrame(X_test[negative_class_indices][:5], columns=data_module.get_feature_names(expanded=True)) + results = [] + for eval_module in evaluations: + eval_result = 
eval_module.get_evaluation(factuals, counterfactuals) + results.append(eval_result) + logger.info(f"Evaluation {eval_module.__class__.__name__} results: {eval_result}") - print("Here are the factuals we will generate counterfactuals for:") - print(factuals) - # now generate counterfactuals for these factuals using ROAR - counterfactuals = method.get_counterfactuals(factuals) - print("Here are the generated counterfactuals:") - print(counterfactuals) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run a counterfactual explanation experiment.") + parser.add_argument( + "--config_path", + type=str, + required=True, + help="Path to the experiment config YAML file.") + args = parser.parse_args() - # perform some benchmarking of the method using the evaluation module - evaluation_module = Distance(data_module) + run_experiment(args.config_path) - evaluation_results = evaluation_module.get_evaluation(factuals, counterfactuals) - print("Here are the evaluation results for the generated counterfactuals:") - print(evaluation_results) \ No newline at end of file diff --git a/experiments/experiment_roar_mlp_config.yml b/experiments/experiment_roar_mlp_config.yml new file mode 100644 index 0000000..c087bf5 --- /dev/null +++ b/experiments/experiment_roar_mlp_config.yml @@ -0,0 +1,52 @@ +# ============================================================ +# Top-Level Experiment Configuration +# ============================================================ +# This file is the ONLY thing a user needs to create/modify. +# All other layer configs can be overridden from here. 
+ +experiment: + name: "german_roar_mlp_experiment" + seed: 42 + num_factuals: 5 # how many negative-class samples to generate counterfactuals for + factual_selection: "negative_class" # Options: "negative_class", "all" + output_dir: "./results" + save_results: true + output_format: "csv" # Options: "csv", "json", "both" + logger: "info" # Options: "debug", "info", "warning", "error" + +# ---------- Data Layer ---------- +data: + name: "german" + # Override any key inside the data config without editing it directly. + # Keys here are merged ON TOP of whatever is in data_config_<name>.yml. + overrides: + train_split: 0.8 + balance_classes: false + preprocessing_strategy: "standardize" + +# ---------- Model Layer ---------- +model: + name: "mlp" # Options: "mlp", "logistic_regression", etc. + overrides: + epochs: 100 + learning_rate: 0.001 + batch_size: 1000 + hidden_layers: [[50, 100], [100, 200]] + output_activation: "sigmoid" + +# ---------- Method Layer ---------- +method: + name: "ROAR" # Options: "ROAR", "PROBE", etc. 
+ overrides: + lambda_: 0.1 + delta_max: 0.1 + loss_type: "BCE" + +# ---------- Evaluation Layer ---------- +evaluation: + metrics: + - name: "Distance" + # hyperparameters: {} # Optional per-metric hyperparameters + # Future: you could add more metric objects here + # - name: "Sparsity" + # - name: "Validity" \ No newline at end of file diff --git a/main.py b/main.py index aeafccf..fa11f5a 100644 --- a/main.py +++ b/main.py @@ -1,34 +1,55 @@ # generic example of a full end to end run of the repo -from data_layer.data_module import DataModule +from data_layer.data_object import DataObject from evaluation_layer.distances import Distance -from evaluation_layer.evaluation_module import EvaluationModule +from model_layer.model_object import ModelObject from method_layer.ROAR.method import ROAR -from model_layer.model_module import ModelModule import numpy as np import pandas as pd if __name__ == "__main__": - # Step 1: Initialize the DataModule with the path to the data config YAML - data_module = DataModule(config_path="data_config_adult.yml") - - # Step 2: Initialize the ModelModule with the path to the model config YAML and the processed DataModule - model_module = ModelModule(config_path="model_config_mlp.yml", data_module=data_module) - - # Step 3: Initialize the method module with the DataModule and ModelModule - method = ROAR(data_module, model_module) + + data_object = DataObject( + data_path="data_layer/raw_csv/german.csv", + config_path="data_layer/config_files/data_config_german.yml") - # Step 4: Make predictions on new data (example input) + print("here is the processed data:") + print(data_object.get_processed_data().head()) + + model_module = ModelObject( + config_path="model_layer/model_config_mlp.yml", + data_object=data_object + ) + + # get model accuracy + train_accuracy = model_module.get_train_accuracy() + print(f"Model training accuracy: {train_accuracy}") + accuracy = model_module.get_test_accuracy() + print(f"Model test accuracy: {accuracy}") + + 
# test to see if ROAR method runs without error + method = ROAR(data_object, model_module) + + # get some factuals to generate counterfactuals for X_test, y_test = model_module.get_test_data() + + # get the first 5 rows of the processed test data as factuals + # specifically, we take the ones predicted as the negative class (label 0) predictions = model_module.predict(X_test) negative_class_indices = np.where(predictions == 0)[0] - factuals = pd.DataFrame(X_test[negative_class_indices][:5], columns=data_module.get_feature_names(expanded=True)) + factuals = pd.DataFrame(X_test[negative_class_indices][:5], columns=data_object.get_feature_names(expanded=True)) + + print("Here are the factuals we will generate counterfactuals for:") + print(factuals) # now generate counterfactuals for these factuals using ROAR counterfactuals = method.get_counterfactuals(factuals) + print("Here are the generated counterfactuals:") + print(counterfactuals) # perform some benchmarking of the method using the evaluation module - evaluation_module = Distance(data_module) + evaluation_module = Distance(data_object) evaluation_results = evaluation_module.get_evaluation(factuals, counterfactuals) + print("Here are the evaluation results for the generated counterfactuals:") print(evaluation_results) \ No newline at end of file diff --git a/method_layer/PROBE/data_config.yml b/method_layer/PROBE/data_config.yml deleted file mode 100644 index e69de29..0000000 diff --git a/method_layer/PROBE/model_config_mlp.yml b/method_layer/PROBE/model_config_mlp.yml deleted file mode 100644 index e69de29..0000000 diff --git a/method_layer/ROAR/library/data_config.yml b/method_layer/ROAR/library/data_config.yml deleted file mode 100644 index e69de29..0000000 diff --git a/method_layer/ROAR/library/method_config.yml b/method_layer/ROAR/library/method_config.yml index 1200473..6aad76b 100644 --- a/method_layer/ROAR/library/method_config.yml +++ b/method_layer/ROAR/library/method_config.yml @@ -1,14 +1,15 @@ 
"feature_cost": Null -"lr": 0.01 -"lambda_": 0.01 -"delta_max": 0.01 +"lr": 0.001 +"lambda_": 0.1 +"delta_max": 0.1 "norm": 1 -"t_max_min": 0.5 -"loss_type": "BCE" -"y_target": [0, 1] +"t_max_min": 1 +"loss_type": "BCE" # MCE, BCE +"y_target": [0, 1] # [0, 1] if BCE, [1] if MSE "binary_cat_features": False -"loss_threshold": 1e-3 +"loss_threshold": 0.0001 "discretize": False "sample": True "lime_seed": 0 +"enforce_encoding": False "seed": 0 \ No newline at end of file diff --git a/method_layer/ROAR/library/method_utils.py b/method_layer/ROAR/library/method_utils.py index 410a178..d30d44e 100644 --- a/method_layer/ROAR/library/method_utils.py +++ b/method_layer/ROAR/library/method_utils.py @@ -40,9 +40,10 @@ def _calc_max_perturbation( (recourse, torch.ones(1, device=recourse.device)), 0 ) # Add 1 to the feature vector for intercept - loss_fn = torch.nn.BCELoss() + loss_fn = nn.BCELoss() + W.requires_grad = True - f_x_new = torch.nn.Sigmoid()(torch.matmul(W, recourse)) + f_x_new = nn.Sigmoid()(torch.matmul(W, recourse)) w_loss = loss_fn(f_x_new, target_class) gradient_w_loss = grad(w_loss, W)[0] @@ -50,7 +51,7 @@ def _calc_max_perturbation( bound = (-delta_max, delta_max) bounds = [bound] * len(gradient_w_loss) - res = linprog(c, bounds=bounds, method="simplex") + res = linprog(c, bounds=bounds, method="highs") if res.status != 0: logging.warning("Optimization with respect to delta failed to converge") @@ -68,14 +69,15 @@ def roar_recourse( cat_feature_indices: List[List[int]], # binary_cat_features: bool = True, feature_costs: Optional[List[float]] = None, - lr: float = 0.01, - lambda_param: float = 0.01, - delta_max: float = 0.01, + lr: float = 1e-3, + lambda_param: float = 0.1, + delta_max: float = 0.1, y_target: List[int] = [0, 1], - t_max_min: float = 0.5, + t_max_min: float = 1, norm: int = 1, loss_type: str = "BCE", - loss_threshold: float = 1e-3, + loss_threshold: float = 1e-4, + enforce_encoding: bool = False, seed: int = 0, ) -> np.ndarray: """ @@ -126,13 
+128,15 @@ def roar_recourse( intercept = torch.from_numpy(np.asarray([intercept])).float().to(device) x = torch.from_numpy(x).float().to(device) y_target = torch.tensor(y_target).float().to(device) - print(f"Target class for ROAR: {y_target}") + lamb = torch.tensor(lambda_param).float().to(device) + print(f"This is the value of x {x}") + # x_new is used for gradient search in optimizing process x_new = Variable(x.clone(), requires_grad=True) - optimizer = optim.Adam([x_new], lr=lr, amsgrad=True) + optimizer = optim.Adam([x_new], lr=lr) if loss_type == "MSE": if len(y_target) != 1: @@ -155,8 +159,9 @@ def roar_recourse( raise ValueError(f"loss_type {loss_type} not supported") # Placeholder values for first loop - loss = torch.tensor(0) - loss_diff = loss_threshold + 1 + loss = torch.tensor(1) + loss_diff = 1 + f_x_new = 0 t0 = datetime.datetime.now() t_max = datetime.timedelta(minutes=t_max_min) @@ -164,28 +169,31 @@ def roar_recourse( while loss_diff > loss_threshold: loss_prev = loss.clone().detach() - # x_new_enc is a copy of x_new with reconstructed encoding constraints of x_new - # such that categorical data is either 0 or 1 - # go through the list of categorical features given to us from the - # data module and use the list of encoded feature names to reconstruct the encoding constraints for the categorical features in x_new - x_new_enc = x_new.clone() - - for cat_feature_group in cat_feature_indices: - # We can reconstruct the encoding constraints by taking the argmax of the group of features to find the index of the feature that should be 1 (if any), and setting that feature to 1 and the rest to 0. 
- # print(f"Reconstructing encoding constraints for categorical feature group {cat_feature_group}") - - max_index = torch.argmax(x_new_enc[cat_feature_group[0]:cat_feature_group[-1]+1]).item() + cat_feature_group[0] # find the index of the maximum value in the group of features corresponding to the categorical feature - - # print(f"Reconstructing encoding constraints for categorical feature group {cat_feature_group}, max index: {max_index}") - for index in cat_feature_group: - if index != max_index: - x_new_enc[index] = 0 - else: - x_new_enc[index] = 1 + if enforce_encoding == True: + # x_new_enc is a copy of x_new with reconstructed encoding constraints of x_new + # such that categorical data is either 0 or 1 + # go through the list of categorical features given to us from the + # data module and use the list of encoded feature names to reconstruct the encoding constraints for the categorical features in x_new + x_new_enc = x_new.clone() + + # NOTE: This reconstruction isn't done in original code during CFX search! + # Could this lead to results not aligned with the paper? + for cat_feature_group in cat_feature_indices: + # We can reconstruct the encoding constraints by taking the argmax of the group of features to find the index of the feature that should be 1 (if any), and setting that feature to 1 and the rest to 0. 
+ # print(f"Reconstructing encoding constraints for categorical feature group {cat_feature_group}") + + max_index = torch.argmax(x_new_enc[cat_feature_group[0]:cat_feature_group[-1]+1]).item() + cat_feature_group[0] # find the index of the maximum value in the group of features corresponding to the categorical feature + + # print(f"Reconstructing encoding constraints for categorical feature group {cat_feature_group}, max index: {max_index}") + for index in cat_feature_group: + if index != max_index: + x_new_enc[index] = 0 + else: + x_new_enc[index] = 1 # Calculate max delta perturbation on weights delta_W, delta_W0 = _calc_max_perturbation( - x_new_enc.squeeze(), coeff, intercept, delta_max, target_class + x_new.squeeze(), coeff, intercept, delta_max, target_class ) delta_W, delta_W0 = ( torch.from_numpy(delta_W).float().to(device), @@ -196,18 +204,14 @@ def roar_recourse( # get the probability of the target class f_x_new = nn.Sigmoid()( - torch.matmul(coeff + delta_W, x_new_enc.squeeze()) + intercept + delta_W0 + torch.matmul(coeff + delta_W, x_new.squeeze()) + intercept + delta_W0 ).squeeze() if loss_type == "MSE": # single logit score for the target class for MSE loss f_x_new = torch.log(f_x_new / (1 - f_x_new)) - cost = ( - torch.dist(x_new_enc, x, norm) - # if feature_costs is None - # else torch.norm(feature_costs * (x_new_enc - x), norm) - ) + cost = torch.dist(x_new, x, norm) loss = loss_fn(f_x_new, target_class) + lamb * cost loss.backward() @@ -220,4 +224,4 @@ def roar_recourse( logging.info("Timeout - ROAR didn't converge") break - return x_new_enc.cpu().detach().numpy() #.squeeze(axis=0) \ No newline at end of file + return x_new.cpu().detach().numpy() #.squeeze(axis=0) \ No newline at end of file diff --git a/method_layer/ROAR/library/model_config_mlp.yml b/method_layer/ROAR/library/model_config_mlp.yml deleted file mode 100644 index e69de29..0000000 diff --git a/method_layer/ROAR/method.py b/method_layer/ROAR/method.py index 88884de..141f500 100644 
--- a/method_layer/ROAR/method.py +++ b/method_layer/ROAR/method.py @@ -1,32 +1,41 @@ import pandas as pd import numpy as np -from typing import Optional, Tuple +from typing import Any, Dict, Dict, Optional, Tuple from lime.lime_tabular import LimeTabularExplainer from sklearn.linear_model import LogisticRegression import yaml -from data_layer.data_module import DataModule +from data_layer.data_object import DataObject from evaluation_layer.utils import check_counterfactuals from method_layer.ROAR.library.method_utils import roar_recourse -from method_layer.method_module import MethodModule -from model_layer.model_module import ModelModule +from method_layer.method_factory import register_method +from method_layer.method_object import MethodObject +from model_layer.model_object import ModelObject +from config_utils import deep_merge import logging -class ROAR(MethodModule): + +@register_method("ROAR") +class ROAR(MethodObject): """ Implementation of ROAR [1]_. .. [1] Upadhyay, S., Joshi, S., & Lakkaraju, H. (2021). Towards Robust and Reliable Algorithmic Recourse. NeurIPS. 
""" - def __init__(self, data: DataModule, - model: ModelModule, + def __init__(self, data: DataObject, + model: ModelObject, coeffs: Optional[np.ndarray] = None, - intercepts: Optional[np.ndarray] = None): - super().__init__(data, model) + intercepts: Optional[np.ndarray] = None, + config_override: Optional[Dict[str, Any]] = None): + super().__init__(data, model, config_override=config_override) # get configs from config file self.config = yaml.safe_load(open("method_layer/ROAR/library/method_config.yml", 'r')) + + # merge configs with user specified, if they exist + if self._config_override is not None: + self.config = deep_merge(self.config, self._config_override) # store the feature ordering self._feature_order = self._data.get_feature_names(expanded=True) # ensure the feature ordering is correct for the model input @@ -44,6 +53,7 @@ def __init__(self, data: DataModule, self._discretize = self.config['discretize'] self._sample = self.config['sample'] self._lime_seed = self.config['lime_seed'] + self._enforce_encoding = self.config['enforce_encoding'] self._seed = self.config['seed'] self._coeffs = coeffs @@ -121,6 +131,7 @@ def get_counterfactuals(self, factuals: pd.DataFrame): t_max_min=self._t_max_min, loss_type=self._loss_type, loss_threshold=self._loss_threshold, + enforce_encoding=self._enforce_encoding, seed=self._seed, ) cfs.append(counterfactual) @@ -162,7 +173,6 @@ def _get_lime_coefficients(self, factuals: pd.DataFrame) -> Tuple[np.ndarray, np factual, self._model.predict_proba, num_features=len(self._data.get_feature_names(expanded=True)), - # model_regressor=LogisticRegression() ) intercepts.append(explanations.intercept[1]) diff --git a/method_layer/method_factory.py b/method_layer/method_factory.py new file mode 100644 index 0000000..5e854aa --- /dev/null +++ b/method_layer/method_factory.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, Optional + +from data_layer.data_object import DataObject +from method_layer.method_object import MethodObject 
+from model_layer.model_object import ModelObject + + +_METHOD_REGISTRY = {} + + +def register_method(name: str): + """Decorator to register a method class by name.""" + def decorator(cls): + _METHOD_REGISTRY[name.upper()] = cls + return cls + return decorator + + +def create_method(name: str, + data: DataObject, + model: ModelObject, + config_override: Optional[Dict[str, Any]] = None) -> MethodObject: + """ + Factory function to instantiate a counterfactual method by name. + + Args: + name: The method name (e.g., "ROAR", "PROBE"). + data: The DataObject instance. + model: The ModelObject instance. + config_override: Pre-merged method config to inject. + + Returns: + An instance of the requested MethodObject subclass. + """ + name_upper = name.upper() + if name_upper not in _METHOD_REGISTRY: + raise ValueError( + f"Method '{name}' is not registered. Available: {list(_METHOD_REGISTRY.keys())}" + ) + method_cls = _METHOD_REGISTRY[name_upper] + return method_cls(data, model, config_override=config_override) + diff --git a/method_layer/method_module.py b/method_layer/method_object.py similarity index 57% rename from method_layer/method_module.py rename to method_layer/method_object.py index bf3ae7b..dc04c25 100644 --- a/method_layer/method_module.py +++ b/method_layer/method_object.py @@ -1,20 +1,21 @@ from abc import ABC, abstractmethod +from typing import Any, Dict, Optional import pandas as pd -from data_layer.data_module import DataModule -from model_layer.model_module import ModelModule +from data_layer.data_object import DataObject +from model_layer.model_object import ModelObject -class MethodModule(ABC): +class MethodObject(ABC): """ Abstract class to implement custom recourse methods for a given black-box-model. Parameters ---------- - data: data_layer.DataModule - The data module containing the processed data and metadata. - model: model_layer.ModelModule + data: data_layer.DataObject + The data object containing the processed data and metadata. 
+    model: model_layer.ModelObject The model module containing the trained model and its configuration. Methods @@ -24,9 +25,10 @@ class MethodObject: """ - def __init__(self, data: DataModule, model: ModelModule): + def __init__(self, data: DataObject, model: ModelObject, config_override: Optional[Dict[str, Any]] = None): self._data = data self._model = model + self._config_override = config_override @abstractmethod def get_counterfactuals(self, factuals: pd.DataFrame): diff --git a/model_layer/model_module.py b/model_layer/model_object.py similarity index 84% rename from model_layer/model_module.py rename to model_layer/model_object.py index 16ba224..e33cb75 100644 --- a/model_layer/model_module.py +++ b/model_layer/model_object.py @@ -1,12 +1,12 @@ import yaml -from typing import Any, List, Union +from typing import Any, Dict, List, Optional, Union import pandas as pd import numpy as np import torch -from data_layer.data_module import DataModule +from data_layer.data_object import DataObject from model_layer.model_builder import PyTorchNeuralNetwork # make use of the existing wrapper class for pytorch models, we can add more wrapper classes for other backends as needed. -class ModelModule: +class ModelObject: """ A decoupled model instantiation and routing layer. @@ -21,24 +21,28 @@ class ModelModule: - config: The parsed YAML configuration dictionary for model architecture and training hyperparameters. """ - def __init__(self, config_path: str, data_module: DataModule): + def __init__(self, config_path: Optional[str] = None, data_object: Optional[DataObject] = None, config_override: Optional[Dict[str, Any]] = None): """ - Initializes the ModelModule without redundantly loading raw data. + Initializes the ModelObject without redundantly loading raw data. Args: config_path (str): Path to the model configuration YAML. 
- data_module (DataModule): The instantiated data layer containing + data_object (DataObject): The instantiated data layer containing the processed data, feature ordering, and bounds. """ - self._data_module = data_module - self._config = yaml.safe_load(open(config_path, 'r')) + self._data_object = data_object + self._config = yaml.safe_load(open(config_path, 'r')) if config_path is not None else {} self._device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # If a pre-merged config is given, use it entirely (it already contains overrides) + if config_override is not None: + self._config = config_override + self._instantiate_model() # Dynamically instantiate the model based on the config self._model.to(self._device) # Move model to GPU if available - # get training data from the data module and fit the model - X_train, X_test, y_train, y_test = self._data_module.get_train_test_split() + # get training data from the data object and fit the model + X_train, X_test, y_train, y_test = self._data_object.get_train_test_split() self._x_train = X_train self._y_train = y_train @@ -52,14 +56,14 @@ def _instantiate_model(self) -> None: Maps the requested architecture and backend from the YAML config to the corresponding wrapper class (e.g., PyTorchNeuralNetwork, XGBClassifier). - Dynamically fetches input dimensions directly via `self.data_module.get_feature_names(expanded=True)` + Dynamically fetches input dimensions directly via `self.data_object.get_feature_names(expanded=True)` to ensure the input layer precisely matches the encoded dataset. 
""" architecture = self._config['architecture'] backend = self._config['backend'] params = { - "n_inputs" : len(self._data_module.get_feature_names(expanded=True)), # Dynamically determine input size + "n_inputs" : len(self._data_object.get_feature_names(expanded=True)), # Dynamically determine input size "n_outputs" : self._config.get('n_output', 2), # Default to 2 for binary classification, can be overridden in config "layers" : self._config['hidden_layers'], # describes the number of input and output neurons in each hidden layer, e.g., [[10,100], [100,10]] for two hidden layers with 10 neurons each "batch_size" : self._config.get('batch_size', 1000), @@ -92,9 +96,9 @@ def get_train_accuracy(self) -> float: are ordered correctly according to the DataModule's specifications before making predictions and calculating accuracy. """ - # ensure X_train is in the correct feature order as specified by the DataModule + # ensure X_train is in the correct feature order as specified by the DataObject if isinstance(self._x_train, pd.DataFrame): - feature_names = self._data_module.get_feature_names(expanded=True) + feature_names = self._data_object.get_feature_names(expanded=True) self._x_train = self._x_train[feature_names].values # reorder columns to match the expected feature order predictions = self.predict(self._x_train) @@ -110,9 +114,9 @@ def get_test_accuracy(self) -> float: are ordered correctly according to the DataModule's specifications before making predictions and calculating accuracy. 
""" - # ensure X_test is in the correct feature order as specified by the DataModule + # ensure X_test is in the correct feature order as specified by the DataObject if isinstance(self._x_test, pd.DataFrame): - feature_names = self._data_module.get_feature_names(expanded=True) + feature_names = self._data_object.get_feature_names(expanded=True) self._x_test = self._x_test[feature_names].values # reorder columns to match the expected feature order predictions = self.predict(self._x_test) @@ -124,15 +128,15 @@ def predict(self, x: Union[np.ndarray, pd.DataFrame, torch.Tensor]) -> Union[np. Returns raw predictions in the correct format for counterfactual search algorithms. This method ensures that the input features are ordered according to the - DataModule's specifications before passing them to the underlying model. + DataObject's specifications before passing them to the underlying model. The output is returned in a consistent format (e.g., numpy array or tensor) regardless of the backend. """ - # ensure input is in tensor format for PyTorch models, and in the correct feature order as specified by the DataModule + # ensure input is in tensor format for PyTorch models, and in the correct feature order as specified by the DataObject # should return a list of 1s or 0s. if isinstance(x, pd.DataFrame): - feature_names = self._data_module.get_feature_names(expanded=True) + feature_names = self._data_object.get_feature_names(expanded=True) x = x[feature_names].values # reorder columns to match the expected feature order x_tensor = torch.tensor(x, dtype=torch.float32, device=self._device) @@ -154,9 +158,9 @@ def predict_proba(self, x: Union[np.ndarray, pd.DataFrame, torch.Tensor]) -> Uni Automatically enforces the correct feature input order before passing data to the underlying model. 
""" - # ensure input is in tensor format for PyTorch models, and in the correct feature order as specified by the DataModule + # ensure input is in tensor format for PyTorch models, and in the correct feature order as specified by the DataObject if isinstance(x, pd.DataFrame): - feature_names = self._data_module.get_feature_names(expanded=True) + feature_names = self._data_object.get_feature_names(expanded=True) x = x[feature_names].values # reorder columns to match the expected feature order x_tensor = torch.tensor(x, dtype=torch.float32, device=self._device)