Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions config_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import yaml
import copy
from typing import Any, Dict, Optional
import logging

logger = logging.getLogger(__name__)


def load_yaml(path: str) -> Dict[str, Any]:
    """Load a YAML file and return its contents as a dictionary.

    Args:
        path: Filesystem path to the YAML file.

    Returns:
        The parsed contents as a dict. An empty or comment-only YAML file
        parses to ``None`` under ``yaml.safe_load``; that is coerced to an
        empty dict so the declared return type always holds.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        yaml.YAMLError: If the file contains invalid YAML.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load yields None for empty input; callers expect a dict.
    return data if data is not None else {}


def deep_merge(base: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]:
    """
    Produce a new dict combining `base` and `overrides` without mutating
    either input. Where both sides hold a dict under the same key the merge
    descends recursively; otherwise the override value wins outright.

    Example:
        base = {"a": 1, "b": {"c": 2, "d": 3}}
        overrides = {"b": {"c": 99}, "e": 5}
        result = {"a": 1, "b": {"c": 99, "d": 3}, "e": 5}
    """
    merged = copy.deepcopy(base)
    for key, override_value in overrides.items():
        current = merged.get(key)
        if isinstance(current, dict) and isinstance(override_value, dict):
            # Both sides are dicts: recurse so sibling keys in `base` survive.
            merged[key] = deep_merge(current, override_value)
        else:
            # Scalar, list, or type mismatch: override replaces wholesale.
            merged[key] = copy.deepcopy(override_value)
    return merged


def resolve_layer_config(base_config_path: str, overrides: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Load a layer's base YAML config file and apply any experiment-level overrides.

    Args:
        base_config_path: Path to the layer's own config YAML.
        overrides: Dictionary of keys to override from the experiment config.
            Nested dicts are merged recursively via `deep_merge`; all other
            values replace the base value wholesale.

    Returns:
        The merged configuration dictionary. When `overrides` is None or
        empty, the base config is returned as loaded.
    """
    base_config = load_yaml(base_config_path)
    if overrides:
        merged = deep_merge(base_config, overrides)
        # Lazy %-style args: formatting is skipped entirely when INFO is disabled.
        logger.info("Applied %d override(s) to %s", len(overrides), base_config_path)
        return merged
    return base_config
Empty file.
113 changes: 113 additions & 0 deletions data_layer/config_files/data_config_compas_carla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# The CARLA version of COMPAS, per the PROBE paper, says to treat all their variables as continuous, but that doesn't make sense?

name: "compas"
target_column: "score"
train_split: 0.7
balance_classes: false
preprocessing_strategy: "standardize" # Options: normalize, standardize, min-max, none
cache_dir: "./.data_cache" # For caching, currently we pickle and save the processed datasets
# Explicitly defines the base order of columns before encoding.
feature_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree", "race", "sex"]
# post_encoding_feat_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree", "race", "sex"]
post_encoding_feat_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree_cat_M", "c_charge_degree_cat_F", "race_cat_African-American", "race_cat_Other", "sex_cat_Female", "sex_cat_Male"]

features:
age:
short_name: "x0"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically
domain: [17, 90] # Optional: can be used for validation or scaling

two_year_recid:
short_name: "x1"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

priors_count:
short_name: "x2"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

length_of_stay:
short_name: "x3"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

c_charge_degree:
short_name: "x4"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["c_charge_degree_cat_M", "c_charge_degree_cat_F"]
impute: "mode"

race:
short_name: "x5"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["race_cat_African-American", "race_cat_Other"]
impute: "mode"

sex:
short_name: "x6"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["sex_cat_Female", "sex_cat_Male"]
impute: "mode"

score:
short_name: "y"
type: "binary"
node_type: "output"
actionability: "none"
mutability: false
parent: null
parent_short: null
encode: null
encoded_feature_names: null
impute: "drop"
domain: [0, 1]

# etc
26 changes: 17 additions & 9 deletions data_layer/data_module.py → data_layer/data_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import yaml
from typing import Dict, Tuple, List, Any
from typing import Dict, Optional, Tuple, List, Any

class DataModule:
class DataObject:
"""
A unified data ingestion and preprocessing pipeline for algorithmic recourse tasks.

Expand All @@ -16,8 +16,8 @@ class DataModule:

NOTE: this module will essentially take the place of the existing data module and dataset classes,
and all the functionality in the loadData method will be transferred here as member functions.
The "get_preprocessing()" acts like a controller that, based on confgs, will call appropriate util
funtions. (think large if else block).
The "get_preprocessing()" acts like a controller that, based on configs, will call appropriate util
functions (think of it as a large if-else block).

The attributes and util member methods can be expanded on an as-needed basis.

Expand All @@ -32,19 +32,28 @@ class DataModule:
metadata (Dict[str, Any]): Generated bounds, constraints, and structural info for features.
"""

def __init__(self, data_path: str, config_path: str):
def __init__(self, data_path: str, config_path: str = None, config_override: Optional[Dict[str, Any]] = None):
"""
Initializes the DataModule by loading the raw data and configuration.
Initializes the DataObject by loading the raw data and configuration.

Args:
data_path (str): The file path to the raw CSV dataset.
config_path (str): The file path to the YAML configuration file.
config_override (Optional[Dict[str, Any]]): Optional dictionary of config overrides.
"""
self._metadata = {}
self._raw_df = pd.read_csv(data_path)
self._processed_df = self._raw_df.copy() # This will be transformed in place through the preprocessing pipeline.
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)

if config_path is not None:
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)
else:
self._config = {}

# If a pre-merged config is given, use it entirely (it already contains overrides)
if config_override is not None:
self._config = config_override

# drop columns not defined in the config
columns_to_drop = [col for col in self._raw_df.columns if col not in self._config['features'].keys()]
Expand Down Expand Up @@ -177,7 +186,6 @@ def _apply_scaling(self) -> None:
"""
if self._config['preprocessing_strategy'] == 'normalize':
# NOTE: needs to be implemented
#scaler = normalize()
raise NotImplementedError("Normalization strategy is not yet implemented.")
elif self._config['preprocessing_strategy'] == 'standardize':
scaler = StandardScaler()
Expand Down
Loading