Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions config_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import yaml
import copy
from typing import Any, Dict, Optional
import logging

logger = logging.getLogger(__name__)


def load_yaml(path: str) -> Dict[str, Any]:
    """Load a YAML file and return its contents as a dictionary.

    Args:
        path: Filesystem path to the YAML file.

    Returns:
        The parsed contents as a dict. An empty or comment-only YAML file
        parses to ``None`` under ``yaml.safe_load``; that is coerced to an
        empty dict so the declared return type always holds.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        yaml.YAMLError: If the file contains invalid YAML.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load yields None for empty input; callers expect a dict.
    return data if data is not None else {}


def deep_merge(base: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]:
    """
    Produce a new dict combining `base` and `overrides` without mutating
    either input. Where both sides hold a dict under the same key the merge
    descends recursively; otherwise the override value wins outright.

    Example:
        base = {"a": 1, "b": {"c": 2, "d": 3}}
        overrides = {"b": {"c": 99}, "e": 5}
        result = {"a": 1, "b": {"c": 99, "d": 3}, "e": 5}
    """
    merged = copy.deepcopy(base)
    for key, override_value in overrides.items():
        current = merged.get(key)
        if isinstance(current, dict) and isinstance(override_value, dict):
            # Both sides are dicts: recurse so sibling keys in `base` survive.
            merged[key] = deep_merge(current, override_value)
        else:
            # Scalar, list, or type mismatch: override replaces wholesale.
            merged[key] = copy.deepcopy(override_value)
    return merged


def resolve_layer_config(base_config_path: str, overrides: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Load a layer's base YAML config file and apply any experiment-level overrides.

    Args:
        base_config_path: Path to the layer's own config YAML.
        overrides: Dictionary of keys to override from the experiment config.
            Nested dicts are merged recursively via `deep_merge`; all other
            values replace the base value wholesale.

    Returns:
        The merged configuration dictionary. When `overrides` is None or
        empty, the base config is returned as loaded.
    """
    base_config = load_yaml(base_config_path)
    if overrides:
        merged = deep_merge(base_config, overrides)
        # Lazy %-style args: formatting is skipped entirely when INFO is disabled.
        logger.info("Applied %d override(s) to %s", len(overrides), base_config_path)
        return merged
    return base_config
Empty file.
113 changes: 113 additions & 0 deletions data_layer/config_files/data_config_compas_carla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# The CARLA version of COMPAS, per the PROBE paper, says to treat all their variables as continuous, but that doesn't make sense?

name: "compas"
target_column: "score"
train_split: 0.7
balance_classes: false
preprocessing_strategy: "standardize" # Options: normalize, standardize, min-max, none
cache_dir: "./.data_cache" # For caching, currently we pickle and save the processed datasets
# Explicitly defines the base order of columns before encoding.
feature_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree", "race", "sex"]
# post_encoding_feat_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree", "race", "sex"]
post_encoding_feat_order: ["age", "two_year_recid", "priors_count", "length_of_stay", "c_charge_degree_cat_M", "c_charge_degree_cat_F", "race_cat_African-American", "race_cat_Other", "sex_cat_Female", "sex_cat_Male"]

features:
age:
short_name: "x0"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically
domain: [17, 90] # Optional: can be used for validation or scaling

two_year_recid:
short_name: "x1"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

priors_count:
short_name: "x2"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

length_of_stay:
short_name: "x3"
type: "numerical"
node_type: "input"
actionability: "same-or-increase"
mutability: true
parent: null
parent_short: null
encode: null # Numeric, no encoding needed
encoded_feature_names: null
impute: "median" # Handle missing values dynamically

c_charge_degree:
short_name: "x4"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["c_charge_degree_cat_M", "c_charge_degree_cat_F"]
impute: "mode"

race:
short_name: "x5"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["race_cat_African-American", "race_cat_Other"]
impute: "mode"

sex:
short_name: "x6"
type: "categorical"
node_type: "input"
actionability: "any"
mutability: true
parent: null
parent_short: null
encode: "one-hot"
encoded_feature_names: ["sex_cat_Female", "sex_cat_Male"]
impute: "mode"

score:
short_name: "y"
type: "binary"
node_type: "output"
actionability: "none"
mutability: false
parent: null
parent_short: null
encode: null
encoded_feature_names: null
impute: "drop"
domain: [0, 1]

# etc
26 changes: 17 additions & 9 deletions data_layer/data_module.py → data_layer/data_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import yaml
from typing import Dict, Tuple, List, Any
from typing import Dict, Optional, Tuple, List, Any

class DataModule:
class DataObject:
"""
A unified data ingestion and preprocessing pipeline for algorithmic recourse tasks.

Expand All @@ -16,8 +16,8 @@ class DataModule:

NOTE: this module will essentially take the place of the existing data module and dataset classes,
and all the functionality in the loadData method will be transferred here as member functions.
The "get_preprocessing()" acts like a controller that, based on confgs, will call appropriate util
funtions. (think large if else block).
The "get_preprocessing()" acts like a controller that, based on configs, will call appropriate util
functions (think of it as a large if-else block).

The attributes and util member methods can be expanded on an as-needed basis.

Expand All @@ -32,19 +32,28 @@ class DataModule:
metadata (Dict[str, Any]): Generated bounds, constraints, and structural info for features.
"""

def __init__(self, data_path: str, config_path: str):
def __init__(self, data_path: str, config_path: str = None, config_override: Optional[Dict[str, Any]] = None):
"""
Initializes the DataModule by loading the raw data and configuration.
Initializes the DataObject by loading the raw data and configuration.

Args:
data_path (str): The file path to the raw CSV dataset.
config_path (str): The file path to the YAML configuration file.
config_override (Optional[Dict[str, Any]]): Optional dictionary of config overrides.
"""
self._metadata = {}
self._raw_df = pd.read_csv(data_path)
self._processed_df = self._raw_df.copy() # This will be transformed in place through the preprocessing pipeline.
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)

if config_path is not None:
with open(config_path, 'r') as file:
self._config = yaml.safe_load(file)
else:
self._config = {}

# If a pre-merged config is given, use it entirely (it already contains overrides)
if config_override is not None:
self._config = config_override

# drop columns not defined in the config
columns_to_drop = [col for col in self._raw_df.columns if col not in self._config['features'].keys()]
Expand Down Expand Up @@ -177,7 +186,6 @@ def _apply_scaling(self) -> None:
"""
if self._config['preprocessing_strategy'] == 'normalize':
# NOTE: needs to be implemented
#scaler = normalize()
raise NotImplementedError("Normalization strategy is not yet implemented.")
elif self._config['preprocessing_strategy'] == 'standardize':
scaler = StandardScaler()
Expand Down
Loading