diff --git a/alphapy/alphapy_main.py b/alphapy/alphapy_main.py
index 9053a5b..988bce9 100644
--- a/alphapy/alphapy_main.py
+++ b/alphapy/alphapy_main.py
@@ -169,6 +169,7 @@ def training_pipeline(alphapy_specs, model):
     drop = model.specs['drop']
     extension = model.specs['extension']
     fs_lofo = model.specs['fs_lofo']
+    fs_lofo_algorithms = model.specs.get('fs_lofo_algorithms', [])
     fs_univariate = model.specs['fs_univariate']
     group_id = model.specs['rank_group_id']
     grid_search = model.specs['grid_search']
@@ -358,7 +359,7 @@ def training_pipeline(alphapy_specs, model):
             failed_algos.append(algo)
         if est is not None:
             # select LOFO features
-            if fs_lofo:
+            if fs_lofo and (not fs_lofo_algorithms or algo in fs_lofo_algorithms):
                 select_features_lofo(model, algo, est)
             # run classic train/test model pipeline
             model = first_fit(model, algo, est)
diff --git a/alphapy/model.py b/alphapy/model.py
index 341883f..3fca072 100644
--- a/alphapy/model.py
+++ b/alphapy/model.py
@@ -308,6 +308,7 @@ def get_model_config(directory='.'):
     specs['iso_neighbors'] = features_section['isomap']['neighbors']
     # LOFO
     specs['fs_lofo'] = features_section['lofo']['option']
+    specs['fs_lofo_algorithms'] = features_section['lofo'].get('algorithms', [])
     # log transformation
     specs['log_transform'] = features_section['log_transform']['option']
     specs['pvalue_level'] = features_section['log_transform']['pvalue_level']
@@ -455,6 +456,7 @@ def get_model_config(directory='.'):
     logger.info('factors           = %s', specs['factors'])
     logger.info('features [X]      = %s', specs['features'])
     logger.info('fs_lofo           = %r', specs['fs_lofo'])
+    logger.info('fs_lofo_algorithms= %r', specs['fs_lofo_algorithms'])
     logger.info('fs_univariate     = %r', specs['fs_univariate'])
     logger.info('fs_uni_grid       = %s', specs['fs_uni_grid'])
     logger.info('fs_uni_pct        = %d', specs['fs_uni_pct'])
@@ -674,6 +676,8 @@ def save_feature_map(model):
     # Save model object
 
     logger.info("Writing feature map to %s", full_path)
+    if model.lofo_df:
+        model.feature_map['lofo_importances'] = model.lofo_df.copy()
     joblib.dump(model.feature_map, full_path)
 
 
@@ -681,6 +685,43 @@ def save_feature_map(model):
 # Function first_fit
 #
 
+def get_feature_selection_support(model, algo):
+    r"""Get the effective pre-model feature selection support for an algorithm.
+
+    Feature selection is staged in AlphaPy. Univariate selection is global, LOFO
+    is algorithm-specific and computed after univariate selection, and RFE is
+    handled separately in ``model.support``. This helper returns the feature mask
+    that should be applied before fitting or predicting with the base estimator.
+    If LOFO rejects every feature, fall back to the univariate mask so the
+    estimator never receives an empty matrix.
+    """
+
+    try:
+        support = model.feature_map['support_lofo', algo]
+        if np.any(support):
+            return support
+    except KeyError:
+        pass
+
+    try:
+        support = model.feature_map['support_uni']
+        if np.any(support):
+            return support
+    except KeyError:
+        pass
+
+    return None
+
+
+def apply_feature_selection_support(model, algo, X):
+    r"""Apply the effective pre-model feature selection support to ``X``."""
+
+    support = get_feature_selection_support(model, algo)
+    if support is None:
+        return X
+    return X[:, support]
+
+
 def first_fit(model, algo, est):
     r"""Fit the model before optimization.
 
@@ -725,7 +766,7 @@ def first_fit(model, algo, est):
 
     # Extract model data.
 
-    X_train = model.X_train
+    X_train = apply_feature_selection_support(model, algo, model.X_train)
     y_train = model.y_train
     groups_train = model.groups_train
 
@@ -784,7 +825,11 @@ def first_fit(model, algo, est):
     model.estimators[algo] = est
 
     # Copy feature name master into feature names per algorithm
-    model.fnames_algo[algo] = model.feature_names
+    support = get_feature_selection_support(model, algo)
+    if support is None:
+        model.fnames_algo[algo] = model.feature_names
+    else:
+        model.fnames_algo[algo] = np.array(model.feature_names)[support].tolist()  # plain str, not numpy.str_
 
     # Record importances and coefficients if necessary.
 
@@ -840,13 +885,14 @@ def make_predictions(model, algo):
 
     # Extract model data
 
+    X_train = apply_feature_selection_support(model, algo, model.X_train)
+    X_test = apply_feature_selection_support(model, algo, model.X_test)
     try:
         support = model.support[algo]
-        X_train = model.X_train[:, support]
-        X_test = model.X_test[:, support]
-    except:
-        X_train = model.X_train
-        X_test = model.X_test
+        X_train = X_train[:, support]
+        X_test = X_test[:, support]
+    except KeyError:  # no RFE mask for this algorithm; slicing errors must surface
+        pass
     y_train = model.y_train
     groups_train = model.groups_train
 
diff --git a/alphapy/optimize.py b/alphapy/optimize.py
index 0adf3c0..163132c 100644
--- a/alphapy/optimize.py
+++ b/alphapy/optimize.py
@@ -27,6 +27,7 @@
 #
 
 from alphapy.globals import ModelType
+from alphapy.model import apply_feature_selection_support
 
 from contextlib import contextmanager
 from datetime import datetime
@@ -292,11 +293,12 @@ def hyper_grid_search(model, estimator):
     est = model.estimators[algo]
 
     # Extract model data
+    X_train = apply_feature_selection_support(model, algo, model.X_train)
     try:
         support = model.support[algo]
-        X_train = model.X_train[:, support]
-    except:
-        X_train = model.X_train
+        X_train = X_train[:, support]
+    except KeyError:  # no RFE mask for this algorithm; slicing errors must surface
+        pass
     y_train = model.y_train
 
     # Clone estimator and set to single-threaded to avoid nested parallelism with Optuna
diff --git a/alphapy/plots.py b/alphapy/plots.py
index e42debf..8da6520 100644
--- a/alphapy/plots.py
+++ b/alphapy/plots.py
@@ -407,7 +407,7 @@ def plot_importances(model, partition):
     for algo in model.algolist:
         tag = USEP.join([pstring, algo])
         # LOFO Feature Importances
-        if fs_lofo:
+        if fs_lofo and algo in model.lofo_df:
             logger.info("LOFO Importances for Algorithm: %s", algo)
             importance_df = model.lofo_df[algo].copy()
             importance_df['color'] = (importance_df['importance_mean'] > 0).map({True: 'g', False: 'r'})
diff --git a/tests/test_model_feature_selection.py b/tests/test_model_feature_selection.py
new file mode 100644
index 0000000..eb9ded7
--- /dev/null
+++ b/tests/test_model_feature_selection.py
@@ -0,0 +1,79 @@
+import polars as pl
+from sklearn.base import BaseEstimator
+from sklearn.base import ClassifierMixin
+
+from alphapy.globals import ModelType
+from alphapy.model import first_fit
+from alphapy.model import Model
+
+
+class ShapeRecordingClassifier(BaseEstimator, ClassifierMixin):
+    _estimator_type = "classifier"
+
+    def fit(self, X, y):
+        self.n_features_seen_ = X.shape[1]
+        self.classes_ = sorted(set(y))
+        return self
+
+    def predict(self, X):
+        return [self.classes_[0]] * X.shape[0]
+
+
+def make_model():
+    model = Model(
+        {
+            "algorithms": ["TEST"],
+            "cv_folds": 2,
+            "esr": 10,
+            "model_type": ModelType.classification,
+            "n_jobs": 1,
+            "scorer": "accuracy",
+            "seed": 13,
+            "shuffle": False,
+            "split": 0.25,
+            "ts_option": False,
+            "verbosity": 0,
+        }
+    )
+    model.X_train = pl.DataFrame(
+        {
+            "a": [0, 1, 2, 3, 4, 5],
+            "b": [5, 4, 3, 2, 1, 0],
+            "c": [1, 1, 0, 0, 1, 0],
+        }
+    )
+    model.y_train = pl.DataFrame({"target": [0, 1, 0, 1, 0, 1]})
+    model.feature_names = ["a", "b", "c"]
+    return model
+
+
+def test_first_fit_applies_univariate_support():
+    model = make_model()
+    model.feature_map["support_uni"] = [True, False, True]
+
+    model = first_fit(model, "TEST", ShapeRecordingClassifier())
+
+    assert model.estimators["TEST"].n_features_seen_ == 2
+    assert model.fnames_algo["TEST"] == ["a", "c"]
+
+
+def test_first_fit_prefers_lofo_support_over_univariate_support():
+    model = make_model()
+    model.feature_map["support_uni"] = [True, True, True]
+    model.feature_map["support_lofo", "TEST"] = [False, True, False]
+
+    model = first_fit(model, "TEST", ShapeRecordingClassifier())
+
+    assert model.estimators["TEST"].n_features_seen_ == 1
+    assert model.fnames_algo["TEST"] == ["b"]
+
+
+def test_first_fit_uses_univariate_support_when_lofo_selects_no_features():
+    model = make_model()
+    model.feature_map["support_uni"] = [True, False, True]
+    model.feature_map["support_lofo", "TEST"] = [False, False, False]
+
+    model = first_fit(model, "TEST", ShapeRecordingClassifier())
+
+    assert model.estimators["TEST"].n_features_seen_ == 2
+    assert model.fnames_algo["TEST"] == ["a", "c"]