From e4657bfcf9227d7ccbb8794ef8c98ad885175810 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 29 May 2024 21:57:34 -0700 Subject: [PATCH 1/7] loosening numpy and pandas versions --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index de708f8..a898c17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "ISLP" dependencies = ["numpy>=1.7.1", "scipy>=0.9", - "pandas>=0.20", + "pandas>=1.5", "lxml", # pandas needs this for html "scikit-learn>=1.2", "joblib", @@ -15,7 +15,7 @@ dependencies = ["numpy>=1.7.1", ] description = "Library for ISLP labs" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = {file = "LICENSE"} keywords = [] authors = [ From a5d54b3c9816b0194bbee119b79bff70ca0b56e7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 31 Jan 2026 16:27:28 -0800 Subject: [PATCH 2/7] fixes to build requirements --- pyproject.toml | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a898c17..aa30d72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,23 @@ classifiers = ["Development Status :: 3 - Alpha", ] dynamic = ["version"] +[tool.setuptools] +packages = [ + "ISLP", + "ISLP.models", + "ISLP.bart", + "ISLP.torch", + "ISLP.data" +] +include-package-data = true + +[tool.setuptools.package-data] +ISLP = ["data/*.csv", "data/*.npy", "data/*.data"] + +[tool.setuptools.dynamic] +version = {attr = "ISLP.__version__"} # Assuming ISLP.__version__ holds your version + + [project.urls] # Optional "Homepage" = "https://github.com/intro-stat-learning/ISLP" "Bug Reports" = "https://github.com/intro-stat-learning/ISLP/issues" @@ -51,8 +68,14 @@ doc = ['Sphinx>=3.0'] [build-system] requires = ["setuptools>=42", "wheel", - "versioneer[toml]", "Sphinx>=1.0" + "numpy", + "pandas", + "scipy", + "scikit-learn", + "joblib", + "statsmodels", + "versioneer[toml]" ] build-backend = "setuptools.build_meta" From 273a72765e9f875753b4de1b8fb318bd9cc6eb24 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 31 Jan 2026 16:28:31 -0800 Subject: [PATCH 3/7] simplifying setup.py, logic in pyproject.toml --- setup.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index e068529..e0710d3 100755 --- a/setup.py +++ b/setup.py @@ -27,21 +27,21 @@ def main(**extra_args): setup(version=versioneer.get_version(), - packages = ['ISLP', - 'ISLP.models', - 'ISLP.models', - 'ISLP.bart', - 'ISLP.torch', - 'ISLP.data' - ], - ext_modules = EXTS, - package_data = {"ISLP":["data/*csv", "data/*npy", "data/*data"]}, - include_package_data=True, - data_files=[], - scripts=[], - long_description=long_description, - cmdclass = cmdclass, - **extra_args + # packages = ['ISLP', + # 'ISLP.models', + # 'ISLP.models', + # 'ISLP.bart', + # 'ISLP.torch', + # 'ISLP.data' + # ], + # ext_modules = EXTS, + # package_data = {"ISLP":["data/*csv", "data/*npy", "data/*data"]}, + # include_package_data=True, + # data_files=[], + # scripts=[], + # long_description=long_description, + # cmdclass = cmdclass, + # **extra_args ) #simple way to test what setup will do From 4281ada66688c196aed6a5fcb998585e13fa0688 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 31 Jan 2026 16:37:11 -0800 Subject: [PATCH 4/7] fixing _estimator_type flags --- ISLP/models/generic_selector.py | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ISLP/models/generic_selector.py b/ISLP/models/generic_selector.py index 37b1f93..7c9329e 100644 --- a/ISLP/models/generic_selector.py +++ b/ISLP/models/generic_selector.py @@ -28,7 +28,10 @@ import scipy as sp from sklearn.metrics import get_scorer -from sklearn.base import (clone, MetaEstimatorMixin) +from sklearn.base import (clone, + MetaEstimatorMixin, + is_classifier, + is_regressor) from sklearn.model_selection import cross_val_score from joblib import Parallel, delayed @@ -149,13 +152,13 @@ def __init__(self, self.scoring = scoring if scoring is None: - if self.est_._estimator_type == 'classifier': + if is_classifier(self.est_): scoring = 'accuracy' - elif self.est_._estimator_type == 'regressor': + elif is_regressor(self.est_): scoring = 'r2' else: - raise AttributeError('Estimator must ' - 'be a Classifier or Regressor.') + scoring = None + if isinstance(scoring, str): self.scorer = get_scorer(scoring) else: @@ -166,7 +169,7 @@ def __init__(self, # don't mess with this unless testing self._TESTING_INTERRUPT_MODE = False - def fit(self, X, y, groups=None, **params): + def fit(self, X, y, groups=None, **fit_params): """Perform feature selection and learn model from training data. Parameters @@ -183,7 +186,7 @@ def fit(self, X, y, groups=None, **params): groups: array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Passed to the fit method of the cross-validator. - params: various, optional + fit_params: various, optional Additional parameters that are being passed to the estimator. For example, `sample_weights=weights`. @@ -218,7 +221,7 @@ def fit(self, X, y, groups=None, **params): groups=groups, cv=self.cv, pre_dispatch=self.pre_dispatch, - **params) + **fit_params) # keep a running track of the best state @@ -242,7 +245,7 @@ def fit(self, X, y, groups=None, **params): X, y, groups=groups, - **params) + **fit_params) iteration += 1 cur, best_, self.finished_ = self.update_results_check(results_, self.path_, @@ -287,7 +290,7 @@ def fit_transform(self, X, y, groups=None, - **params): + **fit_params): """Fit to training data then reduce X to its most important features. Parameters @@ -304,7 +307,7 @@ def fit_transform(self, groups: array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Passed to the fit method of the cross-validator. - params: various, optional + fit_params: various, optional Additional parameters that are being passed to the estimator. For example, `sample_weights=weights`. @@ -313,7 +316,7 @@ def fit_transform(self, Reduced feature subset of X, shape={n_samples, k_features} """ - self.fit(X, y, groups=groups, **params) + self.fit(X, y, groups=groups, **fit_params) return self.transform(X) def get_metric_dict(self, confidence_interval=0.95): @@ -368,7 +371,7 @@ def _batch(self, X, y, groups=None, - **params): + **fit_params): results = [] @@ -388,7 +391,7 @@ def _batch(self, groups=groups, cv=self.cv, pre_dispatch=self.pre_dispatch, - **params) + **fit_params) for state in candidates) for state, scores in work: @@ -484,8 +487,11 @@ def _calc_score(estimator, groups=None, cv=None, pre_dispatch='2*n_jobs', - **params): + **fit_params): + if scorer is None: + scorer = lambda estimator, X, y: estimator.score(X, y) + X_state = build_submodel(X, state) if cv: @@ -497,11 +503,11 @@ def _calc_score(estimator, scoring=scorer, n_jobs=1, pre_dispatch=pre_dispatch, - params=params) + fit_params=fit_params) else: estimator.fit(X_state, y, - **params) + **fit_params) scores = np.array([scorer(estimator, X_state, y)]) From 4d9f0286be6c94834c9a0657a0aa7c42daf0093f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 2 Feb 2026 12:34:49 -0800 Subject: [PATCH 5/7] feat: Add scikit-learn estimator type tags Adds the `__sklearn_tags__` method to the `sklearn_sm` and `sklearn_selected` wrappers. This allows scikit-learn to correctly identify the estimator type (regressor or classifier) based on the statsmodels model. This change enables the use of scikit-learn's cross-validation and model selection tools with these wrappers. Tests have been added to verify that OLS and GLM Binomial models are correctly identified. --- ISLP/models/sklearn_wrap.py | 15 +++++++++- pyproject.toml | 2 +- tests/models/test_sklearn_wrap.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 tests/models/test_sklearn_wrap.py diff --git a/ISLP/models/sklearn_wrap.py b/ISLP/models/sklearn_wrap.py index 123130b..121da75 100644 --- a/ISLP/models/sklearn_wrap.py +++ b/ISLP/models/sklearn_wrap.py @@ -49,7 +49,17 @@ def __init__(self, self.model_type = model_type self.model_spec = model_spec self.model_args = model_args - + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if self.model_type == sm.OLS: + tags.estimator_type = 'regressor' + elif (issubclass(self.model_type, sm.GLM) and + 'family' in self.model_args and + isinstance(self.model_args.get('family', None), sm.families.Binomial)): + tags.estimator_type = 'classifier' + return tags + def fit(self, X, y): """ Fit a statsmodel model @@ -171,6 +181,9 @@ def __init__(self, self.cv = cv self.scoring = scoring + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags def fit(self, X, y): """ diff --git a/pyproject.toml b/pyproject.toml index aa30d72..5fe63fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ doc = ['Sphinx>=3.0'] [build-system] requires = ["setuptools>=42", "wheel", - "Sphinx>=1.0" + "Sphinx>=1.0", "numpy", "pandas", "scipy", diff --git a/tests/models/test_sklearn_wrap.py b/tests/models/test_sklearn_wrap.py new file mode 100644 index 0000000..c3616bd --- /dev/null +++ b/tests/models/test_sklearn_wrap.py @@ -0,0 +1,46 @@ + +import numpy as np +import pandas as pd +import statsmodels.api as sm +from sklearn.base import is_classifier, is_regressor +import pytest + +from ISLP.models.sklearn_wrap import sklearn_sm, sklearn_selected +from ISLP.models.model_spec import ModelSpec +from ISLP.models.strategy import min_max + +@pytest.fixture +def model_setup(): + X = pd.DataFrame({'X1': np.random.rand(10), 'X2': np.random.rand(10), 'X3': np.random.rand(10)}) + y = pd.Series(np.random.randint(0, 2, 10)) # For classifier + model_spec_dummy = ModelSpec(['X1', 'X2', 'X3']).fit(X) + min_max_strategy_dummy = min_max(model_spec_dummy, min_terms=1, max_terms=2) + return X, y, model_spec_dummy, min_max_strategy_dummy + +def test_OLS_is_regressor(): + model = sklearn_sm(sm.OLS) + assert model.__sklearn_tags__().estimator_type == 'regressor' + assert is_regressor(model) + +def test_GLM_binomial_is_classifier(): + model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial()}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) + +def test_GLM_binomial_probit_is_classifier(): + model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial(link=sm.families.links.Probit())}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) + + +def test_selected_OLS_is_regressor(model_setup): + X, y, model_spec_dummy, min_max_strategy_dummy = model_setup + model = sklearn_selected(sm.OLS, strategy=min_max_strategy_dummy) + assert model.__sklearn_tags__().estimator_type == 'regressor' + assert is_regressor(model) + +def test_selected_GLM_binomial_is_classifier(model_setup): + X, y, model_spec_dummy, min_max_strategy_dummy = model_setup + model = sklearn_selected(sm.GLM, strategy=min_max_strategy_dummy, model_args={'family': sm.families.Binomial()}) + assert model.__sklearn_tags__().estimator_type == 'classifier' + assert is_classifier(model) From 2beabce8cc2f027cd3f44eb158f47f1b4a40a9c3 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 2 Feb 2026 12:44:26 -0800 Subject: [PATCH 6/7] removing redundant setup.py --- setup.py | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100755 setup.py diff --git a/setup.py b/setup.py deleted file mode 100755 index e0710d3..0000000 --- a/setup.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -''' Installation script for ISLP package ''' - -import os -import sys -from os.path import join as pjoin, dirname, exists -# BEFORE importing distutils, remove MANIFEST. distutils doesn't properly -# update it when the contents of directories change. -if exists('MANIFEST'): os.remove('MANIFEST') - -# Unconditionally require setuptools -import setuptools - -# Package for getting versions from git tags -import versioneer - -from setuptools import setup - -# Define extensions -EXTS = [] - -cmdclass = versioneer.get_cmdclass() - -# get long_description - -long_description = open('README.md', 'rt', encoding='utf-8').read() - -def main(**extra_args): - setup(version=versioneer.get_version(), - # packages = ['ISLP', - # 'ISLP.models', - # 'ISLP.models', - # 'ISLP.bart', - # 'ISLP.torch', - # 'ISLP.data' - # ], - # ext_modules = EXTS, - # package_data = {"ISLP":["data/*csv", "data/*npy", "data/*data"]}, - # include_package_data=True, - # data_files=[], - # scripts=[], - # long_description=long_description, - # cmdclass = cmdclass, - # **extra_args - ) - -#simple way to test what setup will do -#python setup.py install --prefix=/tmp -if __name__ == "__main__": - main() From bcd85989111f50bca3667cc123708b571614cfc6 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 2 Feb 2026 12:49:40 -0800 Subject: [PATCH 7/7] unused pkg_resources import --- ISLP/torch/imdb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ISLP/torch/imdb.py b/ISLP/torch/imdb.py index 617489d..3dfacfe 100644 --- a/ISLP/torch/imdb.py +++ b/ISLP/torch/imdb.py @@ -12,7 +12,6 @@ import torch from torch.utils.data import TensorDataset from scipy.sparse import load_npz -from pkg_resources import resource_filename from pickle import load as load_pickle import urllib