From 147695e3486a5dc3a726aab6075c737e21925b37 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 4 Mar 2024 15:51:31 -0800 Subject: [PATCH 1/4] Allow sklearn to update to current version. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 74d704a..3b92e16 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,7 @@ install_requires = pandas==2.1.4 requests seaborn==0.13.0 - scikit-learn==1.2.1 + scikit-learn>=1.2.1 sklearn_pandas>=2.0.0 tables==3.9.1 tqdm From 9c495241bbb73024ee075eac8a416fe0de135ba2 Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 4 Mar 2024 15:59:09 -0800 Subject: [PATCH 2/4] Replaces deprecated utility decorator. --- afqinsight/_serial_bagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py index fe06d6e..5d5a85a 100644 --- a/afqinsight/_serial_bagging.py +++ b/afqinsight/_serial_bagging.py @@ -25,7 +25,7 @@ from sklearn.ensemble._base import _partition_estimators from sklearn.utils import check_random_state, check_array, indices_to_mask, resample from sklearn.utils.random import sample_without_replacement -from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, @@ -610,7 +610,7 @@ def predict_log_proba(self, X): else: return np.log(self.predict_proba(X)) - @if_delegate_has_method(delegate="base_estimator") + @available_if(lambda self: hasattr(self, "base_estimator")) def decision_function(self, X): """Average of the decision functions of the base classifiers. From 09161074fbe198546f1ac1a6696d41e3a1de382f Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 4 Mar 2024 17:12:51 -0800 Subject: [PATCH 3/4] Update: "base_estimator" => "estimator". 
--- afqinsight/_serial_bagging.py | 28 ++++++------ afqinsight/tests/test_bagging.py | 78 ++++++++++++++++---------------- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py index 5d5a85a..2dd4382 100644 --- a/afqinsight/_serial_bagging.py +++ b/afqinsight/_serial_bagging.py @@ -103,7 +103,7 @@ def _parallel_build_estimators( max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features - support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight") + support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") if not support_sample_weight and sample_weight is not None: raise ValueError("The base estimator doesn't support sample weight") @@ -182,7 +182,7 @@ class SerialBaggingClassifier(BaggingClassifier): Parameters ---------- - base_estimator : object, default=None + estimator : object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. @@ -236,7 +236,7 @@ class SerialBaggingClassifier(BaggingClassifier): Attributes ---------- - base_estimator_ : estimator + estimator_ : estimator The base estimator from which the ensemble is grown. 
n_features_in_ : int @@ -287,7 +287,7 @@ class SerialBaggingClassifier(BaggingClassifier): def __init__( self, - base_estimator=None, + estimator=None, n_estimators=10, *, max_samples=1.0, @@ -301,7 +301,7 @@ def __init__( verbose=0, ): super().__init__( - base_estimator=base_estimator, + estimator=estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, @@ -367,7 +367,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): self._validate_estimator() if max_depth is not None: # pragma: no cover - self.base_estimator_.max_depth = max_depth + self.estimator_.max_depth = max_depth # Validate max_samples if max_samples is None: # pragma: no cover @@ -568,7 +568,7 @@ def predict_log_proba(self, X): classes corresponds to that in the attribute :term:`classes_`. """ check_is_fitted(self) - if hasattr(self.base_estimator_, "predict_log_proba"): + if hasattr(self.estimator_, "predict_log_proba"): # Check data X = check_array( X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False @@ -610,7 +610,7 @@ def predict_log_proba(self, X): else: return np.log(self.predict_proba(X)) - @available_if(lambda self: hasattr(self, "base_estimator")) + @available_if(lambda self: hasattr(self, "estimator")) def decision_function(self, X): """Average of the decision functions of the base classifiers. @@ -690,7 +690,7 @@ class SerialBaggingRegressor(BaggingRegressor): Parameters ---------- - base_estimator : object, default=None + estimator : object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. @@ -745,7 +745,7 @@ class SerialBaggingRegressor(BaggingRegressor): Attributes ---------- - base_estimator_ : estimator + estimator_ : estimator The base estimator from which the ensemble is grown. n_features_in_ : int @@ -780,7 +780,7 @@ class SerialBaggingRegressor(BaggingRegressor): >>> X, y = make_regression(n_samples=100, n_features=4, ... 
n_informative=2, n_targets=1, ... random_state=0, shuffle=False) - >>> regr = BaggingRegressor(base_estimator=SVR(), + >>> regr = BaggingRegressor(estimator=SVR(), ... n_estimators=10, random_state=0).fit(X, y) >>> regr.predict([[0, 0, 0, 0]]) array([-2.8720...]) @@ -803,7 +803,7 @@ class SerialBaggingRegressor(BaggingRegressor): def __init__( self, - base_estimator=None, + estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, @@ -816,7 +816,7 @@ def __init__( verbose=0, ): super().__init__( - base_estimator=base_estimator, + estimator=estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, @@ -881,7 +881,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): self._validate_estimator() if max_depth is not None: # pragma: no cover - self.base_estimator_.max_depth = max_depth + self.estimator_.max_depth = max_depth # Validate max_samples if max_samples is None: # pragma: no cover diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py index 09bd10e..354cfa7 100644 --- a/afqinsight/tests/test_bagging.py +++ b/afqinsight/tests/test_bagging.py @@ -69,7 +69,7 @@ def test_classification(): } ) - for base_estimator in [ + for estimator in [ None, DummyClassifier(), Perceptron(), @@ -79,7 +79,7 @@ def test_classification(): ]: for params in grid: SerialBaggingClassifier( - base_estimator=base_estimator, random_state=rng, **params + estimator=estimator, random_state=rng, **params ).fit(X_train, y_train).predict(X_test) @@ -127,7 +127,7 @@ def fit(self, X, y): ]: # Trained on sparse format sparse_classifier = SerialBaggingClassifier( - base_estimator=CustomSVC(decision_function_shape="ovr"), + estimator=CustomSVC(decision_function_shape="ovr"), random_state=1, **params, ).fit(X_train_sparse, y_train) @@ -135,7 +135,7 @@ def fit(self, X, y): # Trained on dense format dense_classifier = SerialBaggingClassifier( - base_estimator=CustomSVC(decision_function_shape="ovr"), + 
estimator=CustomSVC(decision_function_shape="ovr"), random_state=1, **params, ).fit(X_train, y_train) @@ -163,7 +163,7 @@ def test_regression(): } ) - for base_estimator in [ + for estimator in [ None, DummyRegressor(), DecisionTreeRegressor(), @@ -171,9 +171,9 @@ def test_regression(): SVR(), ]: for params in grid: - SerialBaggingRegressor( - base_estimator=base_estimator, random_state=rng, **params - ).fit(X_train, y_train).predict(X_test) + SerialBaggingRegressor(estimator=estimator, random_state=rng, **params).fit( + X_train, y_train + ).predict(X_test) def test_sparse_regression(): @@ -214,15 +214,13 @@ def fit(self, X, y): for params in parameter_sets: # Trained on sparse format sparse_classifier = SerialBaggingRegressor( - base_estimator=CustomSVR(), random_state=1, **params + estimator=CustomSVR(), random_state=1, **params ).fit(X_train_sparse, y_train) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_results = ( - SerialBaggingRegressor( - base_estimator=CustomSVR(), random_state=1, **params - ) + SerialBaggingRegressor(estimator=CustomSVR(), random_state=1, **params) .fit(X_train, y_train) .predict(X_test) ) @@ -251,33 +249,33 @@ def test_bootstrap_samples(): diabetes.data, diabetes.target, random_state=rng ) - base_estimator = DecisionTreeRegressor().fit(X_train, y_train) + estimator = DecisionTreeRegressor().fit(X_train, y_train) # without bootstrap, all trees are perfect on the training set ensemble = SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + estimator=DecisionTreeRegressor(), max_samples=1.0, bootstrap=False, random_state=rng, ).fit(X_train, y_train) - assert base_estimator.score(X_train, y_train) == ensemble.score(X_train, y_train) + assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train) # with bootstrap, trees are no longer perfect on the training set ensemble = SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + 
estimator=DecisionTreeRegressor(), max_samples=1.0, bootstrap=True, random_state=rng, ).fit(X_train, y_train) - assert base_estimator.score(X_train, y_train) > ensemble.score(X_train, y_train) + assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train) # check that each sampling correspond to a complete bootstrap resample. # the size of each bootstrap should be the same as the input data but # the data should be different (checked using the hash of the data). ensemble = SerialBaggingRegressor( - base_estimator=DummySizeEstimator(), bootstrap=True + estimator=DummySizeEstimator(), bootstrap=True ).fit(X_train, y_train) training_hash = [] for estimator in ensemble.estimators_: @@ -294,7 +292,7 @@ def test_bootstrap_features(): ) ensemble = SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + estimator=DecisionTreeRegressor(), max_features=1.0, bootstrap_features=False, random_state=rng, @@ -304,7 +302,7 @@ def test_bootstrap_features(): assert diabetes.data.shape[1] == np.unique(features).shape[0] ensemble = SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + estimator=DecisionTreeRegressor(), max_features=1.0, bootstrap_features=True, random_state=rng, @@ -324,7 +322,7 @@ def test_probability(): with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = SerialBaggingClassifier( - base_estimator=DecisionTreeClassifier(), random_state=rng + estimator=DecisionTreeClassifier(), random_state=rng ).fit(X_train, y_train) assert_array_almost_equal( @@ -337,7 +335,7 @@ def test_probability(): # Degenerate case, where some classes are missing ensemble = SerialBaggingClassifier( - base_estimator=LogisticRegression(), random_state=rng, max_samples=5 + estimator=LogisticRegression(), random_state=rng, max_samples=5 ).fit(X_train, y_train) assert_array_almost_equal( @@ -357,9 +355,9 @@ def test_oob_score_classification(): iris.data, iris.target, random_state=rng ) - for base_estimator in 
[DecisionTreeClassifier(), SVC()]: + for estimator in [DecisionTreeClassifier(), SVC()]: clf = SerialBaggingClassifier( - base_estimator=base_estimator, + estimator=estimator, n_estimators=100, bootstrap=True, oob_score=True, @@ -374,7 +372,7 @@ def test_oob_score_classification(): assert_warns( UserWarning, SerialBaggingClassifier( - base_estimator=base_estimator, + estimator=estimator, n_estimators=1, bootstrap=True, oob_score=True, @@ -394,7 +392,7 @@ def test_oob_score_regression(): ) clf = SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + estimator=DecisionTreeRegressor(), n_estimators=50, bootstrap=True, oob_score=True, @@ -409,7 +407,7 @@ def test_oob_score_regression(): assert_warns( UserWarning, SerialBaggingRegressor( - base_estimator=DecisionTreeRegressor(), + estimator=DecisionTreeRegressor(), n_estimators=1, bootstrap=True, oob_score=True, @@ -428,7 +426,7 @@ def test_single_estimator(): ) clf1 = SerialBaggingRegressor( - base_estimator=KNeighborsRegressor(), + estimator=KNeighborsRegressor(), n_estimators=1, bootstrap=False, bootstrap_features=False, @@ -464,7 +462,7 @@ def test_error(): ) # Test support of decision_function - assert not hasattr(SerialBaggingClassifier(base).fit(X, y), "decision_function") + assert hasattr(SerialBaggingClassifier(base).fit(X, y), "decision_function") def test_parallel_classification(): @@ -555,15 +553,15 @@ def test_gridsearch(): y[y == 2] = 1 # Grid search with scoring based on decision_function - parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)} + parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)} GridSearchCV(SerialBaggingClassifier(SVC()), parameters, scoring="roc_auc").fit( X, y ) -def test_base_estimator(): - # Check base_estimator and its default values. +def test_estimator(): + # Check estimator and its default values. 
rng = check_random_state(0) # Classification @@ -575,19 +573,19 @@ def test_base_estimator(): X_train, y_train ) - assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier) + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) ensemble = SerialBaggingClassifier( DecisionTreeClassifier(), n_jobs=3, random_state=0 ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier) + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) ensemble = SerialBaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit( X_train, y_train ) - assert isinstance(ensemble.base_estimator_, Perceptron) + assert isinstance(ensemble.estimator_, Perceptron) # Regression X_train, X_test, y_train, y_test = train_test_split( @@ -598,18 +596,18 @@ def test_base_estimator(): X_train, y_train ) - assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor) + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) ensemble = SerialBaggingRegressor( DecisionTreeRegressor(), n_jobs=3, random_state=0 ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor) + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) ensemble = SerialBaggingRegressor(SVR(), n_jobs=3, random_state=0).fit( X_train, y_train ) - assert isinstance(ensemble.base_estimator_, SVR) + assert isinstance(ensemble.estimator_, SVR) def test_bagging_with_pipeline(): @@ -802,7 +800,7 @@ def test_estimators_samples_deterministic(): SparseRandomProjection(n_components=2), LogisticRegression() ) clf = SerialBaggingClassifier( - base_estimator=base_pipeline, max_samples=0.5, random_state=0 + estimator=base_pipeline, max_samples=0.5, random_state=0 ) clf.fit(X, y) pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy() @@ -942,7 +940,7 @@ def fit(self, X, y): self._sample_indices = y clf = SerialBaggingRegressor( - base_estimator=MyEstimator(), n_estimators=1, random_state=0 + estimator=MyEstimator(), 
n_estimators=1, random_state=0 ) clf.fit(X, y) From df47b7ff04305e29c07ba4371dbe59a83a112e5e Mon Sep 17 00:00:00 2001 From: Ariel Rokem Date: Mon, 4 Mar 2024 17:20:14 -0800 Subject: [PATCH 4/4] More "base_estimator" => "estimator". --- afqinsight/pipeline.py | 2 +- afqinsight/tests/test_pipelines.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/afqinsight/pipeline.py b/afqinsight/pipeline.py index ad5e8a6..c218051 100755 --- a/afqinsight/pipeline.py +++ b/afqinsight/pipeline.py @@ -273,7 +273,7 @@ def call_with_kwargs(Transformer, kwargs): else: ensembler_kwargs = {} - ensembler_kwargs["base_estimator"] = base_estimator + ensembler_kwargs["estimator"] = base_estimator if isinstance(ensemble_meta_estimator, str): if ensemble_meta_estimator.lower() == "bagging": diff --git a/afqinsight/tests/test_pipelines.py b/afqinsight/tests/test_pipelines.py index 5e27bae..1042192 100644 --- a/afqinsight/tests/test_pipelines.py +++ b/afqinsight/tests/test_pipelines.py @@ -113,9 +113,9 @@ def test_classifier_pipeline_steps( else: assert isinstance(pipeline.named_steps["estimate"], EnsembleStep) # nosec ensemble_params = pipeline.named_steps["estimate"].get_params() - correct_params = EnsembleStep(base_estimator=EstimatorStep()).get_params() - ensemble_base_est = ensemble_params.pop("base_estimator") - correct_params.pop("base_estimator") + correct_params = EnsembleStep(estimator=EstimatorStep()).get_params() + ensemble_base_est = ensemble_params.pop("estimator") + correct_params.pop("estimator") assert ensemble_params == correct_params # nosec assert isinstance(ensemble_base_est, EstimatorStep) # nosec else: @@ -132,9 +132,9 @@ def test_classifier_pipeline_steps( pipeline.named_steps["estimate"].regressor, EnsembleStep ) ensemble_params = pipeline.named_steps["estimate"].regressor.get_params() - correct_params = EnsembleStep(base_estimator=EstimatorStep()).get_params() - ensemble_base_est = ensemble_params.pop("base_estimator") - 
correct_params.pop("base_estimator") + correct_params = EnsembleStep(estimator=EstimatorStep()).get_params() + ensemble_base_est = ensemble_params.pop("estimator") + correct_params.pop("estimator") assert ensemble_params == correct_params # nosec assert isinstance(ensemble_base_est, EstimatorStep) # nosec