From ac6b6b71f86950b7bd6def48f1df74963e0e1f3c Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Mon, 17 Feb 2025 15:38:17 +0000
Subject: [PATCH 01/11] Added dynamic polynomial degree allocation system to the polynomial regression model

---
 src/smlp_py/train_sklearn.py | 42 ++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index edbf7b59..3e3a7700 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -2,6 +2,8 @@
 # This file is part of smlp.
 
 # Fitting sklearn regression tree models
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 #from sklearn.tree import _tree
 from sklearn import tree, ensemble
@@ -379,17 +381,39 @@ def df_cols_summary(self, df):
 
     # train polynomial regression model with sklearn
     def poly_train(self, input_names, resp_names, hparam_dict,
-            X_train, X_test, y_train, y_test, weights):
-        #print('poly_degree', degree); print('weigts', weights);
+            X_train, X_test, y_train, y_test, weights):
+        # Extract hyperparameters
         hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
-        degree = hparam_dict_local['degree']
+
+        # Define range for automatic polynomial degree selection
+        max_degree = 3  # Internal choice, can be changed if needed
+        param_grid = {'polynomialfeatures__degree': range(1, max_degree + 1)}
         hparam_dict_local.pop('degree')
-        poly_reg = PolynomialFeatures(degree)
-        X_train = poly_reg.fit_transform(X_train)
-        X_test = poly_reg.transform(X_test)
-        pol_reg = LinearRegression(**hparam_dict_local)
-        model = pol_reg.fit(X_train, y_train, sample_weight=weights)
-        assert(model == pol_reg)
+
+        # Create pipeline for polynomial regression
+        pipeline = Pipeline([
+            ('polynomialfeatures', PolynomialFeatures()),
+            ('linearregression', LinearRegression(**hparam_dict_local))
+        ])
+
+        # Perform cross-validation to find the best polynomial degree
+        grid_search = GridSearchCV(
+            pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
+        )
+
+        # Pass sample_weight only to LinearRegression.fit()
+        grid_search.fit(X_train, y_train, **{'linearregression__sample_weight': weights})
+
+        # Get best model and degree
+        best_model = grid_search.best_estimator_
+        best_degree = grid_search.best_params_['polynomialfeatures__degree']
+
+        print(f"Automatically selected best polynomial degree: {best_degree}")
+
+        # Use the best polynomial transformation and model
+        poly_reg = best_model.named_steps['polynomialfeatures']
+        model = best_model.named_steps['linearregression']
+
         return model, poly_reg #, X_train, X_test
 
 
 # model for sklearn poly model is in fact a pair (linear_model, poly_reg), where
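Note on PATCH 01: the fit-parameter routing used above relies on scikit-learn's documented Pipeline convention that keyword arguments named <step>__<param> are forwarded to the fit() method of the named step. A minimal self-contained sketch of the same degree search, with X, y and w as made-up stand-ins for the real training data:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(0)
    X = rng.uniform(-2, 2, size=(200, 2))            # placeholder inputs
    y = 1 + X[:, 0] - 3 * X[:, 1]**2 + 0.1 * rng.normal(size=200)
    w = np.ones(200)                                 # uniform sample weights

    pipe = Pipeline([('polynomialfeatures', PolynomialFeatures()),
                     ('linearregression', LinearRegression())])
    gs = GridSearchCV(pipe, {'polynomialfeatures__degree': range(1, 4)},
                      cv=5, scoring='neg_mean_squared_error')
    # 'linearregression__sample_weight' reaches LinearRegression.fit() only
    gs.fit(X, y, linearregression__sample_weight=w)
    print(gs.best_params_)   # typically {'polynomialfeatures__degree': 2}

Since GridSearchCV refits the winning configuration on the full training set by default, best_estimator_ is ready for prediction without a separate fit.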
From b64e31b38dd31cbb63604209b72420bfa8602840 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Mon, 24 Feb 2025 20:42:16 +0000
Subject: [PATCH 02/11] Improved polynomial model's performance by adding ridge regularization with dynamic alpha value selection

---
 src/smlp_py/train_sklearn.py | 49 ++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index 3e3a7700..f9305276 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -9,8 +9,9 @@
 from sklearn import tree, ensemble
 
 # Fitting sklearn polynomial regression model
-from sklearn.preprocessing import PolynomialFeatures
-from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures, StandardScaler
+from sklearn.linear_model import ElasticNet
+from sklearn.linear_model import Ridge
 
 # general
 import numpy as np
@@ -381,40 +382,50 @@ def df_cols_summary(self, df):
 
     # train polynomial regression model with sklearn
     def poly_train(self, input_names, resp_names, hparam_dict,
-            X_train, X_test, y_train, y_test, weights):
+            X_train, X_test, y_train, y_test, weights):
         # Extract hyperparameters
         hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
+        hparam_dict_local.pop('degree', None)  # Remove 'degree' if present
+        hparam_dict_local.pop('n_jobs', None)  # Remove 'n_jobs' to avoid parallel conflicts
 
-        # Define range for automatic polynomial degree selection
-        max_degree = 3  # Internal choice, can be changed if needed
-        param_grid = {'polynomialfeatures__degree': range(1, max_degree + 1)}
-        hparam_dict_local.pop('degree')
+        # Define polynomial degree range (1 to 3)
+        max_degree = 3
 
-        # Create pipeline for polynomial regression
+        # Alpha range dynamically chosen to cover small and large values
+        alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
+
+        param_grid = {
+            'polynomialfeatures__degree': range(1, max_degree + 1),
+            'ridge__alpha': alphas  # Cross-validate over multiple alpha values
+        }
+
+        # Create pipeline with Standardization and Ridge Regression
         pipeline = Pipeline([
-            ('polynomialfeatures', PolynomialFeatures()),
-            ('linearregression', LinearRegression(**hparam_dict_local))
+            ('scaler', StandardScaler()),  # Standardizes input features
+            ('polynomialfeatures', PolynomialFeatures()),  # Generates polynomial features
+            ('ridge', Ridge())  # Ridge regression with dynamically tuned alpha
         ])
 
-        # Perform cross-validation to find the best polynomial degree
+        # Perform cross-validation to find best polynomial degree and alpha
         grid_search = GridSearchCV(
             pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
         )
-
-        # Pass sample_weight only to LinearRegression.fit()
-        grid_search.fit(X_train, y_train, **{'linearregression__sample_weight': weights})
 
-        # Get best model and degree
+        # Train model using sample weights
+        grid_search.fit(X_train, y_train, **{'ridge__sample_weight': weights})
+
+        # Retrieve best model, degree, and alpha
        best_model = grid_search.best_estimator_
        best_degree = grid_search.best_params_['polynomialfeatures__degree']
+        best_alpha = grid_search.best_params_['ridge__alpha']
 
-        print(f"Automatically selected best polynomial degree: {best_degree}")
+        print(f"Best polynomial degree: {best_degree}, Best alpha: {best_alpha}")
 
-        # Use the best polynomial transformation and model
+        # Extract best polynomial transformer and Ridge model
         poly_reg = best_model.named_steps['polynomialfeatures']
-        model = best_model.named_steps['linearregression']
+        model = best_model.named_steps['ridge']
 
-        return model, poly_reg #, X_train, X_test
+        return model, poly_reg
 
 
 # model for sklearn poly model is in fact a pair (linear_model, poly_reg), where
 # poly_reg is transformer that creates polynomial terems (like x^2) from the original
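Note on PATCH 02: the enumerated alpha grid is a log-spaced sweep from 1e-4 to 1e2; combined with 3 candidate degrees it yields 3 x 7 = 21 pipeline configurations, each fitted 5 times under cv=5. A quick, illustrative equivalence check for the grid:

    import numpy as np

    alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    assert np.allclose(alphas, np.logspace(-4, 2, 7))   # same log-spaced grid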
From e4402131242a7532b3c306e07c30bb53950bf072 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Tue, 25 Feb 2025 14:19:38 +0000
Subject: [PATCH 03/11] Further improvements to the polynomial model to enhance the optimisation process

---
 src/smlp_py/train_sklearn.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index f9305276..f858ca64 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -10,7 +10,6 @@
 
 # Fitting sklearn polynomial regression model
 from sklearn.preprocessing import PolynomialFeatures, StandardScaler
-from sklearn.linear_model import ElasticNet
 from sklearn.linear_model import Ridge
 
 # general
@@ -380,7 +379,6 @@ def df_cols_summary(self, df):
         'z_min', (df[col].min() - df[col].mean())/df[col].std(),
         'z_max', (df[col].max() - df[col].mean())/df[col].std())
 
-    # train polynomial regression model with sklearn
     def poly_train(self, input_names, resp_names, hparam_dict,
             X_train, X_test, y_train, y_test, weights):
         # Extract hyperparameters
@@ -419,7 +417,7 @@ def poly_train(self, input_names, resp_names, hparam_dict,
         best_degree = grid_search.best_params_['polynomialfeatures__degree']
         best_alpha = grid_search.best_params_['ridge__alpha']
 
-        print(f"Best polynomial degree: {best_degree}, Best alpha: {best_alpha}")
+        print(f"\nBest polynomial degree: {best_degree}, Best alpha: {best_alpha}\n")
 
         # Extract best polynomial transformer and Ridge model
         poly_reg = best_model.named_steps['polynomialfeatures']
From 1cb7e43fc352023e36ce2395bd3f55f752576a48 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Wed, 5 Mar 2025 09:31:33 +0000
Subject: [PATCH 04/11] Minor changes to the polynomial model

---
 src/smlp_py/train_sklearn.py | 37 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index f858ca64..e15a9329 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -2,15 +2,15 @@
 # This file is part of smlp.
 
 # Fitting sklearn regression tree models
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 #from sklearn.tree import _tree
 from sklearn import tree, ensemble
 
 # Fitting sklearn polynomial regression model
-from sklearn.preprocessing import PolynomialFeatures, StandardScaler
-from sklearn.linear_model import Ridge
+from sklearn.preprocessing import PolynomialFeatures, RobustScaler
+from sklearn.linear_model import Ridge, ElasticNet
 
 # general
 import numpy as np
@@ -381,47 +381,44 @@ def df_cols_summary(self, df):
 
     def poly_train(self, input_names, resp_names, hparam_dict,
             X_train, X_test, y_train, y_test, weights):
-        # Extract hyperparameters
+
         hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
-        hparam_dict_local.pop('degree', None)  # Remove 'degree' if present
-        hparam_dict_local.pop('n_jobs', None)  # Remove 'n_jobs' to avoid parallel conflicts
+        hparam_dict_local.pop('degree', None)
+        hparam_dict_local.pop('n_jobs', None)
 
-        # Define polynomial degree range (1 to 3)
         max_degree = 3
 
-        # Alpha range dynamically chosen to cover small and large values
         alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
 
         param_grid = {
             'polynomialfeatures__degree': range(1, max_degree + 1),
-            'ridge__alpha': alphas  # Cross-validate over multiple alpha values
+            'ridge__alpha': alphas
         }
 
-        # Create pipeline with Standardization and Ridge Regression
         pipeline = Pipeline([
-            ('scaler', StandardScaler()),  # Standardizes input features
-            ('polynomialfeatures', PolynomialFeatures()),  # Generates polynomial features
-            ('ridge', Ridge())  # Ridge regression with dynamically tuned alpha
+            ('scaler', RobustScaler()),
+            ('polynomialfeatures', PolynomialFeatures()),
+            ('ridge', Ridge())
         ])
 
-        # Perform cross-validation to find best polynomial degree and alpha
-        grid_search = GridSearchCV(
-            pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
-        )
 
-        # Train model using sample weights
+        cv = KFold(n_splits=5, shuffle=True, random_state=42)
+        grid_search = RandomizedSearchCV(
+            pipeline, param_grid, cv=cv, scoring='r2', n_jobs=-1, n_iter=100
+        )
         grid_search.fit(X_train, y_train, **{'ridge__sample_weight': weights})
 
-        # Retrieve best model, degree, and alpha
         best_model = grid_search.best_estimator_
         best_degree = grid_search.best_params_['polynomialfeatures__degree']
         best_alpha = grid_search.best_params_['ridge__alpha']
 
         print(f"\nBest polynomial degree: {best_degree}, Best alpha: {best_alpha}\n")
 
-        # Extract best polynomial transformer and Ridge model
         poly_reg = best_model.named_steps['polynomialfeatures']
         model = best_model.named_steps['ridge']
+
+        test_score = best_model.score(X_test, y_test)
+        print(f"Test set R^2: {test_score}")
 
         return model, poly_reg
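Note on PATCH 04: RandomizedSearchCV with n_iter=100 is asked for more draws than the 21 distinct parameter combinations above. When every parameter is given as a finite list, scikit-learn's sampler draws without replacement and, to the best of my knowledge, falls back to exhausting the grid (with a warning) when n_iter exceeds its size, so this search behaves like a plain grid search. A sketch of that behavior using the grid from the patch:

    from sklearn.model_selection import ParameterSampler

    grid = {'polynomialfeatures__degree': range(1, 4),
            'ridge__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
    # 3 * 7 = 21 distinct candidates; requesting 100 exhausts the grid
    candidates = list(ParameterSampler(grid, n_iter=100, random_state=42))
    print(len(candidates))   # 21, not 100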
From f01dcfeeb0ea328086d145745f8ec1eca0d84d42 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Wed, 5 Mar 2025 17:11:41 +0000
Subject: [PATCH 05/11] Improvements to bring the polynomial model back to its original results

---
 src/smlp_py/train_sklearn.py | 49 +++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index e15a9329..a8e1c3f7 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -10,7 +10,8 @@
 
 # Fitting sklearn polynomial regression model
 from sklearn.preprocessing import PolynomialFeatures, RobustScaler
-from sklearn.linear_model import Ridge, ElasticNet
+from sklearn.linear_model import Ridge, LinearRegression
+from sklearn.metrics import mean_squared_error
 
 # general
 import numpy as np
@@ -381,46 +382,54 @@ def df_cols_summary(self, df):
 
     def poly_train(self, input_names, resp_names, hparam_dict,
             X_train, X_test, y_train, y_test, weights):
-
+
         hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
         hparam_dict_local.pop('degree', None)
         hparam_dict_local.pop('n_jobs', None)
 
         max_degree = 3
-        alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
+        mse_values = []
+        degrees = range(1, max_degree + 1)
+
+        for d in degrees:
+            poly = PolynomialFeatures(degree=d)
+            X_poly_train = poly.fit_transform(X_train)
+            model = LinearRegression().fit(X_poly_train, y_train)
+            y_pred = model.predict(X_poly_train)
+            mse = mean_squared_error(y_train, y_pred)
+            mse_values.append(mse)
 
-        param_grid = {
-            'polynomialfeatures__degree': range(1, max_degree + 1),
-            'ridge__alpha': alphas
-        }
+        second_derivatives = np.diff(mse_values, n=2)
+        best_degree = degrees[np.argmin(second_derivatives) + 1]
 
         pipeline = Pipeline([
             ('scaler', RobustScaler()),
-            ('polynomialfeatures', PolynomialFeatures()),
-            ('ridge', Ridge())
+            ('polynomialfeatures', PolynomialFeatures(degree=best_degree)),
         ])
 
+        param_grid = {'polynomialfeatures__degree': [best_degree]}
 
-        cv = KFold(n_splits=5, shuffle=True, random_state=42)
+        cv = KFold(n_splits=10, shuffle=True, random_state=42)
         grid_search = RandomizedSearchCV(
-            pipeline, param_grid, cv=cv, scoring='r2', n_jobs=-1, n_iter=100
+            pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, n_iter=2000
         )
-        grid_search.fit(X_train, y_train, **{'ridge__sample_weight': weights})
+        grid_search.fit(X_train, y_train)
 
         best_model = grid_search.best_estimator_
-        best_degree = grid_search.best_params_['polynomialfeatures__degree']
-        best_alpha = grid_search.best_params_['ridge__alpha']
 
-        print(f"\nBest polynomial degree: {best_degree}, Best alpha: {best_alpha}\n")
+        print(f"\nBest polynomial degree: {best_degree}\n")
 
         poly_reg = best_model.named_steps['polynomialfeatures']
-        model = best_model.named_steps['ridge']
-
-        test_score = best_model.score(X_test, y_test)
+
+        X_poly_train_best = poly_reg.fit_transform(X_train)
+        lin_reg_final = LinearRegression().fit(X_poly_train_best, y_train, sample_weight=weights)
+
+        test_score = lin_reg_final.score(poly_reg.transform(X_test), y_test)
         print(f"Test set R^2: {test_score}")
 
-        return model, poly_reg
+        return lin_reg_final, poly_reg
 
 
 # model for sklearn poly model is in fact a pair (linear_model, poly_reg), where
 # poly_reg is transformer that creates polynomial terems (like x^2) from the original
@@ -473,7 +482,7 @@ def _sklearn_train_multi_response(self, get_model_file_prefix, feat_names, resp_
                 formula_report_file = get_model_file_prefix(None, self._algo_name_local2global(algo)) + '_formula.txt'
                 model_formula = self._instPolyTerms.poly_model_to_term_single_response(feat_names, resp_names,
                     linear_model.coef_, poly_reg.powers_, resp_id, True, formula_report_file)
-                #print('poly model computed', (linear_model, linear_model.coef_, poly_reg, poly_reg.powers_))
+                # print('poly model computed', (linear_model, linear_model.coef_, poly_reg, poly_reg.powers_))
                 return linear_model, poly_reg #, X_train, X_test
             else:
                 raise Exception('Unsupported model type ' + str(algo) + ' in function tree_main')
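Note on PATCH 05: the degree is chosen at the elbow of the training-MSE curve via a discrete second derivative. A worked example of the index arithmetic with made-up MSE values:

    import numpy as np

    degrees = range(1, 4)            # candidate degrees 1, 2, 3
    mse_values = [9.0, 2.0, 1.5]     # hypothetical training MSE per degree
    # np.diff(..., n=2) has len(mse_values) - 2 entries; entry i belongs to
    # degrees[i + 1], hence the '+ 1' offset used in the patch
    second_derivatives = np.diff(mse_values, n=2)      # array([6.5])
    best_degree = degrees[np.argmin(second_derivatives) + 1]
    print(best_degree)               # 2

With max_degree = 3 the second derivative has a single entry, so this criterion always lands on degree 2; that may be why PATCH 06 replaces the heuristic with a direct argmin over the error values.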
From bf8a42f18c4dd27768280d5f19c353f53921f5f4 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Wed, 5 Mar 2025 19:12:29 +0000
Subject: [PATCH 06/11] Minor improvements

---
 src/smlp_py/train_sklearn.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index a8e1c3f7..8516b311 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -9,9 +9,9 @@
 from sklearn import tree, ensemble
 
 # Fitting sklearn polynomial regression model
-from sklearn.preprocessing import PolynomialFeatures, RobustScaler
+from sklearn.preprocessing import PolynomialFeatures, RobustScaler, StandardScaler
 from sklearn.linear_model import Ridge, LinearRegression
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_absolute_error
 
 # general
 import numpy as np
@@ -397,22 +397,20 @@ def poly_train(self, input_names, resp_names, hparam_dict,
             X_poly_train = poly.fit_transform(X_train)
             model = LinearRegression().fit(X_poly_train, y_train)
             y_pred = model.predict(X_poly_train)
-            mse = mean_squared_error(y_train, y_pred)
+            mse = mean_absolute_error(y_train, y_pred)
             mse_values.append(mse)
 
-        second_derivatives = np.diff(mse_values, n=2)
-        best_degree = degrees[np.argmin(second_derivatives) + 1]
+        best_degree = degrees[np.argmin(mse_values)]
 
         pipeline = Pipeline([
-            ('scaler', RobustScaler()),
             ('polynomialfeatures', PolynomialFeatures(degree=best_degree)),
         ])
 
         param_grid = {'polynomialfeatures__degree': [best_degree]}
 
-        cv = KFold(n_splits=10, shuffle=True, random_state=42)
-        grid_search = RandomizedSearchCV(
-            pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, n_iter=2000
+        cv = KFold(n_splits=5, shuffle=True, random_state=42)
+        grid_search = GridSearchCV(
+            pipeline, param_grid, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1
         )
         grid_search.fit(X_train, y_train)
 
@@ -423,10 +421,7 @@ def poly_train(self, input_names, resp_names, hparam_dict,
         poly_reg = best_model.named_steps['polynomialfeatures']
 
         X_poly_train_best = poly_reg.fit_transform(X_train)
-        lin_reg_final = LinearRegression().fit(X_poly_train_best, y_train, sample_weight=weights)
-
-        test_score = lin_reg_final.score(poly_reg.transform(X_test), y_test)
-        print(f"Test set R^2: {test_score}")
+        lin_reg_final = LinearRegression(**hparam_dict_local).fit(X_poly_train_best, y_train, sample_weight=weights)
 
         return lin_reg_final, poly_reg
Using Linear Regression") + lin_reg_final = LinearRegression(**hparam_dict_local).fit(poly_reg.fit_transform(X_train), y_train) + else: + alphas = np.logspace(-6, 1, 50) + ridge_model = RidgeCV(alphas=alphas, store_cv_values=True, scoring='neg_mean_squared_error') + ridge_model.fit(poly_reg.fit_transform(X_train), y_train) - poly_reg = best_model.named_steps['polynomialfeatures'] + best_alpha = ridge_model.alpha_ + print("\nBest alpha found:", best_alpha) + lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd').fit(poly_reg.fit_transform(X_train), y_train) - X_poly_train_best = poly_reg.fit_transform(X_train) - lin_reg_final = LinearRegression(**hparam_dict_local).fit(X_poly_train_best, y_train, sample_weight=weights) + lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd').fit(poly_reg.fit_transform(X_train), y_train) return lin_reg_final, poly_reg @@ -487,7 +485,7 @@ def sklearn_main(self, get_model_file_prefix, feat_names_dict, resp_names, algo, seed, sample_weights_vect, model_per_response): # train a separate models for each response, pack into a dictionary with response names # as keys and the correponding models as values - #print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist()) + print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist()) if model_per_response: model = {} for rn in resp_names: From faed96e60fd52249994a363de5e5f2c64c81b8b5 Mon Sep 17 00:00:00 2001 From: Arnav Mishra Date: Sat, 8 Mar 2025 14:12:35 +0000 Subject: [PATCH 08/11] Major improvement to include the y-intercept into the polynomial equation found by the model --- src/smlp_py/train_sklearn.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py index 89bdffd1..d837d219 100644 --- a/src/smlp_py/train_sklearn.py +++ b/src/smlp_py/train_sklearn.py @@ -386,6 +386,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict) hparam_dict_local.pop('degree', None) hparam_dict_local.pop('n_jobs', None) + hparam_dict_local.pop('fit_intercept', None) max_degree = 3 @@ -396,7 +397,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, mse_values = [] degrees = range(1, max_degree + 1) for d in degrees: - poly = PolynomialFeatures(degree=d) + poly = PolynomialFeatures(degree=d, include_bias=True) X_poly_train = poly.fit_transform(X_train) model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train) y_pred = model.predict(X_poly_train) @@ -409,7 +410,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, if noise_level < 1e-10: print("\nNo noise in the data! 
Using Linear Regression") - lin_reg_final = LinearRegression(**hparam_dict_local).fit(poly_reg.fit_transform(X_train), y_train) + lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train) else: alphas = np.logspace(-6, 1, 50) ridge_model = RidgeCV(alphas=alphas, store_cv_values=True, scoring='neg_mean_squared_error') @@ -417,9 +418,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, best_alpha = ridge_model.alpha_ print("\nBest alpha found:", best_alpha) - lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd').fit(poly_reg.fit_transform(X_train), y_train) - - lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd').fit(poly_reg.fit_transform(X_train), y_train) + lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train) return lin_reg_final, poly_reg @@ -485,7 +484,7 @@ def sklearn_main(self, get_model_file_prefix, feat_names_dict, resp_names, algo, seed, sample_weights_vect, model_per_response): # train a separate models for each response, pack into a dictionary with response names # as keys and the correponding models as values - print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist()) + # print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist()) if model_per_response: model = {} for rn in resp_names: From 66cb4981b1658d0585550170765a1c7f0f2f7ba2 Mon Sep 17 00:00:00 2001 From: Arnav Mishra Date: Fri, 14 Mar 2025 15:41:38 +0000 Subject: [PATCH 09/11] Final improvements to the poly train function --- src/smlp_py/train_sklearn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py index d837d219..90d0c17d 100644 --- a/src/smlp_py/train_sklearn.py +++ b/src/smlp_py/train_sklearn.py @@ -394,23 +394,23 @@ def poly_train(self, input_names, resp_names, hparam_dict, residuals = y_train - lin_reg.predict(X_train) noise_level = np.std(residuals).item() - mse_values = [] + mae_values = [] degrees = range(1, max_degree + 1) for d in degrees: poly = PolynomialFeatures(degree=d, include_bias=True) X_poly_train = poly.fit_transform(X_train) - model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train) + model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train, sample_weight=weights) y_pred = model.predict(X_poly_train) - mse = mean_absolute_error(y_train, y_pred) - mse_values.append(mse) + mae = mean_absolute_error(y_train, y_pred) + mae_values.append(mae) - best_degree = degrees[np.argmin(mse_values)] + best_degree = degrees[np.argmin(mae_values)] poly_reg = PolynomialFeatures(degree=best_degree) if noise_level < 1e-10: print("\nNo noise in the data! 
Using Linear Regression") - lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train) + lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights) else: alphas = np.logspace(-6, 1, 50) ridge_model = RidgeCV(alphas=alphas, store_cv_values=True, scoring='neg_mean_squared_error') @@ -418,7 +418,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, best_alpha = ridge_model.alpha_ print("\nBest alpha found:", best_alpha) - lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train) + lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights) return lin_reg_final, poly_reg From 52eafad3427103d09cb422a14486cc3b636caa1f Mon Sep 17 00:00:00 2001 From: Arnav Mishra Date: Fri, 14 Mar 2025 17:14:46 +0000 Subject: [PATCH 10/11] Improvements for handling multi-response data in the polynomial model --- src/smlp_py/train_sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py index 90d0c17d..194e4abc 100644 --- a/src/smlp_py/train_sklearn.py +++ b/src/smlp_py/train_sklearn.py @@ -391,7 +391,7 @@ def poly_train(self, input_names, resp_names, hparam_dict, max_degree = 3 lin_reg = LinearRegression().fit(X_train, y_train) - residuals = y_train - lin_reg.predict(X_train) + residuals = np.array(y_train - lin_reg.predict(X_train)).flatten() noise_level = np.std(residuals).item() mae_values = [] From 5f47223bcf3b70265998c3fbd300ce0f6db2e3b0 Mon Sep 17 00:00:00 2001 From: Arnav Mishra Date: Fri, 14 Mar 2025 17:23:42 +0000 Subject: [PATCH 11/11] Added documentation and necessary comments for the function --- src/smlp_py/train_sklearn.py | 74 +++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py index 194e4abc..a63bd396 100644 --- a/src/smlp_py/train_sklearn.py +++ b/src/smlp_py/train_sklearn.py @@ -382,46 +382,92 @@ def df_cols_summary(self, df): def poly_train(self, input_names, resp_names, hparam_dict, X_train, X_test, y_train, y_test, weights): + """ + Trains a polynomial regression model using dynamically selected polynomial degree. + If noise is negligible, a simple Linear Regression is used; otherwise, Ridge Regression + with cross-validated alpha is applied. + Parameters: + ---------- + self : object + Instance of the calling class. + input_names : list + Names of the input features. + resp_names : list + Names of the response variables. + hparam_dict : dict + Hyperparameter dictionary for model configuration. + X_train : ndarray or DataFrame + Training feature data. + X_test : ndarray or DataFrame + Testing feature data. + y_train : ndarray or Series + Target values for training. + y_test : ndarray or Series + Target values for testing. + weights : ndarray + Sample weights for weighted regression. + + Returns: + ------- + lin_reg_final : sklearn.linear_model + Trained regression model (either Linear Regression or Ridge Regression). + poly_reg : sklearn.preprocessing.PolynomialFeatures + Polynomial feature transformer with the selected degree. 
From 5f47223bcf3b70265998c3fbd300ce0f6db2e3b0 Mon Sep 17 00:00:00 2001
From: Arnav Mishra
Date: Fri, 14 Mar 2025 17:23:42 +0000
Subject: [PATCH 11/11] Added documentation and necessary comments for the function

---
 src/smlp_py/train_sklearn.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 14 deletions(-)

diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index 194e4abc..a63bd396 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -382,46 +382,92 @@ def df_cols_summary(self, df):
 
     def poly_train(self, input_names, resp_names, hparam_dict,
             X_train, X_test, y_train, y_test, weights):
+        """
+        Trains a polynomial regression model using dynamically selected polynomial degree.
+        If noise is negligible, a simple Linear Regression is used; otherwise, Ridge Regression
+        with cross-validated alpha is applied.
+
+        Parameters:
+        ----------
+        self : object
+            Instance of the calling class.
+        input_names : list
+            Names of the input features.
+        resp_names : list
+            Names of the response variables.
+        hparam_dict : dict
+            Hyperparameter dictionary for model configuration.
+        X_train : ndarray or DataFrame
+            Training feature data.
+        X_test : ndarray or DataFrame
+            Testing feature data.
+        y_train : ndarray or Series
+            Target values for training.
+        y_test : ndarray or Series
+            Target values for testing.
+        weights : ndarray
+            Sample weights for weighted regression.
+
+        Returns:
+        -------
+        lin_reg_final : sklearn.linear_model
+            Trained regression model (either Linear Regression or Ridge Regression).
+        poly_reg : sklearn.preprocessing.PolynomialFeatures
+            Polynomial feature transformer with the selected degree.
+        """
+
+        # Extract hyperparameters and remove non-applicable ones
         hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
-        hparam_dict_local.pop('degree', None)
+        hparam_dict_local.pop('degree', None)
         hparam_dict_local.pop('n_jobs', None)
         hparam_dict_local.pop('fit_intercept', None)
 
-        max_degree = 3
+        max_degree = 3  # Maximum polynomial degree considered
 
+        # Fit initial Linear Regression model to estimate noise level
         lin_reg = LinearRegression().fit(X_train, y_train)
         residuals = np.array(y_train - lin_reg.predict(X_train)).flatten()
-        noise_level = np.std(residuals).item()
+        noise_level = np.std(residuals)  # Measure standard deviation of residuals
 
+        # List to store mean absolute error (MAE) values for different polynomial degrees
         mae_values = []
         degrees = range(1, max_degree + 1)
+
+        # Iterate over polynomial degrees and compute MAE
         for d in degrees:
-            poly = PolynomialFeatures(degree=d, include_bias=True)
-            X_poly_train = poly.fit_transform(X_train)
-            model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train, sample_weight=weights)
-            y_pred = model.predict(X_poly_train)
-            mae = mean_absolute_error(y_train, y_pred)
-            mae_values.append(mae)
+            poly = PolynomialFeatures(degree=d, include_bias=True)  # Generate polynomial features
+            X_poly_train = poly.fit_transform(X_train)  # Transform training data
+            model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train, sample_weight=weights)  # Train model
+            y_pred = model.predict(X_poly_train)  # Make predictions
+            mae = mean_absolute_error(y_train, y_pred)  # Compute MAE
+            mae_values.append(mae)  # Store MAE for current degree
 
+        # Select the polynomial degree with the lowest MAE
         best_degree = degrees[np.argmin(mae_values)]
 
         poly_reg = PolynomialFeatures(degree=best_degree)
 
+        # If noise level is extremely low, use standard Linear Regression
         if noise_level < 1e-10:
             print("\nNo noise in the data! Using Linear Regression")
-            lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights)
+            lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(
+                poly_reg.fit_transform(X_train), y_train, sample_weight=weights
+            )
         else:
-            alphas = np.logspace(-6, 1, 50)
+            # Perform cross-validation to find the best Ridge regularization parameter (alpha)
+            alphas = np.logspace(-6, 1, 50)  # Range of alpha values for RidgeCV
             ridge_model = RidgeCV(alphas=alphas, store_cv_values=True, scoring='neg_mean_squared_error')
             ridge_model.fit(poly_reg.fit_transform(X_train), y_train)
 
+            # Extract the best alpha value
             best_alpha = ridge_model.alpha_
             print("\nBest alpha found:", best_alpha)
 
-            lin_reg_final = Ridge(alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights)
+            # Train final Ridge Regression model with optimized alpha
+            lin_reg_final = Ridge(
+                alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False
+            ).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights)
 
         return lin_reg_final, poly_reg
 
 
 # model for sklearn poly model is in fact a pair (linear_model, poly_reg), where
 # poly_reg is transformer that creates polynomial terems (like x^2) from the original
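Note on the final state: as the trailing comment above says, the trained sklearn 'poly' model is really the pair (linear_model, poly_reg), so prediction must route fresh inputs through the same transformer. A minimal usage sketch, with X_new as a placeholder for data laid out like the training features:

    # model, poly_reg = ... as returned by poly_train(...)
    y_pred = model.predict(poly_reg.transform(X_new))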