109 changes: 93 additions & 16 deletions src/smlp_py/train_sklearn.py
@@ -2,13 +2,16 @@
# This file is part of smlp.

# Fitting sklearn regression tree models
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
#from sklearn.tree import _tree
from sklearn import tree, ensemble

# Fitting sklearn polynomial regression model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, StandardScaler
from sklearn.linear_model import Ridge, LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# general
import numpy as np
@@ -377,20 +380,94 @@ def df_cols_summary(self, df):
'z_min', (df[col].min() - df[col].mean())/df[col].std(),
'z_max', (df[col].max() - df[col].mean())/df[col].std())

# train polynomial regression model with sklearn
def poly_train(self, input_names, resp_names, hparam_dict,
X_train, X_test, y_train, y_test, weights):
#print('poly_degree', degree); print('weigts', weights);
X_train, X_test, y_train, y_test, weights):
"""
Trains a polynomial regression model with a dynamically selected polynomial degree.
If the residual noise is negligible, a plain Linear Regression is used; otherwise, Ridge
Regression with a cross-validated alpha is applied.

Parameters:
----------
self : object
Instance of the calling class.
input_names : list
Names of the input features.
resp_names : list
Names of the response variables.
hparam_dict : dict
Hyperparameter dictionary for model configuration.
X_train : ndarray or DataFrame
Training feature data.
X_test : ndarray or DataFrame
Testing feature data.
y_train : ndarray or Series
Target values for training.
y_test : ndarray or Series
Target values for testing.
weights : ndarray
Sample weights for weighted regression.

Returns:
-------
lin_reg_final : sklearn.linear_model.LinearRegression or sklearn.linear_model.Ridge
Trained regression model (plain Linear Regression when the residual noise is negligible, Ridge with a cross-validated alpha otherwise).
poly_reg : sklearn.preprocessing.PolynomialFeatures
Polynomial feature transformer with the selected degree.
"""

# Extract hyperparameters and remove non-applicable ones
hparam_dict_local = self._hparam_dict_global_to_local('poly', hparam_dict)
degree = hparam_dict_local['degree']
hparam_dict_local.pop('degree')
poly_reg = PolynomialFeatures(degree)
X_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.transform(X_test)
pol_reg = LinearRegression(**hparam_dict_local)
model = pol_reg.fit(X_train, y_train, sample_weight=weights)
assert(model == pol_reg)
return model, poly_reg #, X_train, X_test
hparam_dict_local.pop('degree', None)
hparam_dict_local.pop('n_jobs', None)
hparam_dict_local.pop('fit_intercept', None)

max_degree = 3 # Maximum polynomial degree considered

# Fit initial Linear Regression model to estimate noise level
lin_reg = LinearRegression().fit(X_train, y_train)
residuals = np.array(y_train - lin_reg.predict(X_train)).flatten()
noise_level = np.std(residuals) # Measure standard deviation of residuals

# List to store mean absolute error (MAE) values for different polynomial degrees
mae_values = []
degrees = range(1, max_degree + 1)

# Iterate over polynomial degrees and compute MAE
for d in degrees:
poly = PolynomialFeatures(degree=d, include_bias=True) # Generate polynomial features
X_poly_train = poly.fit_transform(X_train) # Transform training data
model = LinearRegression(**hparam_dict_local).fit(X_poly_train, y_train, sample_weight=weights) # Train model
y_pred = model.predict(X_poly_train) # Make predictions
mae = mean_absolute_error(y_train, y_pred) # Compute MAE
mae_values.append(mae) # Store MAE for current degree

# Select the polynomial degree with the lowest MAE
best_degree = degrees[np.argmin(mae_values)]
poly_reg = PolynomialFeatures(degree=best_degree)

# If noise level is extremely low, use standard Linear Regression
if noise_level < 1e-10:
print("\nNo noise in the data! Using Linear Regression")
lin_reg_final = LinearRegression(**hparam_dict_local, fit_intercept=False).fit(
poly_reg.fit_transform(X_train), y_train, sample_weight=weights
)
else:
# Perform cross-validation to find the best Ridge regularization parameter (alpha)
alphas = np.logspace(-6, 1, 50) # Range of alpha values for RidgeCV
ridge_model = RidgeCV(alphas=alphas, store_cv_values=True, scoring='neg_mean_squared_error')
ridge_model.fit(poly_reg.fit_transform(X_train), y_train)

# Extract the best alpha value
best_alpha = ridge_model.alpha_
print("\nBest alpha found:", best_alpha)

# Train final Ridge Regression model with optimized alpha
lin_reg_final = Ridge(
alpha=best_alpha, **hparam_dict_local, solver='svd', fit_intercept=False
).fit(poly_reg.fit_transform(X_train), y_train, sample_weight=weights)

return lin_reg_final, poly_reg
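For context, the two returned objects are meant to be used together at prediction time: new inputs must be passed through the same PolynomialFeatures transformer before calling predict on the fitted regressor. A minimal usage sketch (illustrative, not part of this diff), assuming X_new is a hypothetical 2-D array of raw, untransformed input features:

# expand new samples to the degree selected inside poly_train, then predict
X_new_poly = poly_reg.transform(X_new)
y_new_pred = lin_reg_final.predict(X_new_poly)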

# model for sklearn poly model is in fact a pair (linear_model, poly_reg), where
# poly_reg is a transformer that creates polynomial terms (like x^2) from the original
@@ -443,7 +520,7 @@ def _sklearn_train_multi_response(self, get_model_file_prefix, feat_names, resp_
formula_report_file = get_model_file_prefix(None, self._algo_name_local2global(algo)) + '_formula.txt'
model_formula = self._instPolyTerms.poly_model_to_term_single_response(feat_names, resp_names, linear_model.coef_,
poly_reg.powers_, resp_id, True, formula_report_file)
#print('poly model computed', (linear_model, linear_model.coef_, poly_reg, poly_reg.powers_))
# print('poly model computed', (linear_model, linear_model.coef_, poly_reg, poly_reg.powers_))
return linear_model, poly_reg #, X_train, X_test
else:
raise Exception('Unsupported model type ' + str(algo) + ' in function tree_main')
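The formula export above works because each row of poly_reg.powers_ holds the exponents of the input features for one polynomial term, and the matching entry of linear_model.coef_ is that term's coefficient. A rough sketch of this pairing, assuming a multi-output fit where coef_ is 2-D; this is illustrative only and not the actual poly_model_to_term_single_response implementation:

# pair each coefficient with its exponent row to spell out the polynomial
terms = []
for coef, powers in zip(linear_model.coef_[resp_id], poly_reg.powers_):
    monomial = '*'.join(f'{f}^{p}' for f, p in zip(feat_names, powers) if p > 0)
    terms.append(f'{coef}*{monomial}' if monomial else str(coef))
formula = ' + '.join(terms)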
@@ -453,7 +530,7 @@ def sklearn_main(self, get_model_file_prefix, feat_names_dict, resp_names, algo,
seed, sample_weights_vect, model_per_response):
# train a separate model for each response, pack into a dictionary with response names
# as keys and the corresponding models as values
#print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist())
# print('sklearn_main: feat_names_dict', feat_names_dict, 'X_train cols', X_train.columns.tolist())
if model_per_response:
model = {}
for rn in resp_names: