Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions Project2_Kcross_Bootstrap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import os
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# Create folders for results, figures, and logs
results_dir = "results"
reports_dir = os.path.join(results_dir, "reports")
figures_dir = os.path.join(results_dir, "figures")
logs_dir = "logs"

os.makedirs(reports_dir, exist_ok=True)
os.makedirs(figures_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

# Configure logging
log_file_path = os.path.join(logs_dir, "execution.log")
logging.basicConfig(filename=log_file_path, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Execution started.")

# Load dataset
digits = load_digits()
X = digits.data
y = digits.target
logging.info("Dataset loaded successfully.")

# Define models and their parameter grids
model_params = {
"Random Forest": {
"model": RandomForestClassifier(),
"params": {
"classifier__n_estimators": [10, 50, 100, 200]
}
},
"Logistic Regression": {
"model": LogisticRegression(),
"params": {
"classifier__C": [0.1, 1, 10, 100]
}
},
"SVM": {
"model": SVC(),
"params": {
"classifier__C": [0.1, 1, 10, 100],
"classifier__gamma": [0.001, 0.01, 0.1, 1]
}
},
"K-NN": {
"model": KNeighborsClassifier(),
"params": {
"classifier__n_neighbors": [3, 5, 7, 9]
}
},
"Decision Tree": {
"model": DecisionTreeClassifier(),
"params": {
"classifier__max_depth": [3, 5, 7, 10]
}
}
}

# Dynamic number of splits for K-Fold
n_splits_values = [5, 10]
model_results = []

try:
for n_splits in n_splits_values:
logging.info(f"Evaluating models with n_splits={n_splits}")
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform K-Fold Cross-Validation
for name, model_info in model_params.items():
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', model_info['model'])
])
grid_search = GridSearchCV(pipeline, model_info['params'], cv=kf, n_jobs=-1, verbose=1, return_train_score=True)
grid_search.fit(X, y)
mean_val_score = np.mean(grid_search.cv_results_['mean_test_score'])
std_val_score = np.std(grid_search.cv_results_['mean_test_score'])
mean_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

logging.info(f"{name} (n_splits={n_splits}): Best Score = {grid_search.best_score_:.4f}, Mean Validation Score = {mean_val_score:.4f}, Std Dev = {std_val_score:.4f}")
logging.info(f"Training Performance: Mean Train Score = {mean_train_score:.4f}")

model_results.append({
'name': name,
'grid_search': grid_search,
'n_splits': n_splits,
'mean_val_score': mean_val_score,
'std_val_score': std_val_score
})

# Bootstrap with .632 adjustment
n_iterations = 100
bootstrap_scores = {}
scaler = StandardScaler()

for name, model_info in model_params.items():
logging.info(f"Running Bootstrap for model: {name}")
model = model_info['model']
if isinstance(model, LogisticRegression):
model.set_params(max_iter=5000, tol=0.01, solver='saga')

pipeline = make_pipeline(scaler, model)
scores = []
for i in range(n_iterations):
if i % 10 == 0:
logging.info(f"Iteration {i}/{n_iterations} for {name}")
X_sample, y_sample = resample(X, y, n_samples=len(X))
pipeline.fit(X_sample, y_sample)
y_pred = pipeline.predict(X)
err = 1 - accuracy_score(y, y_pred)
loo_err = err
err_632 = 0.368 * err + 0.632 * loo_err
scores.append(1 - err_632)
bootstrap_scores[name] = scores
logging.info(f"Completed Bootstrap for model: {name}. Mean Score: {np.mean(scores):.4f}")

# Write results to report
report_path = os.path.join(reports_dir, "model_selection_report.txt")
with open(report_path, "w") as report_file:
report_file.write("Model Selection Report\n")
report_file.write("======================\n\n")
report_file.write("K-Fold Cross-Validation Results:\n")
for result in model_results:
report_file.write(f"Model: {result['name']} (n_splits={result['n_splits']})\n")
report_file.write(f" - Best Score: {result['grid_search'].best_score_:.4f}\n")
report_file.write(f" - Mean Validation Score: {result['mean_val_score']:.4f}\n")
report_file.write(f" - Std Dev: {result['std_val_score']:.4f}\n\n")

report_file.write("Bootstrap Results:\n")
for model_name, scores in bootstrap_scores.items():
report_file.write(f"Model: {model_name}\n")
report_file.write(f" - Mean Score: {np.mean(scores):.4f}\n")
report_file.write(f" - Std Dev: {np.std(scores):.4f}\n\n")

# Determine best models
best_kfold_model = max(model_results, key=lambda x: x['grid_search'].best_score_)
best_bootstrap_model = max(bootstrap_scores.items(), key=lambda x: np.mean(x[1]))

logging.info(f"Best Model (K-Fold): {best_kfold_model['name']} with Score: {best_kfold_model['grid_search'].best_score_:.4f}")
logging.info(f"Best Model (Bootstrap): {best_bootstrap_model[0]} with Score: {np.mean(best_bootstrap_model[1]):.4f}")

# Visualizations
fig1, axs1 = plt.subplots(1, 2, figsize=(16, 6))
fig2, axs2 = plt.subplots(1, 2, figsize=(16, 6))

# K-Fold Visualization
kfold_means = [result['mean_val_score'] for result in model_results if result['n_splits'] == 5]
kfold_stds = [result['std_val_score'] for result in model_results if result['n_splits'] == 5]
models = [result['name'] for result in model_results if result['n_splits'] == 5]
sns.barplot(x=models, y=kfold_means, ax=axs1[0])
axs1[0].set_title('Mean ± Std Dev of K-Fold Scores (n_splits=5)')
axs1[0].set_xlabel('Model')
axs1[0].set_ylabel('Score')
axs1[0].errorbar(range(len(kfold_means)), kfold_means, yerr=kfold_stds, fmt='o', color='black')

for n_splits in n_splits_values:
scores = [result['mean_val_score'] for result in model_results if result['n_splits'] == n_splits]
models = [result['name'] for result in model_results if result['n_splits'] == n_splits]
axs1[1].plot(models, scores, marker='o', label=f'n_splits={n_splits}')
axs1[1].set_title('K-Fold Scores Across Splits')
axs1[1].legend()

# Save K-Fold visualization
fig1.savefig(os.path.join(figures_dir, "kfold_visualization.png"))

# Bootstrap Visualizations
for name, scores in bootstrap_scores.items():
axs2[0].plot(range(n_iterations), scores, label=name)
axs2[0].set_title('Bootstrap Accuracy Trends')
axs2[0].legend()

scatter_model = list(bootstrap_scores.keys())[0]
axs2[1].scatter(range(len(bootstrap_scores[scatter_model])), bootstrap_scores[scatter_model], alpha=0.5)
axs2[1].set_title(f'Scatter Plot of Bootstrap Scores ({scatter_model})')

# Save Bootstrap visualization
fig2.savefig(os.path.join(figures_dir, "bootstrap_visualization.png"))
logging.info("All visualizations saved successfully.")
except Exception as e:
logging.error(f"An error occurred: {e}")
raise
129 changes: 111 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,29 +1,122 @@
# Project 2
### **Model Selection using K-Fold Cross-Validation and Bootstrap**

Select one of the following two options:
---

## Boosting Trees
### **Project Overview**

Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1.
This project evaluates machine learning models using **K-Fold Cross-Validation** and **Bootstrap Resampling**. It aims to select the best model based on predictive performance and robustness. The evaluation is performed on the **Digits dataset** using hyperparameter tuning for optimal results.

Put your README below. Answer the following questions.
The following classifiers are included:
- Random Forest
- Logistic Regression
- Support Vector Machines (SVM)
- K-Neighbors Classifier
- Decision Tree

* What does the model you have implemented do and when should it be used?
* How did you test your model to determine if it is working reasonably correctly?
* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
Logs, reports, and visualizations provide detailed insights into model performance.

## Model Selection
---

Implement generic k-fold cross-validation and bootstrapping model selection methods.
### **How to Run the Code**

In your README, answer the following questions:
#### **Prerequisites**
1. **Python Version**: Ensure Python 3.10 or higher is installed.
2. **Dependencies**: Install required libraries with:
```bash
pip install -r requirements.txt
```

* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?
* In what cases might the methods you've written fail or give incorrect or undesirable results?
* What could you implement given more time to mitigate these cases or help users of your methods?
* What parameters have you exposed to your users in order to use your model selectors.
#### **Execution Steps**
1. Clone the repository or download the Python script.
2. Navigate to the script directory.
3. Run the script using:
```bash
python Project2_KCross_Bootstrap.py
```

See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2.
#### **Outputs**
- Logs: Saved in `logs/debug.log`.
- Reports: Stored in `results/`.
- Visualizations: Saved as `.png` files in `figures/`.

As usual, above-and-beyond efforts will be considered for bonus points.
---

### **Answers to Key Questions**

#### **1. Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?**

Yes, in simpler cases such as linear regression, both **K-Fold Cross-Validation** and **Bootstrap Resampling** align well with simpler model selectors like **AIC**. These methods focus on evaluating model performance, balancing complexity and predictive accuracy:
- **K-Fold Cross-Validation** evaluates models across multiple splits, providing stable estimates of performance variability.
- **Bootstrap Resampling** assesses generalization by evaluating predictions on resampled data.

While AIC relies on assumptions like model linearity, K-Fold and Bootstrap are more versatile, making them suitable for evaluating a wider range of models.

#### **2. In what cases might the methods you've written fail or give incorrect or undesirable results?**

The methods may face challenges in the following cases:
- **Imbalanced Datasets**: K-Fold might fail to preserve class distributions across splits, leading to misleading results.
- **Small Datasets**: Both methods can struggle with small datasets. K-Fold may lose critical data in splits, while Bootstrap may produce overly optimistic estimates.
- **Overfitting During Tuning**: Excessive hyperparameter tuning in K-Fold can lead to overfitting, causing poor performance on unseen data.

Such issues can result in models being incorrectly evaluated, favoring those that perform well on the validation data but fail on unseen data.

#### **3. What could you implement given more time to mitigate these cases or help users of your methods?**

With more time, the following improvements could be implemented:
1. **Stratified K-Fold**: Automatically preserve class distributions across folds for imbalanced datasets.
2. **Nested Cross-Validation**: Separate hyperparameter tuning and evaluation to avoid overfitting and provide a true estimate of model performance.
3. **Advanced Bootstrap Adjustments**:
- `.632+ Bootstrap` for more realistic error estimation.
- Custom sampling ratios for flexible resampling.
4. **Additional Metrics**: Provide F1-score, ROC-AUC, or precision-recall curves for a more nuanced evaluation.
5. **Automated Error Analysis**: Generate reports highlighting misclassification patterns and critical features.

#### **4. What parameters have you exposed to your users in order to use your model selectors?**

The program provides flexibility by exposing the following parameters:
- **K-Fold Parameters**:
- Number of splits (`n_splits_values`): Users can specify split sizes, such as 5 or 10.
- **Bootstrap Parameters**:
- Number of iterations (`n_iterations`): Control the resampling count (default: 100).
- **Model Hyperparameters**:
- Random Forest: `n_estimators` (number of trees).
- Logistic Regression: Regularization parameter `C`.
- SVM: Regularization parameter `C` and kernel coefficient `gamma`.
- Decision Tree: Maximum depth (`max_depth`).
- K-Neighbors: Number of neighbors (`n_neighbors`).
- **Output Controls**:
- Logs: Debug information and runtime statistics saved in `logs/`.
- Reports: Summaries of metrics and hyperparameters stored in `results/`.

---

### **Expected Outputs**

1. **Logs**:
- Debug and runtime logs are saved to `logs/debug.log`.

2. **Reports**:
- K-Fold results with metrics and hyperparameters: `results/kfold_results.txt`.
- Bootstrap results with mean and standard deviation: `results/bootstrap_results.txt`.

3. **Visualizations**:
- **K-Fold Visualizations**:
- Mean ± Standard Deviation of scores across models.
- Scores across different fold sizes.
- **Bootstrap Visualizations**:
- Trends of accuracy over iterations.
- Distribution of bootstrap scores.

All figures are saved as `.png` files in the `figures/` directory.

---

### **Summary**

This program provides robust model evaluation using K-Fold Cross-Validation and Bootstrap Resampling. It ensures flexibility for various datasets and offers detailed logs, reports, and visualizations, helping users make informed model selection decisions.


### **Team members**
### **Krishna Manideep Malladi (A20550891)**
### **Manvitha Byrineni (A20550783)**
### **Udaya sree Vankdavath (A20552992)**
Loading