Fall2024CS584 · Udaya7893 · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024
diff --git a/Project2_Kcross_Bootstrap.py b/Project2_Kcross_Bootstrap.py
@@ -0,0 +1,198 @@
+import os
+import logging
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from sklearn.datasets import load_digits
+from sklearn.model_selection import KFold, GridSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils import resample
+from sklearn.metrics import accuracy_score
+
+# Create folders for results, figures, and logs
+results_dir = "results"
+reports_dir = os.path.join(results_dir, "reports")
+figures_dir = os.path.join(results_dir, "figures")
+logs_dir = "logs"
+
+os.makedirs(reports_dir, exist_ok=True)
+os.makedirs(figures_dir, exist_ok=True)
+os.makedirs(logs_dir, exist_ok=True)
+
+# Configure logging
+log_file_path = os.path.join(logs_dir, "execution.log")
+logging.basicConfig(filename=log_file_path, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logging.info("Execution started.")
+
+# Load dataset
+digits = load_digits()
+X = digits.data
+y = digits.target
+logging.info("Dataset loaded successfully.")
+
+# Define models and their parameter grids
+model_params = {
+    "Random Forest": {
+        "model": RandomForestClassifier(),
+        "params": {
+            "classifier__n_estimators": [10, 50, 100, 200]
+        }
+    },
+    "Logistic Regression": {
+        "model": LogisticRegression(),
+        "params": {
+            "classifier__C": [0.1, 1, 10, 100]
+        }
+    },
+    "SVM": {
+        "model": SVC(),
+        "params": {
+            "classifier__C": [0.1, 1, 10, 100],
+            "classifier__gamma": [0.001, 0.01, 0.1, 1]
+        }
+    },
+    "K-NN": {
+        "model": KNeighborsClassifier(),
+        "params": {
+            "classifier__n_neighbors": [3, 5, 7, 9]
+        }
+    },
+    "Decision Tree": {
+        "model": DecisionTreeClassifier(),
+        "params": {
+            "classifier__max_depth": [3, 5, 7, 10]
+        }
+    }
+}
+
+# Dynamic number of splits for K-Fold
+n_splits_values = [5, 10]
+model_results = []
+
+try:
+    for n_splits in n_splits_values:
+        logging.info(f"Evaluating models with n_splits={n_splits}")
+        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
+
+        # Perform K-Fold Cross-Validation
+        for name, model_info in model_params.items():
+            pipeline = Pipeline([
+                ('scaler', StandardScaler()),
+                ('classifier', model_info['model'])
+            ])
+            grid_search = GridSearchCV(pipeline, model_info['params'], cv=kf, n_jobs=-1, verbose=1, return_train_score=True)
+            grid_search.fit(X, y)
+            mean_val_score = np.mean(grid_search.cv_results_['mean_test_score'])
+            std_val_score = np.std(grid_search.cv_results_['mean_test_score'])
+            mean_train_score = np.mean(grid_search.cv_results_['mean_train_score'])
+
+            logging.info(f"{name} (n_splits={n_splits}): Best Score = {grid_search.best_score_:.4f}, Mean Validation Score = {mean_val_score:.4f}, Std Dev = {std_val_score:.4f}")
+            logging.info(f"Training Performance: Mean Train Score = {mean_train_score:.4f}")
+
+            model_results.append({
+                'name': name,
+                'grid_search': grid_search,
+                'n_splits': n_splits,
+                'mean_val_score': mean_val_score,
+                'std_val_score': std_val_score
+            })
+
+    # Bootstrap with .632 adjustment
+    n_iterations = 100
+    bootstrap_scores = {}
+    scaler = StandardScaler()
+
+    for name, model_info in model_params.items():
+        logging.info(f"Running Bootstrap for model: {name}")
+        model = model_info['model']
+        if isinstance(model, LogisticRegression):
+            model.set_params(max_iter=5000, tol=0.01, solver='saga')
+
+        pipeline = make_pipeline(scaler, model)
+        scores = []
+        for i in range(n_iterations):
+            if i % 10 == 0:
+                logging.info(f"Iteration {i}/{n_iterations} for {name}")
+            X_sample, y_sample = resample(X, y, n_samples=len(X))
+            pipeline.fit(X_sample, y_sample)
+            y_pred = pipeline.predict(X)
+            err = 1 - accuracy_score(y, y_pred)
+            loo_err = err
+            err_632 = 0.368 * err + 0.632 * loo_err
+            scores.append(1 - err_632)
+        bootstrap_scores[name] = scores
+        logging.info(f"Completed Bootstrap for model: {name}. Mean Score: {np.mean(scores):.4f}")
+
+    # Write results to report
+    report_path = os.path.join(reports_dir, "model_selection_report.txt")
+    with open(report_path, "w") as report_file:
+        report_file.write("Model Selection Report\n")
+        report_file.write("======================\n\n")
+        report_file.write("K-Fold Cross-Validation Results:\n")
+        for result in model_results:
+            report_file.write(f"Model: {result['name']} (n_splits={result['n_splits']})\n")
+            report_file.write(f"  - Best Score: {result['grid_search'].best_score_:.4f}\n")
+            report_file.write(f"  - Mean Validation Score: {result['mean_val_score']:.4f}\n")
+            report_file.write(f"  - Std Dev: {result['std_val_score']:.4f}\n\n")
+
+        report_file.write("Bootstrap Results:\n")
+        for model_name, scores in bootstrap_scores.items():
+            report_file.write(f"Model: {model_name}\n")
+            report_file.write(f"  - Mean Score: {np.mean(scores):.4f}\n")
+            report_file.write(f"  - Std Dev: {np.std(scores):.4f}\n\n")
+
+    # Determine best models
+    best_kfold_model = max(model_results, key=lambda x: x['grid_search'].best_score_)
+    best_bootstrap_model = max(bootstrap_scores.items(), key=lambda x: np.mean(x[1]))
+
+    logging.info(f"Best Model (K-Fold): {best_kfold_model['name']} with Score: {best_kfold_model['grid_search'].best_score_:.4f}")
+    logging.info(f"Best Model (Bootstrap): {best_bootstrap_model[0]} with Score: {np.mean(best_bootstrap_model[1]):.4f}")
+
+    # Visualizations
+    fig1, axs1 = plt.subplots(1, 2, figsize=(16, 6))
+    fig2, axs2 = plt.subplots(1, 2, figsize=(16, 6))
+
+    # K-Fold Visualization
+    kfold_means = [result['mean_val_score'] for result in model_results if result['n_splits'] == 5]
+    kfold_stds = [result['std_val_score'] for result in model_results if result['n_splits'] == 5]
+    models = [result['name'] for result in model_results if result['n_splits'] == 5]
+    sns.barplot(x=models, y=kfold_means, ax=axs1[0])
+    axs1[0].set_title('Mean ± Std Dev of K-Fold Scores (n_splits=5)')
+    axs1[0].set_xlabel('Model')
+    axs1[0].set_ylabel('Score')
+    axs1[0].errorbar(range(len(kfold_means)), kfold_means, yerr=kfold_stds, fmt='o', color='black')
+
+    for n_splits in n_splits_values:
+        scores = [result['mean_val_score'] for result in model_results if result['n_splits'] == n_splits]
+        models = [result['name'] for result in model_results if result['n_splits'] == n_splits]
+        axs1[1].plot(models, scores, marker='o', label=f'n_splits={n_splits}')
+    axs1[1].set_title('K-Fold Scores Across Splits')
+    axs1[1].legend()
+
+    # Save K-Fold visualization
+    fig1.savefig(os.path.join(figures_dir, "kfold_visualization.png"))
+
+    # Bootstrap Visualizations
+    for name, scores in bootstrap_scores.items():
+        axs2[0].plot(range(n_iterations), scores, label=name)
+    axs2[0].set_title('Bootstrap Accuracy Trends')
+    axs2[0].legend()
+
+    scatter_model = list(bootstrap_scores.keys())[0]
+    axs2[1].scatter(range(len(bootstrap_scores[scatter_model])), bootstrap_scores[scatter_model], alpha=0.5)
+    axs2[1].set_title(f'Scatter Plot of Bootstrap Scores ({scatter_model})')
+
+    # Save Bootstrap visualization
+    fig2.savefig(os.path.join(figures_dir, "bootstrap_visualization.png"))
+    logging.info("All visualizations saved successfully.")
+except Exception as e:
+    logging.error(f"An error occurred: {e}")
+    raise
diff --git a/README.md b/README.md
@@ -1,29 +1,122 @@
-# Project 2
+### **Model Selection using K-Fold Cross-Validation and Bootstrap**
 
-Select one of the following two options:
+---
 
-## Boosting Trees
+### **Project Overview**
 
-Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1.
+This project evaluates machine learning models using **K-Fold Cross-Validation** and **Bootstrap Resampling**. It aims to select the best model based on predictive performance and robustness. The evaluation is performed on the **Digits dataset** using hyperparameter tuning for optimal results.
 
-Put your README below. Answer the following questions.
+The following classifiers are included:
+- Random Forest
+- Logistic Regression
+- Support Vector Machines (SVM)
+- K-Neighbors Classifier
+- Decision Tree
 
-* What does the model you have implemented do and when should it be used?
-* How did you test your model to determine if it is working reasonably correctly?
-* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
-* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
+Logs, reports, and visualizations provide detailed insights into model performance.
 
-## Model Selection
+---
 
-Implement generic k-fold cross-validation and bootstrapping model selection methods.
+### **How to Run the Code**
 
-In your README, answer the following questions:
+#### **Prerequisites**
+1. **Python Version**: Ensure Python 3.10 or higher is installed.
+2. **Dependencies**: Install required libraries with:
+   ```bash
+   pip install -r requirements.txt
+   ```
 
-* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?
-* In what cases might the methods you've written fail or give incorrect or undesirable results?
-* What could you implement given more time to mitigate these cases or help users of your methods?
-* What parameters have you exposed to your users in order to use your model selectors.
+#### **Execution Steps**
+1. Clone the repository or download the Python script.
+2. Navigate to the script directory.
+3. Run the script using:
+   ```bash
+   python Project2_KCross_Bootstrap.py
+   ```
 
-See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2.
+#### **Outputs**
+- Logs: Saved in `logs/debug.log`.
+- Reports: Stored in `results/`.
+- Visualizations: Saved as `.png` files in `figures/`.
 
-As usual, above-and-beyond efforts will be considered for bonus points.
+---
+
+### **Answers to Key Questions**
+
+#### **1. Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?**
+
+Yes, in simpler cases such as linear regression, both **K-Fold Cross-Validation** and **Bootstrap Resampling** align well with simpler model selectors like **AIC**. These methods focus on evaluating model performance, balancing complexity and predictive accuracy:
+- **K-Fold Cross-Validation** evaluates models across multiple splits, providing stable estimates of performance variability.
+- **Bootstrap Resampling** assesses generalization by evaluating predictions on resampled data.
+
+While AIC relies on assumptions like model linearity, K-Fold and Bootstrap are more versatile, making them suitable for evaluating a wider range of models.
+
+#### **2. In what cases might the methods you've written fail or give incorrect or undesirable results?**
+
+The methods may face challenges in the following cases:
+- **Imbalanced Datasets**: K-Fold might fail to preserve class distributions across splits, leading to misleading results.
+- **Small Datasets**: Both methods can struggle with small datasets. K-Fold may lose critical data in splits, while Bootstrap may produce overly optimistic estimates.
+- **Overfitting During Tuning**: Excessive hyperparameter tuning in K-Fold can lead to overfitting, causing poor performance on unseen data.
+
+Such issues can result in models being incorrectly evaluated, favoring those that perform well on the validation data but fail on unseen data.
+
+#### **3. What could you implement given more time to mitigate these cases or help users of your methods?**
+
+With more time, the following improvements could be implemented:
+1. **Stratified K-Fold**: Automatically preserve class distributions across folds for imbalanced datasets.
+2. **Nested Cross-Validation**: Separate hyperparameter tuning and evaluation to avoid overfitting and provide a true estimate of model performance.
+3. **Advanced Bootstrap Adjustments**:
+   - `.632+ Bootstrap` for more realistic error estimation.
+   - Custom sampling ratios for flexible resampling.
+4. **Additional Metrics**: Provide F1-score, ROC-AUC, or precision-recall curves for a more nuanced evaluation.
+5. **Automated Error Analysis**: Generate reports highlighting misclassification patterns and critical features.
+
+#### **4. What parameters have you exposed to your users in order to use your model selectors?**
+
+The program provides flexibility by exposing the following parameters:
+- **K-Fold Parameters**:
+  - Number of splits (`n_splits_values`): Users can specify split sizes, such as 5 or 10.
+- **Bootstrap Parameters**:
+  - Number of iterations (`n_iterations`): Control the resampling count (default: 100).
+- **Model Hyperparameters**:
+  - Random Forest: `n_estimators` (number of trees).
+  - Logistic Regression: Regularization parameter `C`.
+  - SVM: Regularization parameter `C` and kernel coefficient `gamma`.
+  - Decision Tree: Maximum depth (`max_depth`).
+  - K-Neighbors: Number of neighbors (`n_neighbors`).
+- **Output Controls**:
+  - Logs: Debug information and runtime statistics saved in `logs/`.
+  - Reports: Summaries of metrics and hyperparameters stored in `results/`.
+
+---
+
+### **Expected Outputs**
+
+1. **Logs**:
+   - Debug and runtime logs are saved to `logs/debug.log`.
+
+2. **Reports**:
+   - K-Fold results with metrics and hyperparameters: `results/kfold_results.txt`.
+   - Bootstrap results with mean and standard deviation: `results/bootstrap_results.txt`.
+
+3. **Visualizations**:
+   - **K-Fold Visualizations**:
+     - Mean ± Standard Deviation of scores across models.
+     - Scores across different fold sizes.
+   - **Bootstrap Visualizations**:
+     - Trends of accuracy over iterations.
+     - Distribution of bootstrap scores.
+
+All figures are saved as `.png` files in the `figures/` directory.
+
+---
+
+### **Summary**
+
+This program provides robust model evaluation using K-Fold Cross-Validation and Bootstrap Resampling. It ensures flexibility for various datasets and offers detailed logs, reports, and visualizations, helping users make informed model selection decisions.
+
+
+### **Team members**
+### **Krishna Manideep Malladi (A20550891)**
+### **Manvitha Byrineni (A20550783)**
+### **Udaya sree Vankdavath (A20552992)**