From 397d7e4ca63d139eadadd01f70263aa4ca7f0a8b Mon Sep 17 00:00:00 2001 From: ceciliabetek Date: Wed, 11 Mar 2026 20:05:30 +0100 Subject: [PATCH] Update lab-hyper-tuning.ipynb --- lab-hyper-tuning.ipynb | 419 ++++++++--------------------------------- 1 file changed, 77 insertions(+), 342 deletions(-) diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..56486fa 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -1,343 +1,78 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LAB | Hyperparameter Tuning" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Load the data**\n", - "\n", - "Finally step in order to maximize the performance on your Spaceship Titanic model.\n", - "\n", - "The data can be found here:\n", - "\n", - "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\n", - "\n", - "Metadata\n", - "\n", - "https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So far we've been training and evaluating models with default values for hyperparameters.\n", - "\n", - "Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#Libraries\n", - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PassengerIdHomePlanetCryoSleepCabinDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckNameTransported
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", - "
" - ], - "text/plain": [ - " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", - "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", - "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", - "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", - "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", - "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", - "\n", - " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", - "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", - "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", - "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", - "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", - "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", - "\n", - " Transported \n", - "0 False \n", - "1 True \n", - "2 False \n", - "3 False \n", - "4 True " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spaceship = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\")\n", - "spaceship.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now perform the same as before:\n", - "- Feature Scaling\n", - "- Feature Selection\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Evaluate your model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Grid/Random Search**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For this lab we will use Grid Search." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Define hyperparameters to fine tune." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Run Grid Search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Evaluate your model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 +# Libraries +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.preprocessing import StandardScaler +from sklearn.feature_selection import SelectKBest, f_classif +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + +# 1️⃣ Load 
# Load data — Spaceship Titanic dataset (fetched over HTTP)
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")

# Drop identifier-like columns that carry no predictive signal
spaceship = spaceship.drop(["PassengerId", "Name", "Cabin"], axis=1)

# Convert boolean-ish columns to numeric; astype(float) keeps NaN so the
# missing values can be imputed together with the other numeric columns below
spaceship["CryoSleep"] = spaceship["CryoSleep"].astype(float)
spaceship["VIP"] = spaceship["VIP"].astype(float)
spaceship["Transported"] = spaceship["Transported"].astype(int)

# Handle missing values per dtype.
# BUG FIX: a blanket fillna(0) wrote the integer 0 into the string columns
# HomePlanet / Destination, which get_dummies then turned into a spurious
# mixed-type "0" category. Fill numeric columns with 0 (as before) and
# categorical columns with an explicit "Unknown" label instead.
numeric_cols = spaceship.select_dtypes(include="number").columns
spaceship[numeric_cols] = spaceship[numeric_cols].fillna(0)
categorical_cols = ["HomePlanet", "Destination"]
spaceship[categorical_cols] = spaceship[categorical_cols].fillna("Unknown")

# One-hot encoding for categorical variables (drop_first avoids collinearity)
spaceship = pd.get_dummies(spaceship, columns=["HomePlanet", "Destination"], drop_first=True)

# Split features / target
X = spaceship.drop("Transported", axis=1)
y = spaceship["Transported"]

# FIX: stratify on y so the ~balanced class ratio is preserved in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling — fit on the training split only to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection: keep the 10 features most associated with the target
# (ANOVA F-test); fit on train only, then apply the same mask to test
selector = SelectKBest(score_func=f_classif, k=10)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Baseline model with default hyperparameters
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)

# Baseline evaluation on the held-out test split
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Baseline accuracy:", accuracy)

# Hyperparameter grid for the random forest
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Grid Search: 5-fold CV on the training split, all CPU cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train_selected, y_train)

print("Best parameters:", grid_search.best_params_)

# Evaluate the tuned model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Best model accuracy:", accuracy)