From 5eb7e74cc5a2de17e504514cf2a5e22318342651 Mon Sep 17 00:00:00 2001 From: GeoTheLeo Date: Sun, 8 Mar 2026 17:23:59 +0100 Subject: [PATCH] Hyperparameter tuning with GridSearchCV Awesomeness --- .gitignore | 49 + lab-hyper-tuning.ipynb | 6154 +++++++++++++++++++++++++++++++++++++++- requirements.txt | Bin 0 -> 4048 bytes 3 files changed, 6143 insertions(+), 60 deletions(-) create mode 100644 .gitignore create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cfd5ae3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,49 @@ +# ========================= +# Virtual environments +# ========================= +venv/ +env/ +ENV/ + +# ========================= +# Python cache +# ========================= +__pycache__/ +*.pyc +*.pyo +*.pyd + +# ========================= +# Jupyter +# ========================= +.ipynb_checkpoints/ + +# ========================= +# VSCode +# ========================= +.vscode/ + +# ========================= +# OS files +# ========================= +.DS_Store +Thumbs.db + +# ========================= +# Data files (optional) +# ========================= +data/ +*.csv +*.xlsx + +# ========================= +# Model artifacts +# ========================= +*.pkl +*.joblib +*.model + +# ========================= +# Logs +# ========================= +*.log \ No newline at end of file diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..1df0c22 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -35,19 +35,190 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#Libraries\n", + "# Libraries\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdHomePlanetCryoSleepCabinDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckNameTransported
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", + "
" + ], + "text/plain": [ + " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", + "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", + "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", + "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", + "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", + "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", + "\n", + " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", + "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", + "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", + "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", + "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", + "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", + "\n", + " Transported \n", + "0 False \n", + "1 True \n", + "2 False \n", + "3 False \n", + "4 True " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -200,7 +371,7 @@ "4 True " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -221,11 +392,136 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\profe\\AppData\\Local\\Temp\\ipykernel_12764\\3442894969.py:50: Pandas4Warning: For backward compatibility, 'str' dtypes are included by select_dtypes when 'object' dtype is specified. This behavior is deprecated and will be removed in a future version. Explicitly pass 'str' to `include` to select them, or to `exclude` to remove them and silence this warning.\n", + "See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.\n", + " cat_cols = X_train.select_dtypes(include=[\"object\", \"bool\"]).columns\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Accuracy: 0.7682576193214491\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "# ==============================\n", + "# Libraries\n", + "# ==============================\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "\n", + "# ==============================\n", + "# Load Data\n", + "# ==============================\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "\n", + "# ==============================\n", + "# Separate Features and Target\n", + "# ==============================\n", + "\n", + "X = df.drop(\"Transported\", axis=1)\n", + "y = df[\"Transported\"]\n", + "\n", + "\n", + "# ==============================\n", + "# Train Test Split\n", + "# ==============================\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "\n", + "# ==============================\n", + "# Identify Column Types\n", + "# ==============================\n", + "\n", + "num_cols = X_train.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "cat_cols = X_train.select_dtypes(include=[\"object\", \"bool\"]).columns\n", + "\n", + "\n", + "# ==============================\n", + "# Numerical Pipeline\n", + "# ==============================\n", + "\n", + "num_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Categorical Pipeline\n", + "# ==============================\n", + "\n", + "cat_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"encoder\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Combine Preprocessing\n", + "# ==============================\n", + "\n", + "preprocessor = ColumnTransformer([\n", + " (\"num\", num_pipeline, num_cols),\n", + " (\"cat\", cat_pipeline, cat_cols)\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Full ML Pipeline\n", + "# ==============================\n", + "\n", + "pipeline = Pipeline([\n", + " (\"preprocessing\", preprocessor),\n", + " (\"feature_selection\", SelectKBest(score_func=f_classif, k=20)),\n", + " (\"model\", RandomForestClassifier(random_state=42))\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Train Model\n", + "# ==============================\n", + "\n", + "pipeline.fit(X_train, y_train)\n", + "\n", + "\n", + "# ==============================\n", + "# Evaluate\n", + "# ==============================\n", + "\n", + "preds = pipeline.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Model Accuracy:\", accuracy)" ] }, { @@ -241,68 +537,1873 @@ "metadata": {}, "outputs": [], "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Evaluate your model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Grid/Random Search**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For this lab we will use Grid Search." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Define hyperparameters to fine tune." + "#your code here\n", + "# define the parameter grid first\n", + "param_grid = {\n", + " \"feature_selection__k\": [10, 20, 30],\n", + " \"model__n_estimators\": [100, 200],\n", + " \"model__max_depth\": [5, 10, None],\n", + " \"model__min_samples_split\": [2, 5],\n", + " \"model__min_samples_leaf\": [1, 2]\n", + "}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Run Grid Search" + "# Run GridSearchCV\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + " pipeline,\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit the Grid Search\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Parameters:\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}\n" + ] + } + ], + "source": [ + "# Best Hyperparameters\n", + "print(\"Best Parameters:\")\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Best Model\n", + "best_model = grid_search.best_estimator_" + ] }, { "cell_type": "markdown", @@ -313,15 +2414,3948 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned Model Accuracy: 0.7763082231167338\n" + ] + } + ], + "source": [ + "# Evaluate the Tuned Model\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Tuned Model Accuracy:\", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Grid/Random Search**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this lab we will use Grid Search." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Define hyperparameters to fine tune." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('model', RandomForestClassifier())" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#your code here\n", + "(\"model\", RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {\n", + " \n", + " \"feature_selection__k\": [10, 20, 30],\n", + "\n", + " \"model__n_estimators\": [100, 200, 300],\n", + "\n", + " \"model__max_depth\": [5, 10, None],\n", + "\n", + " \"model__min_samples_split\": [2, 5],\n", + "\n", + " \"model__min_samples_leaf\": [1, 2]\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + "\n", + " pipeline,\n", + " \n", + " param_grid,\n", + " \n", + " cv=5,\n", + " \n", + " scoring=\"accuracy\",\n", + " \n", + " n_jobs=-1\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200, 300]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200, 300]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit Grid Search\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters:\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n" + ] + } + ], + "source": [ + "# Retrieve Best Hyperparameters\n", + "print(\"Best parameters:\")\n", + "\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve Best Model\n", + "best_model = grid_search.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned Model Accuracy: 0.7740080506037953\n" + ] + } + ], + "source": [ + "# Evaluate Tuned Model\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Tuned Model Accuracy:\", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Run Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200, 300]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200, 300]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + " estimator=pipeline,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n" + ] + } + ], + "source": [ + "# Best Hyperparameters\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7903377829727591\n" + ] + } + ], + "source": [ + "# Best Cross-Validation Score\n", + "print(grid_search.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Best Model (full pipeline)\n", + "best_model = grid_search.best_estimator_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Evaluate your model" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy: 0.7740080506037953\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "print(\"Test Accuracy:\", accuracy_score(y_test, preds))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Hyperparameters\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n", + "\n", + "Best Cross Validation Accuracy\n", + "0.7903\n", + "\n", + "Test Accuracy\n", + "0.774\n", + "\n", + "Classification Report\n", + " precision recall f1-score support\n", + "\n", + " False 0.79 0.73 0.76 861\n", + " True 0.76 0.81 0.78 878\n", + "\n", + " accuracy 0.77 1739\n", + " macro avg 0.78 0.77 0.77 1739\n", + "weighted avg 0.78 0.77 0.77 1739\n", + "\n" + ] + } + ], + "source": [ + "# Nice to have a Professional Final Notebook Cell - why not!\n", + "# ==============================\n", + "# FINAL MODEL RESULTS\n", + "# ==============================\n", + "\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "# Best parameters found\n", + "print(\"Best Hyperparameters\")\n", + "print(grid_search.best_params_)\n", + "print()\n", + "\n", + "# Best cross-validation score\n", + "print(\"Best Cross Validation Accuracy\")\n", + "print(round(grid_search.best_score_, 4))\n", + "print()\n", + "\n", + "# Best model (entire pipeline)\n", + "best_model = grid_search.best_estimator_\n", + "\n", + "# Predictions\n", + "preds = best_model.predict(X_test)\n", + "\n", + "# Test accuracy\n", + "test_accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Test Accuracy\")\n", + "print(round(test_accuracy, 4))\n", + "print()\n", + "\n", + "# Detailed evaluation\n", + "print(\"Classification Report\")\n", + "print(classification_report(y_test, preds))" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -335,7 +6369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ded3f09394358fd0326b585ccc73245b357c6bcd GIT binary patch literal 4048 zcmai%%}yg%5QXnLQXa)QvS3fBv4Aw!D;0*_F@wd@N^WT4wS6WBH-XOIL0rYxQhWnzGhs6PmMf5n5;E zL;1euq%5T0mSue3l*4z=cJlHR(y8ncN7_A~r6${b{=SI(`X>B_yHZ583Suuoi|L{w>S zCpnt)&_x5AXj~2+caq#k%t(mF87*aVp^R^o^GnF75(v$PI-$`F)~Qw(T5zPY2_5hw z?qzwFC*zH;Px^B;SCU}Mc(?LNq(&xn*2)21+(ji|1z4|+d0CqnXOY>uJji2vjNIG> zWMitDMTiNfGud`Ym&!Tq~R#W5oczza`IFC(uoQI4`;0KSJzz{ZfiYPfp z{zPrx*69@;y+{66VFBOoX4*jp9ksEQK&Ua%WLf*VbAv zeW_Z&8u*{_PxPt+{eUZak8Acd|;)Z1jy3|5V7_%Nk#V z%wFC9M3~aOY~Z7hjMGQOEA?J81@^o5jO%-P6JfKL-9{dmf6WMsoM?0~F}qHVj0y2F zis-ZgcS5zwnXX_9-b@cNU?O;SytOxx&R_a-wb3>*l|!%XGqv_%7o9PJ<2bGr2bHj8 z?81mT3s0@0O|LuN|H}Vb-r*uKdo~coH2OY~KaFFoe?6dhA2nvq8FzBV>&NJ<)7ZAE zlKJErm~-$wt+)M?h<*sGUB>3}2NQY{;<&b-Puz zr@JowKI(byIkNYR0HNKGRj0FTGqVt1HMhOuZ4|{^a}}g>f7N)=;7&wdWS*6CeN(!V z6{gKLA|Fq2BSk#uBJu5wD$uEB5VeN`i8He~t&kCu${FwVoXyV`V2iS$-_8N7&jO2# z)2H@2MSp_*R(Wsq|2V4)opP)PS!RcL>w_5Rrj#ACk|pLE9X{r#IZaKPW6n88j2<3- zlYG1bv0!$SYx-d#d#_HS2Jm7W7bnmsU$zPzVV-$f=h@s8+&Oph%NF%y16@wnQ}lZ) zYyEJ0%8ENL;{K!jT)vcFBWp7y_a9!rsX7lKn+-Rmv<_xW0DidvSH=;+`G}WjJSbCl zRqicx$k=zpPPQ67;d7;V;Yz@Dn3T6VEYpEzPoB9F_b&hUjJY#D^GRpi-@Ux}E|r_o z2zEklDV>RaefxB76=SygX5b$6;P%b!3I<@S?k{Vm9s7Zaj_YfnxD+D4>pcDG8@ySR zm>=Y89B1wQd@n%5`#CvC1g612KP(fsBD$#un_~W#dP*;tGDtx1<>5;!)@|BvZQ(y z*(V|3{R?`DYQ^bE9@umFwv?UXJS*p{=dR<9?9Q_`T6IcW6K=2<{Lc1zkHSPx9V&5p SKVVz(R&