diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cfd5ae3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,49 @@ +# ========================= +# Virtual environments +# ========================= +venv/ +env/ +ENV/ + +# ========================= +# Python cache +# ========================= +__pycache__/ +*.pyc +*.pyo +*.pyd + +# ========================= +# Jupyter +# ========================= +.ipynb_checkpoints/ + +# ========================= +# VSCode +# ========================= +.vscode/ + +# ========================= +# OS files +# ========================= +.DS_Store +Thumbs.db + +# ========================= +# Data files (optional) +# ========================= +data/ +*.csv +*.xlsx + +# ========================= +# Model artifacts +# ========================= +*.pkl +*.joblib +*.model + +# ========================= +# Logs +# ========================= +*.log \ No newline at end of file diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..1df0c22 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -35,19 +35,190 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#Libraries\n", + "# Libraries\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdHomePlanetCryoSleepCabinDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckNameTransported
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", + "
" + ], + "text/plain": [ + " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", + "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", + "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", + "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", + "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", + "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", + "\n", + " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", + "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", + "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", + "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", + "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", + "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", + "\n", + " Transported \n", + "0 False \n", + "1 True \n", + "2 False \n", + "3 False \n", + "4 True " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -200,7 +371,7 @@ "4 True " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -221,11 +392,136 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\profe\\AppData\\Local\\Temp\\ipykernel_12764\\3442894969.py:50: Pandas4Warning: For backward compatibility, 'str' dtypes are included by select_dtypes when 'object' dtype is specified. This behavior is deprecated and will be removed in a future version. Explicitly pass 'str' to `include` to select them, or to `exclude` to remove them and silence this warning.\n", + "See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.\n", + " cat_cols = X_train.select_dtypes(include=[\"object\", \"bool\"]).columns\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Accuracy: 0.7682576193214491\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "# ==============================\n", + "# Libraries\n", + "# ==============================\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "\n", + "# ==============================\n", + "# Load Data\n", + "# ==============================\n", + "\n", + "url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + "\n", + "df = pd.read_csv(url)\n", + "\n", + "\n", + "# ==============================\n", + "# Separate Features and Target\n", + "# ==============================\n", + "\n", + "X = df.drop(\"Transported\", axis=1)\n", + "y = df[\"Transported\"]\n", + "\n", + "\n", + "# ==============================\n", + "# Train Test Split\n", + "# ==============================\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "\n", + "# ==============================\n", + "# Identify Column Types\n", + "# ==============================\n", + "\n", + "num_cols = X_train.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "cat_cols = X_train.select_dtypes(include=[\"object\", \"bool\"]).columns\n", + "\n", + "\n", + "# ==============================\n", + "# Numerical Pipeline\n", + "# ==============================\n", + "\n", + "num_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Categorical Pipeline\n", + "# ==============================\n", + "\n", + "cat_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"encoder\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Combine Preprocessing\n", + "# ==============================\n", + "\n", + "preprocessor = ColumnTransformer([\n", + " (\"num\", num_pipeline, num_cols),\n", + " (\"cat\", cat_pipeline, cat_cols)\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Full ML Pipeline\n", + "# ==============================\n", + "\n", + "pipeline = Pipeline([\n", + " (\"preprocessing\", preprocessor),\n", + " (\"feature_selection\", SelectKBest(score_func=f_classif, k=20)),\n", + " (\"model\", RandomForestClassifier(random_state=42))\n", + "])\n", + "\n", + "\n", + "# ==============================\n", + "# Train Model\n", + "# ==============================\n", + "\n", + "pipeline.fit(X_train, y_train)\n", + "\n", + "\n", + "# ==============================\n", + "# Evaluate\n", + "# ==============================\n", + "\n", + "preds = pipeline.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Model Accuracy:\", accuracy)" ] }, { @@ -241,68 +537,1873 @@ "metadata": {}, "outputs": [], "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Evaluate your model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Grid/Random Search**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For this lab we will use Grid Search." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Define hyperparameters to fine tune." + "#your code here\n", + "# define the parameter grid first\n", + "param_grid = {\n", + " \"feature_selection__k\": [10, 20, 30],\n", + " \"model__n_estimators\": [100, 200],\n", + " \"model__max_depth\": [5, 10, None],\n", + " \"model__min_samples_split\": [2, 5],\n", + " \"model__min_samples_leaf\": [1, 2]\n", + "}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "#your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Run Grid Search" + "# Run GridSearchCV\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + " pipeline,\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit the Grid Search\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Parameters:\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}\n" + ] + } + ], + "source": [ + "# Best Hyperparameters\n", + "print(\"Best Parameters:\")\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Best Model\n", + "best_model = grid_search.best_estimator_" + ] }, { "cell_type": "markdown", @@ -313,15 +2414,3948 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned Model Accuracy: 0.7763082231167338\n" + ] + } + ], + "source": [ + "# Evaluate the Tuned Model\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Tuned Model Accuracy:\", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Grid/Random Search**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this lab we will use Grid Search." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Define hyperparameters to fine tune." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('model', RandomForestClassifier())" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#your code here\n", + "(\"model\", RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = {\n", + " \n", + " \"feature_selection__k\": [10, 20, 30],\n", + "\n", + " \"model__n_estimators\": [100, 200, 300],\n", + "\n", + " \"model__max_depth\": [5, 10, None],\n", + "\n", + " \"model__min_samples_split\": [2, 5],\n", + "\n", + " \"model__min_samples_leaf\": [1, 2]\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + "\n", + " pipeline,\n", + " \n", + " param_grid,\n", + " \n", + " cv=5,\n", + " \n", + " scoring=\"accuracy\",\n", + " \n", + " n_jobs=-1\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200, 300]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200, 300]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit Grid Search\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters:\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n" + ] + } + ], + "source": [ + "# Retrieve Best Hyperparameters\n", + "print(\"Best parameters:\")\n", + "\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve Best Model\n", + "best_model = grid_search.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned Model Accuracy: 0.7740080506037953\n" + ] + } + ], + "source": [ + "# Evaluate Tuned Model\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Tuned Model Accuracy:\", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Run Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median')),\n",
+       "                                                                                         ('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='str'))])),\n",
+       "                                       ('feature_selection', SelectKBest(k=20)),\n",
+       "                                       ('model',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'feature_selection__k': [10, 20, 30],\n",
+       "                         'model__max_depth': [5, 10, None],\n",
+       "                         'model__min_samples_leaf': [1, 2],\n",
+       "                         'model__min_samples_split': [2, 5],\n",
+       "                         'model__n_estimators': [100, 200, 300]},\n",
+       "             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median')),\n", + " ('scaler',\n", + " StandardScaler())]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='str')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='str'))])),\n", + " ('feature_selection', SelectKBest(k=20)),\n", + " ('model',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'feature_selection__k': [10, 20, 30],\n", + " 'model__max_depth': [5, 10, None],\n", + " 'model__min_samples_leaf': [1, 2],\n", + " 'model__min_samples_split': [2, 5],\n", + " 'model__n_estimators': [100, 200, 300]},\n", + " scoring='accuracy')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "grid_search = GridSearchCV(\n", + " estimator=pipeline,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n" + ] + } + ], + "source": [ + "# Best Hyperparameters\n", + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7903377829727591\n" + ] + } + ], + "source": [ + "# Best Cross-Validation Score\n", + "print(grid_search.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Best Model (full pipeline)\n", + "best_model = grid_search.best_estimator_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Evaluate your model" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy: 0.7740080506037953\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "preds = best_model.predict(X_test)\n", + "\n", + "print(\"Test Accuracy:\", accuracy_score(y_test, preds))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Hyperparameters\n", + "{'feature_selection__k': 10, 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}\n", + "\n", + "Best Cross Validation Accuracy\n", + "0.7903\n", + "\n", + "Test Accuracy\n", + "0.774\n", + "\n", + "Classification Report\n", + " precision recall f1-score support\n", + "\n", + " False 0.79 0.73 0.76 861\n", + " True 0.76 0.81 0.78 878\n", + "\n", + " accuracy 0.77 1739\n", + " macro avg 0.78 0.77 0.77 1739\n", + "weighted avg 0.78 0.77 0.77 1739\n", + "\n" + ] + } + ], + "source": [ + "# Nice to have a Professional Final Notebook Cell - why not!\n", + "# ==============================\n", + "# FINAL MODEL RESULTS\n", + "# ==============================\n", + "\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "# Best parameters found\n", + "print(\"Best Hyperparameters\")\n", + "print(grid_search.best_params_)\n", + "print()\n", + "\n", + "# Best cross-validation score\n", + "print(\"Best Cross Validation Accuracy\")\n", + "print(round(grid_search.best_score_, 4))\n", + "print()\n", + "\n", + "# Best model (entire pipeline)\n", + "best_model = grid_search.best_estimator_\n", + "\n", + "# Predictions\n", + "preds = best_model.predict(X_test)\n", + "\n", + "# Test accuracy\n", + "test_accuracy = accuracy_score(y_test, preds)\n", + "\n", + "print(\"Test Accuracy\")\n", + "print(round(test_accuracy, 4))\n", + "print()\n", + "\n", + "# Detailed evaluation\n", + "print(\"Classification Report\")\n", + "print(classification_report(y_test, preds))" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -335,7 +6369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ded3f09 Binary files /dev/null and b/requirements.txt differ