diff --git a/Project_Zillow_Logistic_Regression.ipynb b/Project_Zillow_Logistic_Regression.ipynb new file mode 100644 index 0000000..31b4c19 --- /dev/null +++ b/Project_Zillow_Logistic_Regression.ipynb @@ -0,0 +1,7047 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMtkuIY2koSIKgVrrcBVM46", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Split the Data into Training and Testing Sets**

\n", + "\n", + "### **Step 1:** Read the lending_data.csv data from the Resources folder into a Pandas DataFrame." + ], + "metadata": { + "id": "pk3A1a13M3dd" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zrP0hiEYHiii" + }, + "outputs": [], + "source": [ + "# Import the modules\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report\n" + ] + }, + { + "cell_type": "code", + "source": [ + "# Read the CSV file from the Resources folder into a Pandas DataFrame\n", + "# Loading data\n", + "df = pd.read_csv(\"https://raw.githubusercontent.com/mirasmitty/Project_Zillow/main/Resources/Zillow_data_Detroit.csv\")\n", + "df['Week of pending'] = pd.to_datetime(df['Week of pending'])\n", + "df['Week of pending'] = df['Week of pending'].values.astype(\"int64\")\n", + "\n", + "# Review the DataFrame\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 837 + }, + "id": "JpdCUOKTH3ie", + "outputId": "74d436ac-6b66-4115-a211-c6df1a6f15f0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Week of pending Mean days listing to pending \\\n", + "0 1517011200000000000 49 \n", + "1 1517616000000000000 48 \n", + "2 1518220800000000000 47 \n", + "3 1518825600000000000 46 \n", + "4 1519430400000000000 43 \n", + "\n", + " Mean price reduction percentage \n", + "0 0.045619 \n", + "1 0.046532 \n", + "2 0.048536 \n", + "3 0.049169 \n", + "4 0.048168 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Week of pendingMean days listing to pendingMean price reduction percentage
01517011200000000000490.045619
11517616000000000000480.046532
21518220800000000000470.048536
31518825600000000000460.049169
41519430400000000000430.048168
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 315,\n \"fields\": [\n {\n \"column\": \"Week of pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 55083303430349928,\n \"min\": 1517011200000000000,\n \"max\": 1706918400000000000,\n \"num_unique_values\": 315,\n \"samples\": [\n 1641600000000000000,\n 1536969600000000000,\n 1615593600000000000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean days listing to pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 15,\n \"max\": 72,\n \"num_unique_values\": 56,\n \"samples\": [\n 49,\n 42,\n 62\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean price reduction percentage\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.005088071642996824,\n \"min\": 0.034588388,\n \"max\": 0.056797626,\n \"num_unique_values\": 313,\n \"samples\": [\n 0.041885905,\n 0.041987246,\n 0.051982281\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 16 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Distributions

\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_15['index'].plot(kind='hist', bins=20, title='index')\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_16['Week of pending'].plot(kind='hist', bins=20, title='Week of pending')\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_17['Mean days listing to pending'].plot(kind='hist', bins=20, title='Mean days listing to pending')\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_18['Mean price reduction percentage'].plot(kind='hist', bins=20, title='Mean price reduction percentage')\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

2-d distributions

\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_19.plot(kind='scatter', x='index', y='Week of pending', s=32, alpha=.8)\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_20.plot(kind='scatter', x='Week of pending', y='Mean days listing to pending', s=32, alpha=.8)\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_21.plot(kind='scatter', x='Mean days listing to pending', y='Mean price reduction percentage', s=32, alpha=.8)\n", + "plt.gca().spines[['top', 'right',]].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Time series

\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "def _plot_series(series, series_name, series_index=0):\n", + " from matplotlib import pyplot as plt\n", + " import seaborn as sns\n", + " palette = list(sns.palettes.mpl_palette('Dark2'))\n", + " xs = series['index']\n", + " ys = series['Mean days listing to pending']\n", + " \n", + " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n", + "df_sorted = _df_22.sort_values('index', ascending=True)\n", + "_plot_series(df_sorted, '')\n", + "sns.despine(fig=fig, ax=ax)\n", + "plt.xlabel('index')\n", + "_ = plt.ylabel('Mean days listing to pending')" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "def _plot_series(series, series_name, series_index=0):\n", + " from matplotlib import pyplot as plt\n", + " import seaborn as sns\n", + " palette = list(sns.palettes.mpl_palette('Dark2'))\n", + " xs = series['index']\n", + " ys = series['Mean price reduction percentage']\n", + " \n", + " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n", + "df_sorted = _df_23.sort_values('index', ascending=True)\n", + "_plot_series(df_sorted, '')\n", + "sns.despine(fig=fig, ax=ax)\n", + "plt.xlabel('index')\n", + "_ = plt.ylabel('Mean price reduction percentage')" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "def _plot_series(series, series_name, series_index=0):\n", + " from matplotlib import pyplot as plt\n", + " import seaborn as sns\n", + " palette = list(sns.palettes.mpl_palette('Dark2'))\n", + " counted = (series['index']\n", + " .value_counts()\n", + " .reset_index(name='counts')\n", + " .rename({'index': 'index'}, axis=1)\n", + " .sort_values('index', ascending=True))\n", + " xs = counted['index']\n", + " ys = counted['counts']\n", + " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n", + "df_sorted = _df_24.sort_values('index', ascending=True)\n", + "_plot_series(df_sorted, '')\n", + "sns.despine(fig=fig, ax=ax)\n", + "plt.xlabel('index')\n", + "_ = plt.ylabel('count()')" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "def _plot_series(series, series_name, series_index=0):\n", + " from matplotlib import pyplot as plt\n", + " import seaborn as sns\n", + " palette = list(sns.palettes.mpl_palette('Dark2'))\n", + " xs = series['Week of pending']\n", + " ys = series['Mean days listing to pending']\n", + " \n", + " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n", + "df_sorted = _df_25.sort_values('Week of pending', ascending=True)\n", + "_plot_series(df_sorted, '')\n", + "sns.despine(fig=fig, ax=ax)\n", + "plt.xlabel('Week of pending')\n", + "_ = plt.ylabel('Mean days listing to pending')" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Values

\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_26['index'].plot(kind='line', figsize=(8, 4), title='index')\n", + "plt.gca().spines[['top', 'right']].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_27['Week of pending'].plot(kind='line', figsize=(8, 4), title='Week of pending')\n", + "plt.gca().spines[['top', 'right']].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_28['Mean days listing to pending'].plot(kind='line', figsize=(8, 4), title='Mean days listing to pending')\n", + "plt.gca().spines[['top', 'right']].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "from matplotlib import pyplot as plt\n", + "_df_29['Mean price reduction percentage'].plot(kind='line', figsize=(8, 4), title='Mean price reduction percentage')\n", + "plt.gca().spines[['top', 'right']].set_visible(False)" + ], + "text/html": [ + "
\n", + " \n", + "
\n", + " \n", + " " + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 2:** Create the labels set (y) from the “loan_status” column, and then create the features (X) DataFrame from the remaining columns." + ], + "metadata": { + "id": "_GzBQdVUNVc8" + } + }, + { + "cell_type": "code", + "source": [ + "# Separate the data into labels and features\n", + "# Separate the y variable, the labels\n", + "y = df[\"Mean price reduction percentage\"]\n", + "\n", + "# Separate the X variable, the features\n", + "x = df.drop(columns=['Mean price reduction percentage'])\n" + ], + "metadata": { + "id": "D1Rc7verJSub" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Review the y variable Series\n", + "y[:5]\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1xpFwbdKLwcH", + "outputId": "a08926d0-9619-4687-f280-f0dbbaa98dfe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0.045619\n", + "1 0.046532\n", + "2 0.048536\n", + "3 0.049169\n", + "4 0.048168\n", + "Name: Mean price reduction percentage, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Review the X variable DataFrame\n", + "x[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "nDzx6xzBMGf1", + "outputId": "f7f9f972-4a9f-4ac5-fb7f-c24740439e82" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Week of pending Mean days listing to pending\n", + "0 1517011200000000000 49\n", + "1 1517616000000000000 48\n", + "2 1518220800000000000 47\n", + "3 1518825600000000000 46\n", + "4 1519430400000000000 43" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Week of pendingMean days listing to pending
0151701120000000000049
1151761600000000000048
2151822080000000000047
3151882560000000000046
4151943040000000000043
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"x[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Week of pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 956272764434917,\n \"min\": 1517011200000000000,\n \"max\": 1519430400000000000,\n \"num_unique_values\": 5,\n \"samples\": [\n 1517616000000000000,\n 1519430400000000000,\n 1518220800000000000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean days listing to pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 43,\n \"max\": 49,\n \"num_unique_values\": 5,\n \"samples\": [\n 48,\n 43,\n 47\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 3:** Check the balance of the labels variable (y) by using the value_counts function." 
+ ], + "metadata": { + "id": "3Io6qlPjNak3" + } + }, + { + "cell_type": "code", + "source": [ + "# Check the balance of our target values\n", + "y.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r09FJ8yAMy0H", + "outputId": "a22a2eb9-4852-4694-bc14-a95c052f754c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.045619 1\n", + "0.051982 1\n", + "0.053724 1\n", + "0.053300 1\n", + "0.051753 1\n", + "0.052866 1\n", + "0.055280 1\n", + "0.055631 1\n", + "0.053722 1\n", + "0.050454 1\n", + "0.056315 1\n", + "0.049650 1\n", + "0.050125 1\n", + "0.050378 1\n", + "0.047866 1\n", + "0.047123 1\n", + "0.045601 1\n", + "0.045761 1\n", + "0.054597 1\n", + "0.055683 1\n", + "0.043061 1\n", + "0.049253 1\n", + "0.046162 1\n", + "0.046104 1\n", + "0.046359 1\n", + "0.047222 1\n", + "0.047829 1\n", + "0.048121 1\n", + "0.048495 1\n", + "0.050773 1\n", + "0.056189 1\n", + "0.052223 1\n", + "0.053757 1\n", + "0.053530 1\n", + "0.053945 1\n", + "0.054528 1\n", + "0.055790 1\n", + "0.056798 1\n", + "0.046843 1\n", + "0.045798 1\n", + "0.046940 1\n", + "0.046210 1\n", + "0.047247 1\n", + "0.047235 1\n", + "0.047233 1\n", + "0.046522 1\n", + "0.045901 1\n", + "0.046424 1\n", + "0.045528 1\n", + "0.045870 1\n", + "0.045268 1\n", + "0.043907 1\n", + "0.044843 1\n", + "0.044800 1\n", + "0.046553 1\n", + "0.047386 1\n", + "0.047171 1\n", + "0.045198 1\n", + "0.046643 1\n", + "0.045805 1\n", + "0.045522 1\n", + "0.044850 1\n", + "0.044360 1\n", + "0.044613 1\n", + "0.043009 1\n", + "0.043563 1\n", + "0.043809 1\n", + "0.044134 1\n", + "0.045389 1\n", + "0.045845 1\n", + "0.045695 1\n", + "0.045771 1\n", + "0.045599 1\n", + "0.045390 1\n", + "0.045341 1\n", + "0.045320 1\n", + "0.045145 1\n", + "0.046238 1\n", + "0.045686 1\n", + "0.045812 1\n", + "0.048135 1\n", + "0.045604 1\n", + "0.045066 1\n", + "0.044726 1\n", + "0.045745 1\n", + "0.046168 1\n", + "0.047203 
1\n", + "0.048383 1\n", + "0.048976 1\n", + "0.051110 1\n", + "0.047989 1\n", + "0.047645 1\n", + "0.047183 1\n", + "0.046061 1\n", + "0.047091 1\n", + "0.048881 1\n", + "0.050298 1\n", + "0.045846 1\n", + "0.045500 1\n", + "0.045001 1\n", + "0.044129 1\n", + "0.043778 1\n", + "0.043440 1\n", + "0.044398 1\n", + "0.044736 1\n", + "0.045432 1\n", + "0.045794 1\n", + "0.044920 1\n", + "0.044752 1\n", + "0.044799 1\n", + "0.044975 1\n", + "0.044045 1\n", + "0.043614 1\n", + "0.043202 1\n", + "0.042720 1\n", + "0.043026 1\n", + "0.051039 1\n", + "0.049792 1\n", + "0.044952 1\n", + "0.045377 1\n", + "0.044858 1\n", + "0.045319 1\n", + "0.045720 1\n", + "0.045881 1\n", + "0.046039 1\n", + "0.046211 1\n", + "0.045521 1\n", + "0.045620 1\n", + "0.049232 1\n", + "0.045150 1\n", + "0.045212 1\n", + "0.044290 1\n", + "0.044224 1\n", + "0.043877 1\n", + "0.043960 1\n", + "0.044618 1\n", + "0.046626 1\n", + "0.047211 1\n", + "0.046666 1\n", + "0.046659 1\n", + "0.048853 1\n", + "0.049427 1\n", + "0.049208 1\n", + "0.047832 1\n", + "0.048498 1\n", + "0.047285 1\n", + "0.047152 1\n", + "0.047750 1\n", + "0.047673 1\n", + "0.047831 1\n", + "0.047822 1\n", + "0.047984 1\n", + "0.046614 1\n", + "0.045657 1\n", + "0.045615 1\n", + "0.044226 1\n", + "0.042040 1\n", + "0.046532 1\n", + "0.041244 1\n", + "0.039966 1\n", + "0.041317 1\n", + "0.041729 1\n", + "0.041253 1\n", + "0.040153 1\n", + "0.039415 1\n", + "0.040565 1\n", + "0.043115 1\n", + "0.039414 1\n", + "0.042381 1\n", + "0.041886 1\n", + "0.041866 1\n", + "0.040357 1\n", + "0.039980 1\n", + "0.038556 1\n", + "0.037744 1\n", + "0.039662 1\n", + "0.038653 1\n", + "0.044982 1\n", + "0.037157 1\n", + "0.036139 1\n", + "0.036328 1\n", + "0.036132 1\n", + "0.036197 1\n", + "0.036192 1\n", + "0.036505 1\n", + "0.036940 1\n", + "0.037096 1\n", + "0.038388 1\n", + "0.038085 1\n", + "0.037930 1\n", + "0.038165 1\n", + "0.039745 1\n", + "0.039106 1\n", + "0.038861 1\n", + "0.039304 1\n", + "0.037782 1\n", + "0.038099 1\n", + "0.037792 
1\n", + "0.042494 1\n", + "0.039996 1\n", + "0.039832 1\n", + "0.040623 1\n", + "0.039714 1\n", + "0.041179 1\n", + "0.042466 1\n", + "0.042186 1\n", + "0.041987 1\n", + "0.037592 1\n", + "0.042380 1\n", + "0.042958 1\n", + "0.044458 1\n", + "0.047288 1\n", + "0.048168 1\n", + "0.049169 1\n", + "0.048536 1\n", + "0.040865 1\n", + "0.039733 1\n", + "0.040034 1\n", + "0.039404 1\n", + "0.037384 1\n", + "0.036740 1\n", + "0.036993 1\n", + "0.037161 1\n", + "0.037247 1\n", + "0.037718 1\n", + "0.037459 1\n", + "0.037497 1\n", + "0.037725 1\n", + "0.038276 1\n", + "0.038757 1\n", + "0.038978 1\n", + "0.038807 1\n", + "0.038921 1\n", + "0.038872 1\n", + "0.035653 1\n", + "0.035737 1\n", + "0.035691 1\n", + "0.036638 1\n", + "0.036394 1\n", + "0.036424 1\n", + "0.036310 1\n", + "0.036261 1\n", + "0.036011 1\n", + "0.036073 1\n", + "0.036614 1\n", + "0.036515 1\n", + "0.042576 1\n", + "0.036034 1\n", + "0.035148 1\n", + "0.034772 1\n", + "0.034588 1\n", + "0.034994 1\n", + "0.035242 1\n", + "0.036853 1\n", + "0.037385 1\n", + "0.037573 1\n", + "0.037336 1\n", + "0.037263 1\n", + "0.044064 1\n", + "0.044309 1\n", + "0.041625 1\n", + "0.040599 1\n", + "0.039862 1\n", + "0.039480 1\n", + "0.038424 1\n", + "0.038240 1\n", + "0.037810 1\n", + "0.037188 1\n", + "0.037572 1\n", + "0.037535 1\n", + "0.037357 1\n", + "0.037083 1\n", + "0.036345 1\n", + "0.040396 1\n", + "0.047368 1\n", + "0.035767 1\n", + "0.037008 1\n", + "0.038567 1\n", + "0.038548 1\n", + "0.038282 1\n", + "0.037703 1\n", + "0.037360 1\n", + "0.037168 1\n", + "0.037115 1\n", + "0.036685 1\n", + "0.047529 1\n", + "0.036529 1\n", + "0.036719 1\n", + "0.036386 1\n", + "0.036220 1\n", + "0.036004 1\n", + "0.035782 1\n", + "0.035825 1\n", + "0.038226 1\n", + "0.038709 1\n", + "0.039298 1\n", + "0.039516 1\n", + "0.046611 1\n", + "0.041044 1\n", + "0.040156 1\n", + "0.038269 1\n", + "0.039588 1\n", + "0.040546 1\n", + "0.040413 1\n", + "0.040950 1\n", + "0.040520 1\n", + "0.040126 1\n", + "0.040337 1\n", + "0.039900 
1\n", + "0.038930 1\n", + "0.038947 1\n", + "0.039715 1\n", + "0.043273 1\n", + "Name: Mean price reduction percentage, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 4:** Split the data into training and testing datasets by using train_test_split.\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "##**Cannot run code from here down: y has NaN Values**\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "---\n", + "\n" + ], + "metadata": { + "id": "r016mPXJN1ip" + } + }, + { + "cell_type": "code", + "source": [ + "# Import the train_test_learn module\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Split the data using train_test_split\n", + "# Assign a random_state of 1 to the function\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)" + ], + "metadata": { + "id": "eQv0QlgHN4K4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Create a Logistic Regression Model with the Original Data\n", + "\n" + ], + "metadata": { + "id": "kuiR9a9jOEZb" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 1:** Fit a logistic regression model by using the training data (X_train and y_train)." 
+ ], + "metadata": { + "id": "i6HO93VgOFLb" + } + }, + { + "cell_type": "code", + "source": [ + "# Import the LogisticRegression module from SKLearn\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Instantiate the Logistic Regression model\n", + "# Assign a random_state parameter of 1 to the model\n", + "logistic_regression_model = LogisticRegression(random_state=1)\n", + "\n", + "# Fit the model using training data\n", + "lr_model = logistic_regression_model.fit(x_train, y_train)\n" + ], + "metadata": { + "id": "D0c_G6DYOo07" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 2:** Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model." + ], + "metadata": { + "id": "wjrpEtc8PDJQ" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Make a prediction using the testing data\n", + "testing_predictions = lr_model.predict(x_test)" + ], + "metadata": { + "id": "1XoF3R5gPGHd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### **Step 3:** Evaluate the model’s performance by doing the following:\n", + "* Calculate the accuracy score of the model.\n", + "* Generate a confusion matrix.\n", + "* Print the classification report.\n", + "\n" + ], + "metadata": { + "id": "WsoTPyplPOEO" + } + }, + { + "cell_type": "code", + "source": [ + "# Print the balanced_accuracy score of the model\n", + "\n", + "print(f\"Training Data Score: {lr_model.score(x_train,y_train)}\")\n", + "print(f\"Testing Data Score: {lr_model.score(x_test,y_test)}\")\n", + "# Store the score so it can be reused below (was previously undefined -> NameError)\n", + "testing_accuracy = balanced_accuracy_score(y_test, testing_predictions)\n", + "print(f\"Balanced Accuracy Score: {testing_accuracy}\")\n", + "print(f\"The balanced accuracy with the original data is {testing_accuracy * 100:.2f}%\")" + ], + "metadata": { + "id": "iZHh9DiHPeSk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + 
"# Generate a confusion matrix for the model\n", + "testing_confusion_matrix = confusion_matrix(y_test, testing_prediction)\n", + "\n", + "print(f'The confusion matrix with the original data is:\\n{testing_confusion_matrix}')" + ], + "metadata": { + "id": "AwnFN2l-PmzU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Print the classification report for the model\n", + "testing_report = classification_report(y_test, testing_prediction)\n", + "\n", + "#View the results\n", + "print(f'The classification report with the original data shows:\\n{testing_report}')" + ], + "metadata": { + "id": "VFA-LwvEPpWl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Step 4:** Answer the following question.\n", + "\n", + "**Question:** How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?\n", + "\n", + "**Answer:** HEALTHY LOANS: For healthy loans, the precision is 1.00, the recall is 1.00 and the f1-score is 1.00, meaning the model is performing exceptionally well in identifying healthy loans without missing any. This scenario is ideal, indicating a high level of confidence in the model's ability to identify healthy loans.\n", + "\n", + "**HIGH-RISK LOANS:** The precision is 0.87 for high-risk loans, so the remaining 13% are false-positives. The recall is 0.89, so the remaining 11% are false negatives. The f1-score is 0.88, which suggests a good balance between precision and recall for high-risk loans. Therefore, the model performs moderately well for high-risk loans, but there is still room for improvement; it is highly likely that the financial field would require 95% or higher in order to retain confidence in the model." 
+ ], + "metadata": { + "id": "Ko7WeFk2Pvo_" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Predict a Logistic Regression Model with Resampled Training Data" + ], + "metadata": { + "id": "l70oTU4NQAxR" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 1:** Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points." + ], + "metadata": { + "id": "-zddS4A8QExT" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Import the RandomOverSampler module from imbalanced-learn\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "# Instantiate the random oversampler model\n", + "# Assign a random_state parameter of 1 to the model\n", + "ros_model = RandomOverSampler(random_state=1)\n", + "\n", + "# Fit the original training data to the random_oversampler model\n", + "x_res, y_res = ros_model.fit_resample(x_train, y_train)" + ], + "metadata": { + "id": "SN-ZBgQ3QGrd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Count the distinct values of the resampled labels data\n", + "\n", + "#y_res.value_counts()\n", + "print(y_res.value_counts())\n", + "\n", + "\n", + "# Check that my numbers are the same on both sides\n", + "unique_values, counts = np.unique(y_res, return_counts=True)\n", + "print(\"Resampled Labels: \", unique_values)\n", + "print(\"Label Counts: \", counts)" + ], + "metadata": { + "id": "EHYhxN5OQOPh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 2:** Use the LogisticRegression classifier and the resampled data to fit the model and make predictions." 
+ ], + "metadata": { + "id": "7iR7FqiAQTxd" + } + }, + { + "cell_type": "code", + "source": [ + "# Instantiate the Logistic Regression model\n", + "# Assign a random_state parameter of 1 to the model\n", + "lr_ros_model = LogisticRegression(random_state=1)\n", + "\n", + "# Fit the model using the resampled training data\n", + "lr_ros_model.fit(x_res, y_res)\n", + "\n", + "# Make a prediction using the testing data\n", + "testing_ros_predictions = lr_ros_model.predict(x_test)" + ], + "metadata": { + "id": "GeIFy1y5Q6c0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "### **Step 3:** Evaluate the model’s performance by doing the following:\n", + "\n", + "* Calculate the accuracy score of the model.\n", + "* Generate a confusion matrix.\n", + "* Print the classification report.\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "id": "uquE8574RAvM" + } + }, + { + "cell_type": "code", + "source": [ + "# Print the balanced_accuracy score of the model\n", + "print(f\"Training Data Score: {lr_ros_model.score(x_res,y_res)}\")\n", + "print(f\"Testing Data Score: {lr_ros_model.score(x_test,y_test)}\")\n", + "\n", + "ros_balanced_accuracy_score = balanced_accuracy_score(y_test, testing_ros_predictions)\n", + "print(f\"Balanced Accuracy Score: {ros_balanced_accuracy_score}\")\n", + "print(f\"The balanced accuracy with the oversampled data is {ros_balanced_accuracy_score * 100:.2f}%\")" + ], + "metadata": { + "id": "GEeUM8MNROxS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Generate a confusion matrix for the model\n", + "ros_confusion_matrix = confusion_matrix(y_test, testing_ros_predictions)\n", + "print(f'The confusion matrix with the oversampled data is:\\n{ros_confusion_matrix}')" + ], + "metadata": { + "id": "fYxO0OVWRVdL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Print the classification report for the 
model\n", + "ros_classification_report = classification_report(y_test, testing_ros_predictions)\n", + "\n", + "#View the results\n", + "print(f'The classification report with the oversampled data shows:\\n{ros_classification_report}')" + ], + "metadata": { + "id": "kbbYfKiJRXH6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "###**Step 4:** Answer the following question\n", + "**Question:** How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?\n", + "\n", + "**Answer: **The logistic regression model, fit with the oversampled data, better predicts the healthy and high-risk loan labels than our first report. The healthy loans precision, recall, and f1-score remain the same as before; likewise, the high-risk loan labels have the same precision measurement. However, the recall has gone up 0.01 to a perfect \"1.00\", while the oversampled f1-score has increase the most, by 0.05, bringing it's score to \"0.93\". 
# --- Advanced regression techniques: exploratory plots ------------------------

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Show every row when displaying frames (useful for long missing-value lists).
pd.set_option('display.max_rows', None)

# NOTE(review): `df_combined` and `object_cols` are referenced below but are
# not defined anywhere in the visible portion of this notebook — confirm the
# cells that build them were not lost when this section was copied in.

# Visualise where the missing values live.
plt.figure(figsize=(18,6))
plt.title('Heatmap of missing values')
sns.heatmap(df_combined.isnull(),yticklabels=False,cbar=False,cmap='viridis')

# Cardinality (number of distinct values) of each categorical feature.
unique_values = [df_combined[col].unique().size for col in object_cols]
plt.figure(figsize=(18,6))
plt.title('No. Unique values of Categorical Features')
plt.xticks(rotation=90)
sns.barplot(x=object_cols,y=unique_values)

# Value distribution of every categorical feature, one subplot per column
# in an 11 x 4 grid.
plt.figure(figsize=(18,36))
plt.title('Categorical Features: Distribution')
plt.xticks(rotation=90)
index = 1
for col in object_cols:
    y = df_combined[col].value_counts()
    plt.subplot(11,4,index)
    plt.xticks(rotation=90)
    sns.barplot(x=list(y.index), y=y)
    index +=1
# --- Fill up missing values ---------------------------------------------------
# Strategy (per the markdown above): drop sparse/uninformative columns
# ('Alley', 'Fence', 'MiscFeature', 'Utilities'); fill structurally-missing
# basement & garage fields with 'NA'/0; mean for remaining numerics, mode for
# remaining categoricals.

df_combined.drop(columns='Id',inplace=True); print('Drop Id \n')
df_combined['MSZoning'] = df_combined['MSZoning'].fillna(df_combined['MSZoning'].mode()[0])
df_combined['LotFrontage'] = df_combined['LotFrontage'].fillna(df_combined['LotFrontage'].mean())
df_combined.drop(columns='Alley',inplace=True); print('Drop Alley \n')

# All but one entry share 'AllPub', so the column carries no signal.
print(df_combined['Utilities'].value_counts())
df_combined.drop(columns='Utilities',inplace=True); print('Drop Utilities \n')

df_combined['Exterior1st'] = df_combined['Exterior1st'].fillna(df_combined['Exterior1st'].mode()[0])
df_combined['Exterior2nd'] = df_combined['Exterior2nd'].fillna(df_combined['Exterior2nd'].mode()[0])
df_combined['MasVnrType'] = df_combined['MasVnrType'].fillna(df_combined['MasVnrType'].mode()[0])
df_combined['MasVnrArea'] = df_combined['MasVnrArea'].fillna(df_combined['MasVnrArea'].mean())
df_combined['Electrical'] = df_combined['Electrical'].fillna(df_combined['Electrical'].mode()[0])
df_combined['KitchenQual'] = df_combined['KitchenQual'].fillna(df_combined['KitchenQual'].mode()[0])
df_combined['Functional'] = df_combined['Functional'].fillna(df_combined['Functional'].mode()[0])

# Entries with missing 'FireplaceQu' all have Fireplaces == 0, so 'NA' is the
# correct (structural) fill, not a guess.
df_combined['FireplaceQu'] = df_combined['FireplaceQu'].fillna('NA'); print('FirePlaceQu: Fill NA values for missing values \n')

# FIX: the original mask `(df['PoolQC'].isnull()) & df['PoolArea']>0` parses
# as `(isnull & PoolArea) > 0` because `&` binds tighter than `>`; the
# comparison must be parenthesized.  Also set the mode for *every* matching
# row instead of the hard-coded row label 2599, which only holds for one
# specific dataset ordering.
pool_mask = df_combined['PoolQC'].isnull() & (df_combined['PoolArea'] > 0)
df_combined.loc[pool_mask, 'PoolQC'] = df_combined['PoolQC'].mode()[0]; print('PoolQC: Use mode for missing value with non-zero PoolAre \n')
df_combined['PoolQC'] = df_combined['PoolQC'].fillna('NA'); print('PoolQC: Use NA for remaining missing values \n')

# FIX: chained `Series.fillna(..., inplace=True)` operates on a possible copy
# and is deprecated in modern pandas; assign back instead (consistent with
# every other fill in this cell).
df_combined['SaleType'] = df_combined['SaleType'].fillna(df_combined['SaleType'].mode()[0])
df_combined.drop(columns=['Fence','MiscFeature','SalePrice'],inplace=True); print('Drop Fence, MiscFeature and SalePrice\n')

# Basement features: 'NA' for categoricals, 0 for numerics (no basement).
print('Fill missing values of Basement features with NA or 0 \n')
df_combined['BsmtQual'] = df_combined['BsmtQual'].fillna('NA')
df_combined['BsmtCond'] = df_combined['BsmtCond'].fillna('NA')
df_combined['BsmtExposure'] = df_combined['BsmtExposure'].fillna('NA')
df_combined['BsmtFinType1'] = df_combined['BsmtFinType1'].fillna('NA')
df_combined['BsmtFinType2'] = df_combined['BsmtFinType2'].fillna('NA')

df_combined['BsmtFinSF1'] = df_combined['BsmtFinSF1'].fillna(0)
df_combined['BsmtFinSF2'] = df_combined['BsmtFinSF2'].fillna(0)
df_combined['BsmtUnfSF'] = df_combined['BsmtUnfSF'].fillna(0)
df_combined['TotalBsmtSF'] = df_combined['TotalBsmtSF'].fillna(0)
df_combined['BsmtFullBath'] = df_combined['BsmtFullBath'].fillna(0)
df_combined['BsmtHalfBath'] = df_combined['BsmtHalfBath'].fillna(0)

# Garage features: same structural-fill treatment as basements.
print('Fill missing values of Garage features with NA or 0 \n')
df_combined['GarageType'] = df_combined['GarageType'].fillna('NA')
df_combined['GarageFinish'] = df_combined['GarageFinish'].fillna('NA')
df_combined['GarageCond'] = df_combined['GarageCond'].fillna('NA')
df_combined['GarageQual'] = df_combined['GarageQual'].fillna('NA')
df_combined['GarageCars'] = df_combined['GarageCars'].fillna(0)
df_combined['GarageArea'] = df_combined['GarageArea'].fillna(0)
df_combined['GarageYrBlt'] = df_combined['GarageYrBlt'].fillna(0)

df_combined.head()

# Check that all missing values have been taken care of (should print 0).
print(df_combined.isnull().sum().sum())

# --- Data preprocessing: identify categorical features ------------------------
from sklearn.preprocessing import OneHotEncoder

s = (df_combined.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)
print('No. of. categorical features: ',len(object_cols))
categorical features: ',len(object_cols))" + ], + "metadata": { + "id": "B_4wlFrTV9V5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "OH_encoder = OneHotEncoder(sparse=False)\n", + "OH_cols = pd.DataFrame(OH_encoder.fit_transform(df_combined[object_cols]))\n", + "OH_cols.index = df_combined.index\n", + "OH_cols.columns = OH_encoder.get_feature_names()\n", + "df_final = df_combined.drop(object_cols, axis=1)\n", + "df_final = pd.concat([df_final, OH_cols], axis=1)" + ], + "metadata": { + "id": "6G0xExDMV_0s" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_final.head()\n" + ], + "metadata": { + "id": "bqdO2y8PWCI4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Check that the shapes are consistent\n", + "\n", + "print('df_final shape:', df_final.shape)\n", + "print('df_train shape:', df_Train.shape)\n", + "print('df_test shape:', df_Test.shape)\n", + "\n", + "X_Train = pd.DataFrame(df_final[:1460])\n", + "X_Test = pd.DataFrame(df_final[1460:])\n", + "Y_Train = df_Train['SalePrice']\n", + "\n", + "print('\\nCheck that the datasets are consistent:\\n')\n", + "print('X_train shape', X_Train.shape)\n", + "print('Y_train shape:', Y_Train.shape)\n", + "print('X_test shape:', X_Test.shape)" + ], + "metadata": { + "id": "XTjjDol_WFTE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#Model selection and prediction\n" + ], + "metadata": { + "id": "nosipS5zWNw4" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Split the training set into training and validation set\n", + "\n", + "X_train, X_valid, Y_train, Y_valid = train_test_split(X_Train, Y_Train, train_size=0.8, test_size=0.2,random_state=0)\n" + ], + "metadata": { + "id": "DhE6b407WRw3" + }, + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "model_RFR = RandomForestRegressor()\n", + "model_RFR.fit(X_train, Y_train)\n", + "Y_pred = model_RFR.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "AX-GGnP2WUGn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.ensemble import GradientBoostingRegressor\n", + "\n", + "model_GBR = GradientBoostingRegressor()\n", + "model_GBR.fit(X_train, Y_train)\n", + "Y_pred = model_GBR.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "JU0_vCHRWWp7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import SGDRegressor\n", + "\n", + "model_SGD = SGDRegressor()\n", + "model_SGD.fit(X_train, Y_train)\n", + "Y_pred = model_SGD.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "ymYn_ciGWZPk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "from xgboost import XGBRegressor\n", + "\n", + "model_XGBR = XGBRegressor(learning_rate=0.03,n_estimators=200,objective='reg:squarederror')\n", + "model_XGBR.fit(X_train,Y_train)\n", + "Y_pred = model_XGBR.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "oY6ubKpSWbe0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "plt.figure()\n", + "plt.title('Comparison of Sale Price of Predicted and Actual values')\n", + "plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')\n", + "plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')\n", + "plt.legend()" + ], + "metadata": { + "id": 
"i-6Qp5C9Wdti" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "model = XGBRegressor()\n", + "\n", + "n_estimators = [100, 200, 500]\n", + "learning_rates = [0.03,0.1,0.3]\n", + "objectives = ['reg:squarederror']\n", + "\n", + "# Define the grid of hyperparameters to search\n", + "hyperparameter_grid = {\n", + " 'n_estimators' : n_estimators,\n", + " 'learning_rate':learning_rates,\n", + " 'objective' : objectives\n", + " }\n", + "\n", + "grid_cv = GridSearchCV(estimator = model,\n", + " param_grid = hyperparameter_grid,\n", + " scoring = 'neg_mean_absolute_error',\n", + " return_train_score = True)\n", + "\n", + "grid_cv.fit(X_Train,Y_Train)" + ], + "metadata": { + "id": "UF78kE6lWiEx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "grid_cv.best_score_\n" + ], + "metadata": { + "id": "DnHxgmeiWkMZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "grid_cv.best_estimator_\n" + ], + "metadata": { + "id": "Y8iu_B8PWlgD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Y_pred = random_cv.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "CxqTwQQeWoxH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "regressor = grid_cv.best_estimator_\n", + "Y_pred = regressor.predict(X_valid)\n", + "print(mean_absolute_error(Y_valid, Y_pred))" + ], + "metadata": { + "id": "i7JT1mMhWqps" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "plt.figure()\n", + "plt.title('Comparison of Sale Price of Predicted and Actual values')\n", + "plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')\n", + "plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')\n", + 
"plt.scatter(Y_Train,regressor.predict(X_Train),label='Best model')\n", + "plt.legend()" + ], + "metadata": { + "id": "yr540r-3Ws9G" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#Preparation of Submission Data\n" + ], + "metadata": { + "id": "2IhMjO-BWv0N" + } + }, + { + "cell_type": "code", + "source": [ + "Y_Pred = regressor.predict(X_Test)\n" + ], + "metadata": { + "id": "PbNuaJvMWyV5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Y_Pred\n" + ], + "metadata": { + "id": "aBtFxzuHW0Aa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Y_Pred.shape\n" + ], + "metadata": { + "id": "95r46nkXW1s-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sub = pd.DataFrame()\n", + "sub['Id'] = df_Test['Id']\n", + "sub['SalePrice'] = Y_Pred" + ], + "metadata": { + "id": "Xlz2LofuW4Gn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sub.head()\n" + ], + "metadata": { + "id": "4iTePcIbW6i2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sub.tail()\n" + ], + "metadata": { + "id": "5mXLjSHpW8Ut" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sub.to_csv('Submission.csv')\n" + ], + "metadata": { + "id": "xJJzdFJqW980" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb b/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb deleted file mode 100644 index c095475..0000000 --- a/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb +++ /dev/null @@ -1,1119 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyObpjxondblx6TF1OHcVN/z", - "include_colab_link": true - }, - 
"kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nTCutDQGoOxY" - }, - "outputs": [], - "source": [ - "!pip install keras-tuner --upgrade\n", - "%matplotlib inline\n", - "\n", - "# Import our dependencies\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "\n", - "# Import and read the charity_data.csv.\n", - "import pandas as pd\n", - "application_df = pd.read_csv(\"https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv\")\n", - "application_df.head()\n", - "application_df.tail()" - ] - }, - { - "cell_type": "code", - "source": [ - "# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.\n", - "application_df = application_df.drop(columns = ['EIN', 'NAME'],axis=1)\n", - "application_df.head()\n", - "application_df.tail()" - ], - "metadata": { - "id": "hL-nfZhKovLI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Determine the number of unique values in each column.\n", - "application_df.nunique()" - ], - "metadata": { - "id": "VhrRNiptovy9" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Look at APPLICATION_TYPE value counts for binning\n", - "application_df_counts = application_df['APPLICATION_TYPE'].value_counts()\n", - "application_df_counts" - ], - "metadata": { - "id": "54sMSuG0ozWV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Choose a cutoff value and create a list of application types to be replaced\n", - "# use the variable name `application_types_to_replace`\n", - 
"application_types_to_replace = list(application_df_counts[application_df_counts<500].index)\n", - "\n", - "# Replace in dataframe\n", - "for app in application_types_to_replace:\n", - " application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,\"Other\")\n", - "\n", - "# Check to make sure binning was successful\n", - "application_df['APPLICATION_TYPE'].value_counts()" - ], - "metadata": { - "id": "Mkrjf-6Lo2IL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Look at CLASSIFICATION value counts for binning\n", - "class_counts = application_df['CLASSIFICATION'].value_counts()\n", - "class_counts" - ], - "metadata": { - "id": "lhtrnl4Jo4oE" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# You may find it helpful to look at CLASSIFICATION value counts >1\n", - "class_type = class_counts.loc[class_counts > 1]\n", - "class_type" - ], - "metadata": { - "id": "cCrl-BZ5o66g" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Choose a cutoff value and create a list of classifications to be replaced\n", - "# use the variable name `classifications_to_replace`\n", - "classifications_to_replace = list(class_counts[class_counts < 1000].index)\n", - "\n", - "# Replace in dataframe\n", - "for cls in classifications_to_replace:\n", - " application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,\"Other\")\n", - "\n", - "# Check to make sure binning was successful\n", - "application_df['CLASSIFICATION'].value_counts()" - ], - "metadata": { - "id": "NQA9sL5Ho679" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Convert categorical data to numeric with `pd.get_dummies`\n", - "dummies_df = pd.get_dummies(application_df)\n", - "dummies_df.head()\n" - ], - "metadata": { - "id": "ZLaV6OVVo9wu" - }, - "execution_count": null, - "outputs": [] - }, - 
{ - "cell_type": "code", - "source": [ - "# Split our preprocessed data into our features and target arrays\n", - "y = dummies_df['IS_SUCCESSFUL'].values\n", - "X = dummies_df.drop('IS_SUCCESSFUL', axis=1).values\n", - "\n", - "# Split the preprocessed data into a training and testing dataset\n", - "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=78, test_size=.2)" - ], - "metadata": { - "id": "JiE71OCRpBK8" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "print(f\"X_train Shape: {X_train.shape}, X_test Shape: {X_test.shape}\")" - ], - "metadata": { - "id": "F-DkLL-epDuv" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Create a StandardScaler instances\n", - "scaler = StandardScaler()\n", - "\n", - "# Fit the StandardScaler\n", - "X_scaler = scaler.fit(X_train)\n", - "\n", - "# Scale the data\n", - "X_train_scaled = X_scaler.transform(X_train)\n", - "X_test_scaled = X_scaler.transform(X_test)" - ], - "metadata": { - "id": "ruk8SGQzpFtM" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# installing kera_tuner upgrade\n", - "# !pip install keras-tuner --upgrade\n", - "\n", - "# Import the kerastuner library\n", - "#import keras_tuner as kt" - ], - "metadata": { - "id": "l5_243-lpFu0" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ], - "metadata": { - "id": "xYT_dwpMpKTA" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n", - "input_features = len(X_train_scaled[0])\n", - "hidden_nodes_layer1 = 80\n", - "hidden_nodes_layer2 = 30\n", - "hidden_nodes_layer3 = 1\n", - "\n", - "nn = tf.keras.models.Sequential()\n", - "\n", - "# First hidden 
layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n", - " input_dim=input_features, activation=\"relu\"))\n", - "\n", - "# Second hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n", - "\n", - "# Output layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"sigmoid\"))\n", - "\n", - "\n", - "# Check the structure of the model\n", - "nn.summary()" - ], - "metadata": { - "id": "6frBap78pKUU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Compile the model\n", - "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])" - ], - "metadata": { - "id": "7Jul0GDHpKXc" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Train the model\n", - "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)" - ], - "metadata": { - "id": "wckfODjopKaT" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Evaluate the model using the test data\n", - "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n", - "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")" - ], - "metadata": { - "id": "mG26GVMKpKc7" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Export our model to HDF5 file\n", - "nn.save(\"AlphabetSoupCharity.h5\")" - ], - "metadata": { - "id": "jQFjjkWFpKfx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!pip install keras-tuner --upgrade\n", - "%matplotlib inline\n", - "\n", - "!pip install tensorflow\n", - "\n", - "# Import our dependencies\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "\n", - "# Import and read the charity_data.csv.\n", - "import 
pandas as pd\n", - "application_df = pd.read_csv(\"https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv\")\n", - "application_df.head()\n", - "application_df.tail()" - ], - "metadata": { - "id": "MJJBDZhxpKky" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "\n", - "# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.\n", - "application_df = application_df.drop(columns = ['EIN', 'NAME'],axis=1)\n", - "application_df.head()\n", - "application_df.tail()\n" - ], - "metadata": { - "id": "GvPAjnEMpKnF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Determine the number of unique values in each column.\n", - "application_df.nunique()\n" - ], - "metadata": { - "id": "3f859pSRpKpq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Look at APPLICATION_TYPE value counts for binning\n", - "application_df_counts = application_df['APPLICATION_TYPE'].value_counts()\n", - "application_df_counts" - ], - "metadata": { - "id": "u73oTiqApKsT" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Choose a cutoff value and create a list of application types to be replaced\n", - "# use the variable name `application_types_to_replace`\n", - "application_types_to_replace = list(application_df_counts[application_df_counts<500].index)\n", - "\n", - "# Replace in dataframe\n", - "for app in application_types_to_replace:\n", - " application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,\"Other\")\n", - "\n", - "# Check to make sure binning was successful\n", - "application_df['APPLICATION_TYPE'].value_counts()" - ], - "metadata": { - "id": "HvTxcJmRpKup" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Look at CLASSIFICATION value counts for binning\n", - "class_counts = 
application_df['CLASSIFICATION'].value_counts()\n", - "class_counts" - ], - "metadata": { - "id": "MQSfOqmUpjL5" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# You may find it helpful to look at CLASSIFICATION value counts >1\n", - "class_type = class_counts.loc[class_counts > 1]\n", - "class_type" - ], - "metadata": { - "id": "BF9VYHRJpjOw" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - " Choose a cutoff value and create a list of classifications to be replaced\n", - "# use the variable name `classifications_to_replace`\n", - "classifications_to_replace = list(class_counts[class_counts < 1000].index)\n", - "\n", - "# Replace in dataframe\n", - "for cls in classifications_to_replace:\n", - " application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,\"Other\")\n", - "\n", - "# Check to make sure binning was successful\n", - "application_df['CLASSIFICATION'].value_counts()" - ], - "metadata": { - "id": "LsF3Ozx6pjRV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Convert categorical data to numeric with `pd.get_dummies`\n", - "dummies_df = pd.get_dummies(application_df)\n", - "dummies_df.head()" - ], - "metadata": { - "id": "YFcUF3hzpjUx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Split our preprocessed data into our features and target arrays\n", - "y = dummies_df['IS_SUCCESSFUL'].values\n", - "X = dummies_df.drop('IS_SUCCESSFUL', axis=1).values\n", - "\n", - "# Split the preprocessed data into a training and testing dataset\n", - "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=78, test_size=.2)\n" - ], - "metadata": { - "id": "izZ1YXSlpjXF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "print(f\"X_train Shape: {X_train.shape}, X_test Shape: {X_test.shape}\")\n" - ], 
- "metadata": { - "id": "Nb3Xn_MOpjZa" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Create a StandardScaler instances\n", - "scaler = StandardScaler()\n", - "\n", - "# Fit the StandardScaler\n", - "X_scaler = scaler.fit(X_train)\n", - "\n", - "# Scale the data\n", - "X_train_scaled = X_scaler.transform(X_train)\n", - "X_test_scaled = X_scaler.transform(X_test)" - ], - "metadata": { - "id": "Pv3LdlUOpjcR" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#Import the kerastuner library\n", - "import keras_tuner as kt" - ], - "metadata": { - "id": "ZpjXPBbLpje3" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#FIRST ATTEMPT\n", - "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n", - "input_features = len(X_train_scaled[0])\n", - "hidden_nodes_layer1 = 80\n", - "hidden_nodes_layer2 = 30\n", - "hidden_nodes_layer3 = 1\n", - "\n", - "nn = tf.keras.models.Sequential()\n", - "\n", - "# First hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n", - " input_dim=input_features, activation=\"tanh\"))\n", - "\n", - "# Second hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n", - "\n", - "# Third layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n", - "\n", - "# Outer Layer\n", - "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n", - "\n", - "\n", - "# Check the structure of the model\n", - "nn.summary()" - ], - "metadata": { - "id": "WT1DSYt6pjhw" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *FIRST ATTEMPT* Compile the model\n", - "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])" - ], - "metadata": { - "id": "xi9sevJPpjj0" - }, - "execution_count": 
null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *FIRST ATTEMPT* Train the model\n", - "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)" - ], - "metadata": { - "id": "osGv-I_Vpjms" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *FIRST ATTEMPT* Evaluate the model using the test data\n", - "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n", - "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")" - ], - "metadata": { - "id": "2iblbOWSpjpF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *FIRST ATTEMPT* ACCURACY\n", - "First Attempt Accuracy = 72.5%" - ], - "metadata": { - "id": "z3UZkZgxpjrp" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#SECOND ATTEMPT\n", - "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n", - "input_features = len(X_train_scaled[0])\n", - "hidden_nodes_layer1 = 100\n", - "hidden_nodes_layer2 = 30\n", - "hidden_nodes_layer3 = 1\n", - "\n", - "nn = tf.keras.models.Sequential()\n", - "\n", - "# First hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n", - " input_dim=input_features, activation=\"relu\"))\n", - "\n", - "# Second hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n", - "\n", - "# Third layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n", - "\n", - "# Outer Layer\n", - "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n", - "\n", - "\n", - "# Check the structure of the model\n", - "nn.summary()" - ], - "metadata": { - "id": "xrhb6QMfpjuP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *SECOND ATTEMPT* Compile the model\n", - "nn.compile(loss=\"binary_crossentropy\", 
optimizer=\"adam\", metrics=[\"accuracy\"])" - ], - "metadata": { - "id": "NCXuyAQJpjw2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *SECOND ATTEMPT* Train the model\n", - "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)" - ], - "metadata": { - "id": "VRxl-SKnpjzc" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *SECOND ATTEMPT* Evaluate the model using the test data\n", - "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n", - "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")" - ], - "metadata": { - "id": "t0rABw3Gpj2F" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *SECOND* ACCURACY\n", - "#Changes for second attempt: first hidden layer activation from \"tanh\" to \"relu\"\n", - "# increase hidden layer nodes 1 from 80 to 100\n", - "# Second Attempt Accuracy = 72.1% (vs first attempt of 72.5%)" - ], - "metadata": { - "id": "9DftVHWPpj4Z" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *THIRD ATTEMPT*\n", - "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n", - "input_features = len(X_train_scaled[0])\n", - "hidden_nodes_layer1 = 80\n", - "hidden_nodes_layer2 = 25\n", - "hidden_nodes_layer3 = 2\n", - "\n", - "nn = tf.keras.models.Sequential()\n", - "\n", - "# First hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n", - " input_dim=input_features, activation=\"relu\"))\n", - "\n", - "# Second hidden layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n", - "\n", - "# Third layer\n", - "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n", - "\n", - "# Outer Layer\n", - "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n", - "\n", - "\n", - "# Check the 
structure of the model\n", - "nn.summary()\n" - ], - "metadata": { - "id": "J4zv1OSNqZwx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *THIRD ATTEMPT* Compile the model\n", - "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n" - ], - "metadata": { - "id": "oFw0EosHqZxr" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *THIRD ATTEMPT* Train the model\n", - "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)" - ], - "metadata": { - "id": "xrKzAwGLqZ44" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *THIRD ATTEMPT* Evaluate the model using the test data\n", - "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n", - "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")" - ], - "metadata": { - "id": "6-eqf6xCqZ7d" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# *THIRD* ACCURACY\n", - "# Third Attempt Accuracy = 72.2%" - ], - "metadata": { - "id": "NLzLER_jqZ-E" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "# Find the latest version of spark 3.x from http://www.apache.org/dist/spark/ and enter as the spark version\n", - "# For example:\n", - "# spark_version = 'spark-3.5.1'\n", - "spark_version = 'spark-3.5.1'\n", - "os.environ['SPARK_VERSION']=spark_version\n", - "\n", - "# Install Spark and Java\n", - "!apt-get update\n", - "!apt-get install openjdk-11-jdk-headless -qq > /dev/null\n", - "!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz\n", - "!tar xf $SPARK_VERSION-bin-hadoop3.tgz\n", - "!pip install -q findspark\n", - "\n", - "# Set Environment Variables\n", - "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n", - "os.environ[\"SPARK_HOME\"] = 
f\"/content/{spark_version}-bin-hadoop3\"\n", - "\n", - "# Start a SparkSession\n", - "import findspark\n", - "findspark.init()" - ], - "metadata": { - "id": "Wpwrv8_dqaAe" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Import packages\n", - "from pyspark.sql import SparkSession\n", - "import time\n", - "\n", - "# Create a SparkSession\n", - "spark = SparkSession.builder.appName(\"SparkSQL\").getOrCreate()" - ], - "metadata": { - "id": "4gd5ZX0XqaCw" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 1. Read in the AWS S3 bucket into a DataFrame.\n", - "from pyspark import SparkFiles\n", - "url = \"https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/home_sales_revised.csv\"\n", - "spark.sparkContext.addFile(url)\n", - "df = spark.read.csv(SparkFiles.get(\"home_sales_revised.csv\"), sep=\",\", header=True)\n", - "\n", - "# Show DataFrame\n", - "df.show()\n" - ], - "metadata": { - "id": "8iLGqnOKqaFG" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "\n", - "# Print our schema\n", - "df.printSchema()# 2. Create a temporary view of the DataFrame.\n", - "\n", - "df.createOrReplaceTempView('home_sales')" - ], - "metadata": { - "id": "2TKsOjUcqaHt" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 3. What is the average price for a four bedroom house sold per year, rounded to two decimal places?\n", - "a = \"\"\"\n", - "SELECT\n", - " YEAR(date) AS YEAR,\n", - " ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM home_sales\n", - "WHERE bedrooms = 4\n", - "GROUP BY YEAR\n", - "ORDER BY YEAR DESC\n", - "\"\"\"\n", - "spark.sql(a).show()" - ], - "metadata": { - "id": "0pIPKxgTqaJy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 4. 
What is the average price of a home for each year the home was built,\n", - "# that have 3 bedrooms and 3 bathrooms, rounded to two decimal places?\n", - "b = \"\"\"\n", - "SELECT\n", - " YEAR(date_built) AS YEAR,\n", - " ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM home_sales\n", - "WHERE bedrooms = 3\n", - "and bathrooms = 3\n", - "GROUP BY YEAR(date_built)\n", - "ORDER BY YEAR DESC\n", - "\"\"\"\n", - "spark.sql(b).show()\n" - ], - "metadata": { - "id": "Yfr0e-XxqaMa" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 5. What is the average price of a home for each year the home was built,\n", - "# that have 3 bedrooms, 3 bathrooms, with two floors,\n", - "# and are greater than or equal to 2,000 square feet, rounded to two decimal places?\n", - "\n", - "\n", - "c = \"\"\"\n", - "SELECT\n", - "YEAR(date_built) AS YEAR_BUILT,\n", - "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM home_sales\n", - "WHERE bedrooms = 3\n", - "and bathrooms = 3\n", - "and sqft_living >= 2000\n", - "and floors = 2\n", - "GROUP BY YEAR_BUILT\n", - "ORDER BY YEAR_BUILT DESC\n", - "\"\"\"\n", - "spark.sql(c).show()" - ], - "metadata": { - "id": "HrEbTABkqaOu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 6. What is the average price of a home per \"view\" rating, rounded to two decimal places,\n", - "# having an average home price greater than or equal to $350,000? 
Order by descending view rating.\n", - "# Although this is a small dataset, determine the run time for this query.\n", - "\n", - "start_time = time.time()\n", - "\n", - "d = \"\"\"\n", - "SELECT\n", - "view,\n", - "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM home_sales\n", - "GROUP BY view\n", - "HAVING AVG(price) >= 350000\n", - "ORDER BY view desc\n", - "\"\"\"\n", - "spark.sql(d).show()\n", - "\n", - "print(\"--- %s seconds ---\" % (time.time() - start_time))" - ], - "metadata": { - "id": "PpPgkgRxqzuG" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 7. Cache the temporary table home_sales.\n", - "spark.sql('cache table home_sales')" - ], - "metadata": { - "id": "IBCdFP4Eqzvi" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 8. Check if the table is cached.\n", - "spark.catalog.isCached('home_sales')" - ], - "metadata": { - "id": "AqFsOiWwqz1S" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 9. 
Using the cached data, run the last query above, that calculates\n", - "# the average price of a home per \"view\" rating, rounded to two decimal places,\n", - "# having an average home price greater than or equal to $350,000.\n", - "# Determine the runtime and compare it to the uncached runtime.\n", - "\n", - "start_time = time.time()\n", - "\n", - "e = \"\"\"\n", - "SELECT\n", - "view,\n", - "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM home_sales\n", - "GROUP BY view\n", - "HAVING AVG(price) >= 350000\n", - "ORDER BY view desc\n", - "\"\"\"\n", - "spark.sql(e).show()\n", - "\n", - "\n", - "print(\"--- %s seconds ---\" % (time.time() - start_time))\n", - "\n", - "# d speed = 0.7024235725402832 seconds\n", - "# e speed = 0.5843555927276611 seconds\n", - "# caching sped up the run time!\n" - ], - "metadata": { - "id": "SJTRR5UHqz5_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 10. Partition by the \"date_built\" field on the formatted parquet home sales data\n", - "df.write.partitionBy('date_built').mode(\"overwrite\").parquet(\"p_home_sales\")" - ], - "metadata": { - "id": "cSeQIJrFq6G4" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 11. Read the parquet formatted data.\n", - "p_df = spark.read.parquet('p_home_sales')" - ], - "metadata": { - "id": "LgELnGCNq6OJ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 12. Create a temporary table for the parquet data.\n", - "p_df.createOrReplaceTempView('parquet_temp_home')" - ], - "metadata": { - "id": "bL8GVtuEq6Xj" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 13. 
Using the parquet DataFrame, run the last query above, that calculates\n", - "# the average price of a home per \"view\" rating, rounded to two decimal places,\n", - "# having an average home price greater than or equal to $350,000.\n", - "# Determine the runtime and compare it to the cached runtime.\n", - "\n", - "start_time = time.time()\n", - "\n", - "f = \"\"\"\n", - "SELECT\n", - "view,\n", - "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n", - "FROM parquet_temp_home\n", - "GROUP BY view\n", - "HAVING AVG(price) >= 350000\n", - "ORDER BY view desc\n", - "\"\"\"\n", - "spark.sql(f).show()\n", - "\n", - "print(\"--- %s seconds ---\" % (time.time() - start_time))\n", - "\n", - "# d speed = 0.7024235725402832 seconds\n", - "# e speed = 0.5843555927276611 seconds\n", - "# caching sped up the run time!\n", - "\n", - "# f speed = 0.6659867763519287 seconds\n", - "# parquet_temp_home is faster than original time, but not as fast as cached time\n" - ], - "metadata": { - "id": "G5ynaoJvq6fl" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 14. Uncache the home_sales temporary table.\n", - "spark.sql('uncache table home_sales')" - ], - "metadata": { - "id": "zAI0vZamq6mq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 15. Check if the home_sales is no longer cached\n", - "if spark.catalog.isCached('home_sales'):\n", - " print('home_sales remains cached')\n", - "else:\n", - " print('home_sales is no longer cached. ')\n" - ], - "metadata": { - "id": "Yn38FqT8rI-o" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file