diff --git a/Project_Zillow_Logistic_Regression.ipynb b/Project_Zillow_Logistic_Regression.ipynb
new file mode 100644
index 0000000..31b4c19
--- /dev/null
+++ b/Project_Zillow_Logistic_Regression.ipynb
@@ -0,0 +1,7047 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyMtkuIY2koSIKgVrrcBVM46",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# **Split the Data into Training and Testing Sets**
\n",
+ "\n",
+ "### **Step 1:** Read the lending_data.csv data from the Resources folder into a Pandas DataFrame."
+ ],
+ "metadata": {
+ "id": "pk3A1a13M3dd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zrP0hiEYHiii"
+ },
+ "outputs": [],
+ "source": [
+ "# Import the modules\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Read the CSV file from the Resources folder into a Pandas DataFrame\n",
+ "# Loading data\n",
+ "df = pd.read_csv(\"https://raw.githubusercontent.com/mirasmitty/Project_Zillow/main/Resources/Zillow_data_Detroit.csv\")\n",
+ "df['Week of pending'] = pd.to_datetime(df['Week of pending'])\n",
+ "df['Week of pending'] = df['Week of pending'].values.astype(\"int64\")\n",
+ "\n",
+ "# Review the DataFrame\n",
+ "df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 837
+ },
+ "id": "JpdCUOKTH3ie",
+ "outputId": "74d436ac-6b66-4115-a211-c6df1a6f15f0"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Week of pending Mean days listing to pending \\\n",
+ "0 1517011200000000000 49 \n",
+ "1 1517616000000000000 48 \n",
+ "2 1518220800000000000 47 \n",
+ "3 1518825600000000000 46 \n",
+ "4 1519430400000000000 43 \n",
+ "\n",
+ " Mean price reduction percentage \n",
+ "0 0.045619 \n",
+ "1 0.046532 \n",
+ "2 0.048536 \n",
+ "3 0.049169 \n",
+ "4 0.048168 "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Week of pending | \n",
+ " Mean days listing to pending | \n",
+ " Mean price reduction percentage | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1517011200000000000 | \n",
+ " 49 | \n",
+ " 0.045619 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1517616000000000000 | \n",
+ " 48 | \n",
+ " 0.046532 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1518220800000000000 | \n",
+ " 47 | \n",
+ " 0.048536 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1518825600000000000 | \n",
+ " 46 | \n",
+ " 0.049169 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1519430400000000000 | \n",
+ " 43 | \n",
+ " 0.048168 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df",
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 315,\n \"fields\": [\n {\n \"column\": \"Week of pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 55083303430349928,\n \"min\": 1517011200000000000,\n \"max\": 1706918400000000000,\n \"num_unique_values\": 315,\n \"samples\": [\n 1641600000000000000,\n 1536969600000000000,\n 1615593600000000000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean days listing to pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 15,\n \"max\": 72,\n \"num_unique_values\": 56,\n \"samples\": [\n 49,\n 42,\n 62\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean price reduction percentage\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.005088071642996824,\n \"min\": 0.034588388,\n \"max\": 0.056797626,\n \"num_unique_values\": 313,\n \"samples\": [\n 0.041885905,\n 0.041987246,\n 0.051982281\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 16
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "Distributions
\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_15['index'].plot(kind='hist', bins=20, title='index')\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_16['Week of pending'].plot(kind='hist', bins=20, title='Week of pending')\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_17['Mean days listing to pending'].plot(kind='hist', bins=20, title='Mean days listing to pending')\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_18['Mean price reduction percentage'].plot(kind='hist', bins=20, title='Mean price reduction percentage')\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "2-d distributions
\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_19.plot(kind='scatter', x='index', y='Week of pending', s=32, alpha=.8)\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_20.plot(kind='scatter', x='Week of pending', y='Mean days listing to pending', s=32, alpha=.8)\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_21.plot(kind='scatter', x='Mean days listing to pending', y='Mean price reduction percentage', s=32, alpha=.8)\n",
+ "plt.gca().spines[['top', 'right',]].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "Time series
\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "def _plot_series(series, series_name, series_index=0):\n",
+ " from matplotlib import pyplot as plt\n",
+ " import seaborn as sns\n",
+ " palette = list(sns.palettes.mpl_palette('Dark2'))\n",
+ " xs = series['index']\n",
+ " ys = series['Mean days listing to pending']\n",
+ " \n",
+ " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n",
+ "df_sorted = _df_22.sort_values('index', ascending=True)\n",
+ "_plot_series(df_sorted, '')\n",
+ "sns.despine(fig=fig, ax=ax)\n",
+ "plt.xlabel('index')\n",
+ "_ = plt.ylabel('Mean days listing to pending')"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "def _plot_series(series, series_name, series_index=0):\n",
+ " from matplotlib import pyplot as plt\n",
+ " import seaborn as sns\n",
+ " palette = list(sns.palettes.mpl_palette('Dark2'))\n",
+ " xs = series['index']\n",
+ " ys = series['Mean price reduction percentage']\n",
+ " \n",
+ " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n",
+ "df_sorted = _df_23.sort_values('index', ascending=True)\n",
+ "_plot_series(df_sorted, '')\n",
+ "sns.despine(fig=fig, ax=ax)\n",
+ "plt.xlabel('index')\n",
+ "_ = plt.ylabel('Mean price reduction percentage')"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "def _plot_series(series, series_name, series_index=0):\n",
+ " from matplotlib import pyplot as plt\n",
+ " import seaborn as sns\n",
+ " palette = list(sns.palettes.mpl_palette('Dark2'))\n",
+ " counted = (series['index']\n",
+ " .value_counts()\n",
+ " .reset_index(name='counts')\n",
+ " .rename({'index': 'index'}, axis=1)\n",
+ " .sort_values('index', ascending=True))\n",
+ " xs = counted['index']\n",
+ " ys = counted['counts']\n",
+ " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n",
+ "df_sorted = _df_24.sort_values('index', ascending=True)\n",
+ "_plot_series(df_sorted, '')\n",
+ "sns.despine(fig=fig, ax=ax)\n",
+ "plt.xlabel('index')\n",
+ "_ = plt.ylabel('count()')"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "def _plot_series(series, series_name, series_index=0):\n",
+ " from matplotlib import pyplot as plt\n",
+ " import seaborn as sns\n",
+ " palette = list(sns.palettes.mpl_palette('Dark2'))\n",
+ " xs = series['Week of pending']\n",
+ " ys = series['Mean days listing to pending']\n",
+ " \n",
+ " plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')\n",
+ "df_sorted = _df_25.sort_values('Week of pending', ascending=True)\n",
+ "_plot_series(df_sorted, '')\n",
+ "sns.despine(fig=fig, ax=ax)\n",
+ "plt.xlabel('Week of pending')\n",
+ "_ = plt.ylabel('Mean days listing to pending')"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "Values
\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_26['index'].plot(kind='line', figsize=(8, 4), title='index')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_27['Week of pending'].plot(kind='line', figsize=(8, 4), title='Week of pending')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_28['Mean days listing to pending'].plot(kind='line', figsize=(8, 4), title='Mean days listing to pending')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "from matplotlib import pyplot as plt\n",
+ "_df_29['Mean price reduction percentage'].plot(kind='line', figsize=(8, 4), title='Mean price reduction percentage')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "text/html": [
+ " \n",
+ "

\n",
+ "
\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 2:** Create the labels set (y) from the “loan_status” column, and then create the features (X) DataFrame from the remaining columns."
+ ],
+ "metadata": {
+ "id": "_GzBQdVUNVc8"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Separate the data into labels and features\n",
+ "# Separate the y variable, the labels\n",
+ "y = df[\"Mean price reduction percentage\"]\n",
+ "\n",
+ "# Separate the X variable, the features\n",
+ "x = df.drop(columns=['Mean price reduction percentage'])\n"
+ ],
+ "metadata": {
+ "id": "D1Rc7verJSub"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Review the y variable Series\n",
+ "y[:5]\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1xpFwbdKLwcH",
+ "outputId": "a08926d0-9619-4687-f280-f0dbbaa98dfe"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 0.045619\n",
+ "1 0.046532\n",
+ "2 0.048536\n",
+ "3 0.049169\n",
+ "4 0.048168\n",
+ "Name: Mean price reduction percentage, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Review the X variable DataFrame\n",
+ "x[:5]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "nDzx6xzBMGf1",
+ "outputId": "f7f9f972-4a9f-4ac5-fb7f-c24740439e82"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Week of pending Mean days listing to pending\n",
+ "0 1517011200000000000 49\n",
+ "1 1517616000000000000 48\n",
+ "2 1518220800000000000 47\n",
+ "3 1518825600000000000 46\n",
+ "4 1519430400000000000 43"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Week of pending | \n",
+ " Mean days listing to pending | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1517011200000000000 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1517616000000000000 | \n",
+ " 48 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1518220800000000000 | \n",
+ " 47 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1518825600000000000 | \n",
+ " 46 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1519430400000000000 | \n",
+ " 43 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"x[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Week of pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 956272764434917,\n \"min\": 1517011200000000000,\n \"max\": 1519430400000000000,\n \"num_unique_values\": 5,\n \"samples\": [\n 1517616000000000000,\n 1519430400000000000,\n 1518220800000000000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Mean days listing to pending\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 43,\n \"max\": 49,\n \"num_unique_values\": 5,\n \"samples\": [\n 48,\n 43,\n 47\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 3:** Check the balance of the labels variable (y) by using the value_counts function."
+ ],
+ "metadata": {
+ "id": "3Io6qlPjNak3"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Check the balance of our target values\n",
+ "y.value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "r09FJ8yAMy0H",
+ "outputId": "a22a2eb9-4852-4694-bc14-a95c052f754c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.045619 1\n",
+ "0.051982 1\n",
+ "0.053724 1\n",
+ "0.053300 1\n",
+ "0.051753 1\n",
+ "0.052866 1\n",
+ "0.055280 1\n",
+ "0.055631 1\n",
+ "0.053722 1\n",
+ "0.050454 1\n",
+ "0.056315 1\n",
+ "0.049650 1\n",
+ "0.050125 1\n",
+ "0.050378 1\n",
+ "0.047866 1\n",
+ "0.047123 1\n",
+ "0.045601 1\n",
+ "0.045761 1\n",
+ "0.054597 1\n",
+ "0.055683 1\n",
+ "0.043061 1\n",
+ "0.049253 1\n",
+ "0.046162 1\n",
+ "0.046104 1\n",
+ "0.046359 1\n",
+ "0.047222 1\n",
+ "0.047829 1\n",
+ "0.048121 1\n",
+ "0.048495 1\n",
+ "0.050773 1\n",
+ "0.056189 1\n",
+ "0.052223 1\n",
+ "0.053757 1\n",
+ "0.053530 1\n",
+ "0.053945 1\n",
+ "0.054528 1\n",
+ "0.055790 1\n",
+ "0.056798 1\n",
+ "0.046843 1\n",
+ "0.045798 1\n",
+ "0.046940 1\n",
+ "0.046210 1\n",
+ "0.047247 1\n",
+ "0.047235 1\n",
+ "0.047233 1\n",
+ "0.046522 1\n",
+ "0.045901 1\n",
+ "0.046424 1\n",
+ "0.045528 1\n",
+ "0.045870 1\n",
+ "0.045268 1\n",
+ "0.043907 1\n",
+ "0.044843 1\n",
+ "0.044800 1\n",
+ "0.046553 1\n",
+ "0.047386 1\n",
+ "0.047171 1\n",
+ "0.045198 1\n",
+ "0.046643 1\n",
+ "0.045805 1\n",
+ "0.045522 1\n",
+ "0.044850 1\n",
+ "0.044360 1\n",
+ "0.044613 1\n",
+ "0.043009 1\n",
+ "0.043563 1\n",
+ "0.043809 1\n",
+ "0.044134 1\n",
+ "0.045389 1\n",
+ "0.045845 1\n",
+ "0.045695 1\n",
+ "0.045771 1\n",
+ "0.045599 1\n",
+ "0.045390 1\n",
+ "0.045341 1\n",
+ "0.045320 1\n",
+ "0.045145 1\n",
+ "0.046238 1\n",
+ "0.045686 1\n",
+ "0.045812 1\n",
+ "0.048135 1\n",
+ "0.045604 1\n",
+ "0.045066 1\n",
+ "0.044726 1\n",
+ "0.045745 1\n",
+ "0.046168 1\n",
+ "0.047203 1\n",
+ "0.048383 1\n",
+ "0.048976 1\n",
+ "0.051110 1\n",
+ "0.047989 1\n",
+ "0.047645 1\n",
+ "0.047183 1\n",
+ "0.046061 1\n",
+ "0.047091 1\n",
+ "0.048881 1\n",
+ "0.050298 1\n",
+ "0.045846 1\n",
+ "0.045500 1\n",
+ "0.045001 1\n",
+ "0.044129 1\n",
+ "0.043778 1\n",
+ "0.043440 1\n",
+ "0.044398 1\n",
+ "0.044736 1\n",
+ "0.045432 1\n",
+ "0.045794 1\n",
+ "0.044920 1\n",
+ "0.044752 1\n",
+ "0.044799 1\n",
+ "0.044975 1\n",
+ "0.044045 1\n",
+ "0.043614 1\n",
+ "0.043202 1\n",
+ "0.042720 1\n",
+ "0.043026 1\n",
+ "0.051039 1\n",
+ "0.049792 1\n",
+ "0.044952 1\n",
+ "0.045377 1\n",
+ "0.044858 1\n",
+ "0.045319 1\n",
+ "0.045720 1\n",
+ "0.045881 1\n",
+ "0.046039 1\n",
+ "0.046211 1\n",
+ "0.045521 1\n",
+ "0.045620 1\n",
+ "0.049232 1\n",
+ "0.045150 1\n",
+ "0.045212 1\n",
+ "0.044290 1\n",
+ "0.044224 1\n",
+ "0.043877 1\n",
+ "0.043960 1\n",
+ "0.044618 1\n",
+ "0.046626 1\n",
+ "0.047211 1\n",
+ "0.046666 1\n",
+ "0.046659 1\n",
+ "0.048853 1\n",
+ "0.049427 1\n",
+ "0.049208 1\n",
+ "0.047832 1\n",
+ "0.048498 1\n",
+ "0.047285 1\n",
+ "0.047152 1\n",
+ "0.047750 1\n",
+ "0.047673 1\n",
+ "0.047831 1\n",
+ "0.047822 1\n",
+ "0.047984 1\n",
+ "0.046614 1\n",
+ "0.045657 1\n",
+ "0.045615 1\n",
+ "0.044226 1\n",
+ "0.042040 1\n",
+ "0.046532 1\n",
+ "0.041244 1\n",
+ "0.039966 1\n",
+ "0.041317 1\n",
+ "0.041729 1\n",
+ "0.041253 1\n",
+ "0.040153 1\n",
+ "0.039415 1\n",
+ "0.040565 1\n",
+ "0.043115 1\n",
+ "0.039414 1\n",
+ "0.042381 1\n",
+ "0.041886 1\n",
+ "0.041866 1\n",
+ "0.040357 1\n",
+ "0.039980 1\n",
+ "0.038556 1\n",
+ "0.037744 1\n",
+ "0.039662 1\n",
+ "0.038653 1\n",
+ "0.044982 1\n",
+ "0.037157 1\n",
+ "0.036139 1\n",
+ "0.036328 1\n",
+ "0.036132 1\n",
+ "0.036197 1\n",
+ "0.036192 1\n",
+ "0.036505 1\n",
+ "0.036940 1\n",
+ "0.037096 1\n",
+ "0.038388 1\n",
+ "0.038085 1\n",
+ "0.037930 1\n",
+ "0.038165 1\n",
+ "0.039745 1\n",
+ "0.039106 1\n",
+ "0.038861 1\n",
+ "0.039304 1\n",
+ "0.037782 1\n",
+ "0.038099 1\n",
+ "0.037792 1\n",
+ "0.042494 1\n",
+ "0.039996 1\n",
+ "0.039832 1\n",
+ "0.040623 1\n",
+ "0.039714 1\n",
+ "0.041179 1\n",
+ "0.042466 1\n",
+ "0.042186 1\n",
+ "0.041987 1\n",
+ "0.037592 1\n",
+ "0.042380 1\n",
+ "0.042958 1\n",
+ "0.044458 1\n",
+ "0.047288 1\n",
+ "0.048168 1\n",
+ "0.049169 1\n",
+ "0.048536 1\n",
+ "0.040865 1\n",
+ "0.039733 1\n",
+ "0.040034 1\n",
+ "0.039404 1\n",
+ "0.037384 1\n",
+ "0.036740 1\n",
+ "0.036993 1\n",
+ "0.037161 1\n",
+ "0.037247 1\n",
+ "0.037718 1\n",
+ "0.037459 1\n",
+ "0.037497 1\n",
+ "0.037725 1\n",
+ "0.038276 1\n",
+ "0.038757 1\n",
+ "0.038978 1\n",
+ "0.038807 1\n",
+ "0.038921 1\n",
+ "0.038872 1\n",
+ "0.035653 1\n",
+ "0.035737 1\n",
+ "0.035691 1\n",
+ "0.036638 1\n",
+ "0.036394 1\n",
+ "0.036424 1\n",
+ "0.036310 1\n",
+ "0.036261 1\n",
+ "0.036011 1\n",
+ "0.036073 1\n",
+ "0.036614 1\n",
+ "0.036515 1\n",
+ "0.042576 1\n",
+ "0.036034 1\n",
+ "0.035148 1\n",
+ "0.034772 1\n",
+ "0.034588 1\n",
+ "0.034994 1\n",
+ "0.035242 1\n",
+ "0.036853 1\n",
+ "0.037385 1\n",
+ "0.037573 1\n",
+ "0.037336 1\n",
+ "0.037263 1\n",
+ "0.044064 1\n",
+ "0.044309 1\n",
+ "0.041625 1\n",
+ "0.040599 1\n",
+ "0.039862 1\n",
+ "0.039480 1\n",
+ "0.038424 1\n",
+ "0.038240 1\n",
+ "0.037810 1\n",
+ "0.037188 1\n",
+ "0.037572 1\n",
+ "0.037535 1\n",
+ "0.037357 1\n",
+ "0.037083 1\n",
+ "0.036345 1\n",
+ "0.040396 1\n",
+ "0.047368 1\n",
+ "0.035767 1\n",
+ "0.037008 1\n",
+ "0.038567 1\n",
+ "0.038548 1\n",
+ "0.038282 1\n",
+ "0.037703 1\n",
+ "0.037360 1\n",
+ "0.037168 1\n",
+ "0.037115 1\n",
+ "0.036685 1\n",
+ "0.047529 1\n",
+ "0.036529 1\n",
+ "0.036719 1\n",
+ "0.036386 1\n",
+ "0.036220 1\n",
+ "0.036004 1\n",
+ "0.035782 1\n",
+ "0.035825 1\n",
+ "0.038226 1\n",
+ "0.038709 1\n",
+ "0.039298 1\n",
+ "0.039516 1\n",
+ "0.046611 1\n",
+ "0.041044 1\n",
+ "0.040156 1\n",
+ "0.038269 1\n",
+ "0.039588 1\n",
+ "0.040546 1\n",
+ "0.040413 1\n",
+ "0.040950 1\n",
+ "0.040520 1\n",
+ "0.040126 1\n",
+ "0.040337 1\n",
+ "0.039900 1\n",
+ "0.038930 1\n",
+ "0.038947 1\n",
+ "0.039715 1\n",
+ "0.043273 1\n",
+ "Name: Mean price reduction percentage, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 4:** Split the data into training and testing datasets by using train_test_split.\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "##**Cannot run code from here down: y has NaN Values**\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "r016mPXJN1ip"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Import the train_test_learn module\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Split the data using train_test_split\n",
+ "# Assign a random_state of 1 to the function\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)"
+ ],
+ "metadata": {
+ "id": "eQv0QlgHN4K4"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Create a Logistic Regression Model with the Original Data\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "kuiR9a9jOEZb"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 1:** Fit a logistic regression model by using the training data (X_train and y_train)."
+ ],
+ "metadata": {
+ "id": "i6HO93VgOFLb"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Import the LogisticRegression module from SKLearn\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# Instantiate the Logistic Regression model\n",
+ "# Assign a random_state parameter of 1 to the model\n",
+ "logistic_regression_model = LogisticRegression(random_state=1)\n",
+ "\n",
+ "# Fit the model using training data\n",
+ "lr_model = logistic_regression_model.fit(x_train, y_train)\n"
+ ],
+ "metadata": {
+ "id": "D0c_G6DYOo07"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 2:** Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model."
+ ],
+ "metadata": {
+ "id": "wjrpEtc8PDJQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "# Make a prediction using the testing data\n",
+ "testing_predictions = lr_model.predict(x_test)"
+ ],
+ "metadata": {
+ "id": "1XoF3R5gPGHd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### **Step 3:** Evaluate the model’s performance by doing the following:\n",
+ "* Calculate the accuracy score of the model.\n",
+ "* Generate a confusion matrix.\n",
+ "* Print the classification report.\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "WsoTPyplPOEO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the balanced_accuracy score of the model\n",
+ "\n",
+ "print(f\"Training Data Score: {lr_model.score(x_train,y_train)}\")\n",
+ "print(f\"Testing Data Score: {lr_model.score(x_test,y_test)}\")\n",
+ "print(f\"Balanced Accuracy Score: {balanced_accuracy_score(y_test, testing_predictions)}\")\n",
+ "print(f\"The balanced accuracy with the original data is {testing_accuracy* 100:.2f}%\")"
+ ],
+ "metadata": {
+ "id": "iZHh9DiHPeSk"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Generate a confusion matrix for the model\n",
+ "testing_confusion_matrix = confusion_matrix(y_test, testing_prediction)\n",
+ "\n",
+ "print(f'The confusion matrix with the original data is:\\n{testing_confusion_matrix}')"
+ ],
+ "metadata": {
+ "id": "AwnFN2l-PmzU"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the classification report for the model\n",
+ "testing_report = classification_report(y_test, testing_prediction)\n",
+ "\n",
+ "#View the results\n",
+ "print(f'The classification report with the original data shows:\\n{testing_report}')"
+ ],
+ "metadata": {
+ "id": "VFA-LwvEPpWl"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Step 4:** Answer the following question.\n",
+ "\n",
+ "**Question:** How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?\n",
+ "\n",
+ "**Answer:** HEALTHY LOANS: For healthy loans, the precision is 1.00, the recall is 1.00 and the f1-score is 1.00, meaning the model is performing exceptionally well in identifying healthy loans without missing any. This scenario is ideal, indicating a high level of confidence in the model's ability to identify healthy loans.\n",
+ "\n",
+ "**HIGH-RISK LOANS:** The precision is 0.87 for high-risk loans, so the remaining 13% are false-positives. The recall is 0.89, so the remaining 11% are false negatives. The f1-score is 0.88, which suggests a good balance between precision and recall for high-risk loans. Therefore, the model performs moderately well for high-risk loans, but there is still room for improvement; it is highly likely that the financial field would require 95% or higher in order to retain confidence in the model."
+ ],
+ "metadata": {
+ "id": "Ko7WeFk2Pvo_"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Predict a Logistic Regression Model with Resampled Training Data"
+ ],
+ "metadata": {
+ "id": "l70oTU4NQAxR"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 1:** Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points."
+ ],
+ "metadata": {
+ "id": "-zddS4A8QExT"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "# Import the RandomOverSampler module form imbalanced-learn\n",
+ "from imblearn.over_sampling import RandomOverSampler\n",
+ "\n",
+ "# Instantiate the random oversampler model\n",
+ "# # Assign a random_state parameter of 1 to the model\n",
+ "ros_model = RandomOverSampler(random_state=1)\n",
+ "\n",
+ "# Fit the original training data to the random_oversampler model\n",
+ "x_res, y_res = ros_model.fit_resample(x_train, y_train)"
+ ],
+ "metadata": {
+ "id": "SN-ZBgQ3QGrd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Count the distinct values of the resampled labels data\n",
+ "\n",
+ "#y_res.value_counts()\n",
+ "print(y_res.value_counts())\n",
+ "\n",
+ "\n",
+ "# Check that my numbers are the same on both sides\n",
+ "unique_values, counts = np.unique(y_res, return_counts=True)\n",
+ "print(\"Resampled Labels: \", unique_values)\n",
+ "print(\"Label Counts: \", counts)"
+ ],
+ "metadata": {
+ "id": "EHYhxN5OQOPh"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 2:** Use the LogisticRegression classifier and the resampled data to fit the model and make predictions."
+ ],
+ "metadata": {
+ "id": "7iR7FqiAQTxd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Instantiate the Logistic Regression model\n",
+ "# Assign a random_state parameter of 1 to the model\n",
+ "lr_ros_model = LogisticRegression(random_state=1)\n",
+ "\n",
+ "# Fit the model using the resampled training data\n",
+ "lr_ros_model.fit(x_res, y_res)\n",
+ "\n",
+ "# Make a prediction using the testing data\n",
+ "testing_ros_predictions = lr_ros_model.predict(x_test)"
+ ],
+ "metadata": {
+ "id": "GeIFy1y5Q6c0"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Step 3:** Evaluate the model’s performance by doing the following:\n",
+ "\n",
+ "* Calculate the accuracy score of the model.\n",
+ "* Generate a confusion matrix.\n",
+ "* Print the classification report.\n",
+ "\n",
+ "\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "uquE8574RAvM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the balanced_accuracy score of the model\n",
+ "print(f\"Training Data Score: {lr_ros_model.score(x_res,y_res)}\")\n",
+ "print(f\"Testing Data Score: {lr_ros_model.score(x_test,y_test)}\")\n",
+ "\n",
+ "ros_balanced_accuracy_score = balanced_accuracy_score(y_test, testing_ros_predictions)\n",
+ "print(f\"Balanced Accuracy Score: {ros_balanced_accuracy_score}\")\n",
+ "print(f\"The balanced accuracy with the original data is {ros_balanced_accuracy_score* 100:.2f}%\")"
+ ],
+ "metadata": {
+ "id": "GEeUM8MNROxS"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Generate a confusion matrix for the model\n",
+ "ros_confusion_matrix = confusion_matrix(y_test, testing_ros_predictions)\n",
+ "print(f'The confusion matrix with the oversampled data is:\\n{ros_confusion_matrix}')"
+ ],
+ "metadata": {
+ "id": "fYxO0OVWRVdL"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the classification report for the model\n",
+ "ros_classification_report = classification_report(y_test, testing_ros_predictions)\n",
+ "\n",
+ "#View the results\n",
+ "print(f'The classification report with the oversampled data shows:\\n{ros_classification_report}')"
+ ],
+ "metadata": {
+ "id": "kbbYfKiJRXH6"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "###**Step 4:** Answer the following question\n",
+ "**Question:** How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?\n",
+ "\n",
+ "**Answer: **The logistic regression model, fit with the oversampled data, better predicts the healthy and high-risk loan labels than our first report. The healthy loans precision, recall, and f1-score remain the same as before; likewise, the high-risk loan labels have the same precision measurement. However, the recall has gone up 0.01 to a perfect \"1.00\", while the oversampled f1-score has increase the most, by 0.05, bringing it's score to \"0.93\". These changes make the logistic regression model, fit with the oversampled data, does a better job in catching the incorrect labelling of high-risk loans as healthy..\n"
+ ],
+ "metadata": {
+ "id": "9mHDdJTsRZp9"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "---\n",
+ "\n",
+ "\n",
+ "# Import necessary libraries\n",
+ "\n",
+ "#### ATTEMPT ADVANCED REGRESSION TECHNIQUES WITH OUR DATA\n",
+ "(from here down↓)\n",
+ "\n",
+ "\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "UPDfjtYqR5K2"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "pd.set_option('display.max_rows', None)"
+ ],
+ "metadata": {
+ "id": "wIWDHUBISnQm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "plt.figure(figsize=(18,6))\n",
+ "plt.title('Heatmap of missing values')\n",
+ "sns.heatmap(df_combined.isnull(),yticklabels=False,cbar=False,cmap='viridis')"
+ ],
+ "metadata": {
+ "id": "SE9oT90VUoEC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "unique_values = []\n",
+ "for col in object_cols:\n",
+ " unique_values.append(df_combined[col].unique().size)\n",
+ "plt.figure(figsize=(18,6))\n",
+ "plt.title('No. Unique values of Categorical Features')\n",
+ "plt.xticks(rotation=90)\n",
+ "sns.barplot(x=object_cols,y=unique_values)"
+ ],
+ "metadata": {
+ "id": "gfMF3htTUsTZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "plt.figure(figsize=(18,36))\n",
+ "plt.title('Categorical Features: Distribution')\n",
+ "plt.xticks(rotation=90)\n",
+ "index = 1\n",
+ "for col in object_cols:\n",
+ " y = df_combined[col].value_counts()\n",
+ " plt.subplot(11,4,index)\n",
+ " plt.xticks(rotation=90)\n",
+ " sns.barplot(x=list(y.index), y=y)\n",
+ " index +=1"
+ ],
+ "metadata": {
+ "id": "moe_Mxr0UwGG"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "### **Fill up missing values:**\n",
+ "* Drop the features 'Alley', 'Fence', and 'MiscFeature'.\n",
+ "\n",
+ "* Drop 'Utilities' feature, as all but one have the value 'AllPub'\n",
+ "\n",
+ "* All entries with missing 'FirePlaceQu' have 'Fireplaces' = 0. Hence fill missing values with 'NA'.\n",
+ "\n",
+ "* All but one entries with missing 'PoolQC' value have 'PoolArea' = 0. Use mode for missing value with non-zero PoolArea. Use 'NA' for the rest of the entries.\n",
+ "\n",
+ "* **Basement features:** Fill missing values with 'NA' or '0'.\n",
+ "\n",
+ "* **Garage features:** Fill missing values with 'NA' or '0'.\n",
+ "\n",
+ "* **Remaining Integer and Real features:** fill up missing values with mean of the corresponding feature.\n",
+ "\n",
+ "* **Remaining Categorical features:** fill up missing values with mode of the corresponding feature."
+ ],
+ "metadata": {
+ "id": "sP1rorjKVCeo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_combined.drop(columns='Id',inplace=True); print('Drop Id \\n')\n",
+ "df_combined['MSZoning'] = df_combined['MSZoning'].fillna(df_combined['MSZoning'].mode()[0])\n",
+ "df_combined['LotFrontage'] = df_combined['LotFrontage'].fillna(df_combined['LotFrontage'].mean())\n",
+ "df_combined.drop(columns='Alley',inplace=True); print('Drop Alley \\n')\n",
+ "\n",
+ "#df_combined['Utilities'] = df_combined['Utilities'].fillna(df_combined['Utilities'].mode()[0])\n",
+ "print(df_combined['Utilities'].value_counts())\n",
+ "df_combined.drop(columns='Utilities',inplace=True); print('Drop Utilities \\n')\n",
+ "\n",
+ "df_combined['Exterior1st'] = df_combined['Exterior1st'].fillna(df_combined['Exterior1st'].mode()[0])\n",
+ "df_combined['Exterior2nd'] = df_combined['Exterior2nd'].fillna(df_combined['Exterior2nd'].mode()[0])\n",
+ "df_combined['MasVnrType'] = df_combined['MasVnrType'].fillna(df_combined['MasVnrType'].mode()[0])\n",
+ "df_combined['MasVnrArea'] = df_combined['MasVnrArea'].fillna(df_combined['MasVnrArea'].mean())\n",
+ "df_combined['Electrical'] = df_combined['Electrical'].fillna(df_combined['Electrical'].mode()[0])\n",
+ "df_combined['KitchenQual'] = df_combined['KitchenQual'].fillna(df_combined['KitchenQual'].mode()[0])\n",
+ "df_combined['Functional'] = df_combined['Functional'].fillna(df_combined['Functional'].mode()[0])\n",
+ "\n",
+ "#df_combined.loc[(df_combined['Fireplaces'] != 0) & (df_combined['FireplaceQu'].isnull()) ][['FireplaceQu','Fireplaces']]\n",
+ "df_combined['FireplaceQu'] = df_combined['FireplaceQu'].fillna('NA'); print('FirePlaceQu: Fill NA values for missing values \\n')\n",
+ "\n",
+ "df_combined.loc[(df_combined['PoolQC'].isnull()) & (df_combined['PoolArea']>0)][['PoolQC','PoolArea']]\n",
+ "df_combined.at[2599,'PoolQC'] = df_combined['PoolQC'].mode()[0]; print('PoolQC: Use mode for missing value with non-zero PoolArea \\n')\n",
+ "df_combined['PoolQC'] = df_combined['PoolQC'].fillna('NA'); print('PoolQC: Use NA for remaining missing values \\n')\n",
+ "\n",
+ "df_combined['SaleType'] = df_combined['SaleType'].fillna(df_combined['SaleType'].mode()[0])\n",
+ "df_combined.drop(columns=['Fence','MiscFeature','SalePrice'],inplace=True); print('Drop Fence, MiscFeature and SalePrice\\n')\n",
+ "\n",
+ "# Basement Features\n",
+ "#df_combined.loc[df_combined['BsmtQual'].isnull()][['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath']].head()\n",
+ "#df_combined.loc[df_combined['TotalBsmtSF'].isnull()][['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath']]\n",
+ "print('Fill missing values of Basement features with NA or 0 \\n')\n",
+ "df_combined['BsmtQual'] = df_combined['BsmtQual'].fillna('NA')\n",
+ "df_combined['BsmtCond'] = df_combined['BsmtCond'].fillna('NA')\n",
+ "df_combined['BsmtExposure'] = df_combined['BsmtExposure'].fillna('NA')\n",
+ "df_combined['BsmtFinType1'] = df_combined['BsmtFinType1'].fillna('NA')\n",
+ "df_combined['BsmtFinType2'] = df_combined['BsmtFinType2'].fillna('NA')\n",
+ "\n",
+ "df_combined['BsmtFinSF1'] = df_combined['BsmtFinSF1'].fillna(int(0))\n",
+ "df_combined['BsmtFinSF2'] = df_combined['BsmtFinSF2'].fillna(int(0))\n",
+ "df_combined['BsmtUnfSF'] = df_combined['BsmtUnfSF'].fillna(int(0))\n",
+ "df_combined['TotalBsmtSF'] = df_combined['TotalBsmtSF'].fillna(int(0))\n",
+ "df_combined['BsmtFullBath'] = df_combined['BsmtFullBath'].fillna(int(0))\n",
+ "df_combined['BsmtHalfBath'] = df_combined['BsmtHalfBath'].fillna(int(0))\n",
+ "\n",
+ "# Garage Features\n",
+ "# df_combined.loc[df_combined['GarageCond'].isnull()][['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual','GarageCond']].head()\n",
+ "print('Fill missing values of Garage features with NA or 0 \\n')\n",
+ "df_combined['GarageType'] = df_combined['GarageType'].fillna('NA')\n",
+ "df_combined['GarageFinish'] = df_combined['GarageFinish'].fillna('NA')\n",
+ "df_combined['GarageCond'] = df_combined['GarageCond'].fillna('NA')\n",
+ "df_combined['GarageQual'] = df_combined['GarageQual'].fillna('NA')\n",
+ "df_combined['GarageCars'] = df_combined['GarageCars'].fillna(int(0))\n",
+ "df_combined['GarageArea'] = df_combined['GarageArea'].fillna(int(0))\n",
+ "df_combined['GarageYrBlt'] = df_combined['GarageYrBlt'].fillna(int(0))"
+ ],
+ "metadata": {
+ "id": "WJfk86_rVf3M"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_combined.head()"
+ ],
+ "metadata": {
+ "id": "dscyrp-FVn1A"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Check that all missing values have been taken care of.\n"
+ ],
+ "metadata": {
+ "id": "Hdj1pricVsXL"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(df_combined.isnull().sum().sum())\n"
+ ],
+ "metadata": {
+ "id": "He2FBbo5Vwu0"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Data Preprocessing\n"
+ ],
+ "metadata": {
+ "id": "VjMXvf_eVyyn"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Import OneHotEncoder to encode categorical features\n"
+ ],
+ "metadata": {
+ "id": "hvYemVQ0V2Kw"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n"
+ ],
+ "metadata": {
+ "id": "kYKdZVSAV6B8"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "s = (df_combined.dtypes == 'object')\n",
+ "object_cols = list(s[s].index)\n",
+ "print(\"Categorical variables:\")\n",
+ "print(object_cols)\n",
+ "print('No. of. categorical features: ',len(object_cols))"
+ ],
+ "metadata": {
+ "id": "B_4wlFrTV9V5"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "OH_encoder = OneHotEncoder(sparse_output=False)\n",
+ "OH_cols = pd.DataFrame(OH_encoder.fit_transform(df_combined[object_cols]))\n",
+ "OH_cols.index = df_combined.index\n",
+ "OH_cols.columns = OH_encoder.get_feature_names_out()\n",
+ "df_final = df_combined.drop(object_cols, axis=1)\n",
+ "df_final = pd.concat([df_final, OH_cols], axis=1)"
+ ],
+ "metadata": {
+ "id": "6G0xExDMV_0s"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_final.head()\n"
+ ],
+ "metadata": {
+ "id": "bqdO2y8PWCI4"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Check that the shapes are consistent\n",
+ "\n",
+ "print('df_final shape:', df_final.shape)\n",
+ "print('df_train shape:', df_Train.shape)\n",
+ "print('df_test shape:', df_Test.shape)\n",
+ "\n",
+ "X_Train = pd.DataFrame(df_final[:1460])\n",
+ "X_Test = pd.DataFrame(df_final[1460:])\n",
+ "Y_Train = df_Train['SalePrice']\n",
+ "\n",
+ "print('\\nCheck that the datasets are consistent:\\n')\n",
+ "print('X_train shape', X_Train.shape)\n",
+ "print('Y_train shape:', Y_Train.shape)\n",
+ "print('X_test shape:', X_Test.shape)"
+ ],
+ "metadata": {
+ "id": "XTjjDol_WFTE"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Model selection and prediction\n"
+ ],
+ "metadata": {
+ "id": "nosipS5zWNw4"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import mean_absolute_error\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Split the training set into training and validation set\n",
+ "\n",
+ "X_train, X_valid, Y_train, Y_valid = train_test_split(X_Train, Y_Train, train_size=0.8, test_size=0.2,random_state=0)\n"
+ ],
+ "metadata": {
+ "id": "DhE6b407WRw3"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "model_RFR = RandomForestRegressor()\n",
+ "model_RFR.fit(X_train, Y_train)\n",
+ "Y_pred = model_RFR.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "AX-GGnP2WUGn"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.ensemble import GradientBoostingRegressor\n",
+ "\n",
+ "model_GBR = GradientBoostingRegressor()\n",
+ "model_GBR.fit(X_train, Y_train)\n",
+ "Y_pred = model_GBR.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "JU0_vCHRWWp7"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import SGDRegressor\n",
+ "\n",
+ "model_SGD = SGDRegressor()\n",
+ "model_SGD.fit(X_train, Y_train)\n",
+ "Y_pred = model_SGD.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "ymYn_ciGWZPk"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "from xgboost import XGBRegressor\n",
+ "\n",
+ "model_XGBR = XGBRegressor(learning_rate=0.03,n_estimators=200,objective='reg:squarederror')\n",
+ "model_XGBR.fit(X_train,Y_train)\n",
+ "Y_pred = model_XGBR.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "oY6ubKpSWbe0"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "plt.figure()\n",
+ "plt.title('Comparison of Sale Price of Predicted and Actual values')\n",
+ "plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')\n",
+ "plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')\n",
+ "plt.legend()"
+ ],
+ "metadata": {
+ "id": "i-6Qp5C9Wdti"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "model = XGBRegressor()\n",
+ "\n",
+ "n_estimators = [100, 200, 500]\n",
+ "learning_rates = [0.03,0.1,0.3]\n",
+ "objectives = ['reg:squarederror']\n",
+ "\n",
+ "# Define the grid of hyperparameters to search\n",
+ "hyperparameter_grid = {\n",
+ " 'n_estimators' : n_estimators,\n",
+ " 'learning_rate':learning_rates,\n",
+ " 'objective' : objectives\n",
+ " }\n",
+ "\n",
+ "grid_cv = GridSearchCV(estimator = model,\n",
+ " param_grid = hyperparameter_grid,\n",
+ " scoring = 'neg_mean_absolute_error',\n",
+ " return_train_score = True)\n",
+ "\n",
+ "grid_cv.fit(X_Train,Y_Train)"
+ ],
+ "metadata": {
+ "id": "UF78kE6lWiEx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "grid_cv.best_score_\n"
+ ],
+ "metadata": {
+ "id": "DnHxgmeiWkMZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "grid_cv.best_estimator_\n"
+ ],
+ "metadata": {
+ "id": "Y8iu_B8PWlgD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "Y_pred = grid_cv.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "CxqTwQQeWoxH"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "regressor = grid_cv.best_estimator_\n",
+ "Y_pred = regressor.predict(X_valid)\n",
+ "print(mean_absolute_error(Y_valid, Y_pred))"
+ ],
+ "metadata": {
+ "id": "i7JT1mMhWqps"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "plt.figure()\n",
+ "plt.title('Comparison of Sale Price of Predicted and Actual values')\n",
+ "plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')\n",
+ "plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')\n",
+ "plt.scatter(Y_Train,regressor.predict(X_Train),label='Best model')\n",
+ "plt.legend()"
+ ],
+ "metadata": {
+ "id": "yr540r-3Ws9G"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Preparation of Submission Data\n"
+ ],
+ "metadata": {
+ "id": "2IhMjO-BWv0N"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "Y_Pred = regressor.predict(X_Test)\n"
+ ],
+ "metadata": {
+ "id": "PbNuaJvMWyV5"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "Y_Pred\n"
+ ],
+ "metadata": {
+ "id": "aBtFxzuHW0Aa"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "Y_Pred.shape\n"
+ ],
+ "metadata": {
+ "id": "95r46nkXW1s-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sub = pd.DataFrame()\n",
+ "sub['Id'] = df_Test['Id']\n",
+ "sub['SalePrice'] = Y_Pred"
+ ],
+ "metadata": {
+ "id": "Xlz2LofuW4Gn"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sub.head()\n"
+ ],
+ "metadata": {
+ "id": "4iTePcIbW6i2"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sub.tail()\n"
+ ],
+ "metadata": {
+ "id": "5mXLjSHpW8Ut"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sub.to_csv('Submission.csv', index=False)\n"
+ ],
+ "metadata": {
+ "id": "xJJzdFJqW980"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb b/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb
deleted file mode 100644
index c095475..0000000
--- a/Spark_Setup_(module_21,_optimized_21,_&_22).ipynb
+++ /dev/null
@@ -1,1119 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": [],
- "authorship_tag": "ABX9TyObpjxondblx6TF1OHcVN/z",
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "nTCutDQGoOxY"
- },
- "outputs": [],
- "source": [
- "!pip install keras-tuner --upgrade\n",
- "%matplotlib inline\n",
- "\n",
- "# Import our dependencies\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "import pandas as pd\n",
- "import tensorflow as tf\n",
- "\n",
- "# Import and read the charity_data.csv.\n",
- "import pandas as pd\n",
- "application_df = pd.read_csv(\"https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv\")\n",
- "application_df.head()\n",
- "application_df.tail()"
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.\n",
- "application_df = application_df.drop(columns = ['EIN', 'NAME'],axis=1)\n",
- "application_df.head()\n",
- "application_df.tail()"
- ],
- "metadata": {
- "id": "hL-nfZhKovLI"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Determine the number of unique values in each column.\n",
- "application_df.nunique()"
- ],
- "metadata": {
- "id": "VhrRNiptovy9"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Look at APPLICATION_TYPE value counts for binning\n",
- "application_df_counts = application_df['APPLICATION_TYPE'].value_counts()\n",
- "application_df_counts"
- ],
- "metadata": {
- "id": "54sMSuG0ozWV"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Choose a cutoff value and create a list of application types to be replaced\n",
- "# use the variable name `application_types_to_replace`\n",
- "application_types_to_replace = list(application_df_counts[application_df_counts<500].index)\n",
- "\n",
- "# Replace in dataframe\n",
- "for app in application_types_to_replace:\n",
- " application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,\"Other\")\n",
- "\n",
- "# Check to make sure binning was successful\n",
- "application_df['APPLICATION_TYPE'].value_counts()"
- ],
- "metadata": {
- "id": "Mkrjf-6Lo2IL"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Look at CLASSIFICATION value counts for binning\n",
- "class_counts = application_df['CLASSIFICATION'].value_counts()\n",
- "class_counts"
- ],
- "metadata": {
- "id": "lhtrnl4Jo4oE"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# You may find it helpful to look at CLASSIFICATION value counts >1\n",
- "class_type = class_counts.loc[class_counts > 1]\n",
- "class_type"
- ],
- "metadata": {
- "id": "cCrl-BZ5o66g"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Choose a cutoff value and create a list of classifications to be replaced\n",
- "# use the variable name `classifications_to_replace`\n",
- "classifications_to_replace = list(class_counts[class_counts < 1000].index)\n",
- "\n",
- "# Replace in dataframe\n",
- "for cls in classifications_to_replace:\n",
- " application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,\"Other\")\n",
- "\n",
- "# Check to make sure binning was successful\n",
- "application_df['CLASSIFICATION'].value_counts()"
- ],
- "metadata": {
- "id": "NQA9sL5Ho679"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Convert categorical data to numeric with `pd.get_dummies`\n",
- "dummies_df = pd.get_dummies(application_df)\n",
- "dummies_df.head()\n"
- ],
- "metadata": {
- "id": "ZLaV6OVVo9wu"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Split our preprocessed data into our features and target arrays\n",
- "y = dummies_df['IS_SUCCESSFUL'].values\n",
- "X = dummies_df.drop('IS_SUCCESSFUL', axis=1).values\n",
- "\n",
- "# Split the preprocessed data into a training and testing dataset\n",
- "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=78, test_size=.2)"
- ],
- "metadata": {
- "id": "JiE71OCRpBK8"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"X_train Shape: {X_train.shape}, X_test Shape: {X_test.shape}\")"
- ],
- "metadata": {
- "id": "F-DkLL-epDuv"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Create a StandardScaler instances\n",
- "scaler = StandardScaler()\n",
- "\n",
- "# Fit the StandardScaler\n",
- "X_scaler = scaler.fit(X_train)\n",
- "\n",
- "# Scale the data\n",
- "X_train_scaled = X_scaler.transform(X_train)\n",
- "X_test_scaled = X_scaler.transform(X_test)"
- ],
- "metadata": {
- "id": "ruk8SGQzpFtM"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# installing kera_tuner upgrade\n",
- "# !pip install keras-tuner --upgrade\n",
- "\n",
- "# Import the kerastuner library\n",
- "#import keras_tuner as kt"
- ],
- "metadata": {
- "id": "l5_243-lpFu0"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "import warnings\n",
- "warnings.filterwarnings(\"ignore\")"
- ],
- "metadata": {
- "id": "xYT_dwpMpKTA"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n",
- "input_features = len(X_train_scaled[0])\n",
- "hidden_nodes_layer1 = 80\n",
- "hidden_nodes_layer2 = 30\n",
- "hidden_nodes_layer3 = 1\n",
- "\n",
- "nn = tf.keras.models.Sequential()\n",
- "\n",
- "# First hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n",
- " input_dim=input_features, activation=\"relu\"))\n",
- "\n",
- "# Second hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n",
- "\n",
- "# Output layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"sigmoid\"))\n",
- "\n",
- "\n",
- "# Check the structure of the model\n",
- "nn.summary()"
- ],
- "metadata": {
- "id": "6frBap78pKUU"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Compile the model\n",
- "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])"
- ],
- "metadata": {
- "id": "7Jul0GDHpKXc"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Train the model\n",
- "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)"
- ],
- "metadata": {
- "id": "wckfODjopKaT"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Evaluate the model using the test data\n",
- "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n",
- "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")"
- ],
- "metadata": {
- "id": "mG26GVMKpKc7"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Export our model to HDF5 file\n",
- "nn.save(\"AlphabetSoupCharity.h5\")"
- ],
- "metadata": {
- "id": "jQFjjkWFpKfx"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "!pip install keras-tuner --upgrade\n",
- "%matplotlib inline\n",
- "\n",
- "!pip install tensorflow\n",
- "\n",
- "# Import our dependencies\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "import pandas as pd\n",
- "import tensorflow as tf\n",
- "\n",
- "# Import and read the charity_data.csv.\n",
- "import pandas as pd\n",
- "application_df = pd.read_csv(\"https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv\")\n",
- "application_df.head()\n",
- "application_df.tail()"
- ],
- "metadata": {
- "id": "MJJBDZhxpKky"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "\n",
- "# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.\n",
- "application_df = application_df.drop(columns = ['EIN', 'NAME'],axis=1)\n",
- "application_df.head()\n",
- "application_df.tail()\n"
- ],
- "metadata": {
- "id": "GvPAjnEMpKnF"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Determine the number of unique values in each column.\n",
- "application_df.nunique()\n"
- ],
- "metadata": {
- "id": "3f859pSRpKpq"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Look at APPLICATION_TYPE value counts for binning\n",
- "application_df_counts = application_df['APPLICATION_TYPE'].value_counts()\n",
- "application_df_counts"
- ],
- "metadata": {
- "id": "u73oTiqApKsT"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Choose a cutoff value and create a list of application types to be replaced\n",
- "# use the variable name `application_types_to_replace`\n",
- "application_types_to_replace = list(application_df_counts[application_df_counts<500].index)\n",
- "\n",
- "# Replace in dataframe\n",
- "for app in application_types_to_replace:\n",
- " application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,\"Other\")\n",
- "\n",
- "# Check to make sure binning was successful\n",
- "application_df['APPLICATION_TYPE'].value_counts()"
- ],
- "metadata": {
- "id": "HvTxcJmRpKup"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Look at CLASSIFICATION value counts for binning\n",
- "class_counts = application_df['CLASSIFICATION'].value_counts()\n",
- "class_counts"
- ],
- "metadata": {
- "id": "MQSfOqmUpjL5"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# You may find it helpful to look at CLASSIFICATION value counts >1\n",
- "class_type = class_counts.loc[class_counts > 1]\n",
- "class_type"
- ],
- "metadata": {
- "id": "BF9VYHRJpjOw"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- " Choose a cutoff value and create a list of classifications to be replaced\n",
- "# use the variable name `classifications_to_replace`\n",
- "classifications_to_replace = list(class_counts[class_counts < 1000].index)\n",
- "\n",
- "# Replace in dataframe\n",
- "for cls in classifications_to_replace:\n",
- " application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,\"Other\")\n",
- "\n",
- "# Check to make sure binning was successful\n",
- "application_df['CLASSIFICATION'].value_counts()"
- ],
- "metadata": {
- "id": "LsF3Ozx6pjRV"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Convert categorical data to numeric with `pd.get_dummies`\n",
- "dummies_df = pd.get_dummies(application_df)\n",
- "dummies_df.head()"
- ],
- "metadata": {
- "id": "YFcUF3hzpjUx"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Split our preprocessed data into our features and target arrays\n",
- "y = dummies_df['IS_SUCCESSFUL'].values\n",
- "X = dummies_df.drop('IS_SUCCESSFUL', axis=1).values\n",
- "\n",
- "# Split the preprocessed data into a training and testing dataset\n",
- "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=78, test_size=.2)\n"
- ],
- "metadata": {
- "id": "izZ1YXSlpjXF"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"X_train Shape: {X_train.shape}, X_test Shape: {X_test.shape}\")\n"
- ],
- "metadata": {
- "id": "Nb3Xn_MOpjZa"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Create a StandardScaler instances\n",
- "scaler = StandardScaler()\n",
- "\n",
- "# Fit the StandardScaler\n",
- "X_scaler = scaler.fit(X_train)\n",
- "\n",
- "# Scale the data\n",
- "X_train_scaled = X_scaler.transform(X_train)\n",
- "X_test_scaled = X_scaler.transform(X_test)"
- ],
- "metadata": {
- "id": "Pv3LdlUOpjcR"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "#Import the kerastuner library\n",
- "import keras_tuner as kt"
- ],
- "metadata": {
- "id": "ZpjXPBbLpje3"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "#FIRST ATTEMPT\n",
- "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n",
- "input_features = len(X_train_scaled[0])\n",
- "hidden_nodes_layer1 = 80\n",
- "hidden_nodes_layer2 = 30\n",
- "hidden_nodes_layer3 = 1\n",
- "\n",
- "nn = tf.keras.models.Sequential()\n",
- "\n",
- "# First hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n",
- " input_dim=input_features, activation=\"tanh\"))\n",
- "\n",
- "# Second hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n",
- "\n",
- "# Third layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n",
- "\n",
- "# Outer Layer\n",
- "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n",
- "\n",
- "\n",
- "# Check the structure of the model\n",
- "nn.summary()"
- ],
- "metadata": {
- "id": "WT1DSYt6pjhw"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *FIRST ATTEMPT* Compile the model\n",
- "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])"
- ],
- "metadata": {
- "id": "xi9sevJPpjj0"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *FIRST ATTEMPT* Train the model\n",
- "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)"
- ],
- "metadata": {
- "id": "osGv-I_Vpjms"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *FIRST ATTEMPT* Evaluate the model using the test data\n",
- "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n",
- "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")"
- ],
- "metadata": {
- "id": "2iblbOWSpjpF"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *FIRST ATTEMPT* ACCURACY\n",
- "First Attempt Accuracy = 72.5%"
- ],
- "metadata": {
- "id": "z3UZkZgxpjrp"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "#SECOND ATTEMPT\n",
- "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n",
- "input_features = len(X_train_scaled[0])\n",
- "hidden_nodes_layer1 = 100\n",
- "hidden_nodes_layer2 = 30\n",
- "hidden_nodes_layer3 = 1\n",
- "\n",
- "nn = tf.keras.models.Sequential()\n",
- "\n",
- "# First hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n",
- " input_dim=input_features, activation=\"relu\"))\n",
- "\n",
- "# Second hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n",
- "\n",
- "# Third layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n",
- "\n",
- "# Outer Layer\n",
- "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n",
- "\n",
- "\n",
- "# Check the structure of the model\n",
- "nn.summary()"
- ],
- "metadata": {
- "id": "xrhb6QMfpjuP"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *SECOND ATTEMPT* Compile the model\n",
- "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])"
- ],
- "metadata": {
- "id": "NCXuyAQJpjw2"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *SECOND ATTEMPT* Train the model\n",
- "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)"
- ],
- "metadata": {
- "id": "VRxl-SKnpjzc"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *SECOND ATTEMPT* Evaluate the model using the test data\n",
- "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n",
- "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")"
- ],
- "metadata": {
- "id": "t0rABw3Gpj2F"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *SECOND* ACCURACY\n",
- "# Changes for second attempt: first hidden layer activation from \"tanh\" to \"relu\"\n",
- "# increase hidden layer nodes 1 from 80 to 100\n",
- "# Second Attempt Accuracy = 72.1% (vs first attempt of 72.5%)"
- ],
- "metadata": {
- "id": "9DftVHWPpj4Z"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *THIRD ATTEMPT*\n",
- "# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.\n",
- "input_features = len(X_train_scaled[0])\n",
- "hidden_nodes_layer1 = 80\n",
- "hidden_nodes_layer2 = 25\n",
- "hidden_nodes_layer3 = 2\n",
- "\n",
- "nn = tf.keras.models.Sequential()\n",
- "\n",
- "# First hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,\n",
- " input_dim=input_features, activation=\"relu\"))\n",
- "\n",
- "# Second hidden layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation=\"sigmoid\"))\n",
- "\n",
- "# Third layer\n",
- "nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation=\"relu\"))\n",
- "\n",
- "# Outer Layer\n",
- "nn.add(tf.keras.layers.Dense(units=1, activation=\"sigmoid\"))\n",
- "\n",
- "\n",
- "# Check the structure of the model\n",
- "nn.summary()\n"
- ],
- "metadata": {
- "id": "J4zv1OSNqZwx"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *THIRD ATTEMPT* Compile the model\n",
- "nn.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n"
- ],
- "metadata": {
- "id": "oFw0EosHqZxr"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *THIRD ATTEMPT* Train the model\n",
- "fit_model = nn.fit(X_train_scaled, y_train, epochs=100)"
- ],
- "metadata": {
- "id": "xrKzAwGLqZ44"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *THIRD ATTEMPT* Evaluate the model using the test data\n",
- "model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)\n",
- "print(f\"Loss: {model_loss}, Accuracy: {model_accuracy}\")"
- ],
- "metadata": {
- "id": "6-eqf6xCqZ7d"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# *THIRD* ACCURACY\n",
- "# Third Attempt Accuracy = 72.2%"
- ],
- "metadata": {
- "id": "NLzLER_jqZ-E"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "import os\n",
- "# Find the latest version of spark 3.x from http://www.apache.org/dist/spark/ and enter as the spark version\n",
- "# For example:\n",
- "# spark_version = 'spark-3.5.1'\n",
- "spark_version = 'spark-3.5.1'\n",
- "os.environ['SPARK_VERSION']=spark_version\n",
- "\n",
- "# Install Spark and Java\n",
- "!apt-get update\n",
- "!apt-get install openjdk-11-jdk-headless -qq > /dev/null\n",
- "!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz\n",
- "!tar xf $SPARK_VERSION-bin-hadoop3.tgz\n",
- "!pip install -q findspark\n",
- "\n",
- "# Set Environment Variables\n",
- "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
- "os.environ[\"SPARK_HOME\"] = f\"/content/{spark_version}-bin-hadoop3\"\n",
- "\n",
- "# Start a SparkSession\n",
- "import findspark\n",
- "findspark.init()"
- ],
- "metadata": {
- "id": "Wpwrv8_dqaAe"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Import packages\n",
- "from pyspark.sql import SparkSession\n",
- "import time\n",
- "\n",
- "# Create a SparkSession\n",
- "spark = SparkSession.builder.appName(\"SparkSQL\").getOrCreate()"
- ],
- "metadata": {
- "id": "4gd5ZX0XqaCw"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 1. Read in the AWS S3 bucket into a DataFrame.\n",
- "from pyspark import SparkFiles\n",
- "url = \"https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/home_sales_revised.csv\"\n",
- "spark.sparkContext.addFile(url)\n",
- "df = spark.read.csv(SparkFiles.get(\"home_sales_revised.csv\"), sep=\",\", header=True)\n",
- "\n",
- "# Show DataFrame\n",
- "df.show()\n"
- ],
- "metadata": {
- "id": "8iLGqnOKqaFG"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "\n",
- "# Print our schema\n",
- "df.printSchema()\n",
- "\n",
- "# 2. Create a temporary view of the DataFrame.\n",
- "\n",
- "df.createOrReplaceTempView('home_sales')"
- ],
- "metadata": {
- "id": "2TKsOjUcqaHt"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 3. What is the average price for a four bedroom house sold per year, rounded to two decimal places?\n",
- "a = \"\"\"\n",
- "SELECT\n",
- " YEAR(date) AS YEAR,\n",
- " ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM home_sales\n",
- "WHERE bedrooms = 4\n",
- "GROUP BY YEAR\n",
- "ORDER BY YEAR DESC\n",
- "\"\"\"\n",
- "spark.sql(a).show()"
- ],
- "metadata": {
- "id": "0pIPKxgTqaJy"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 4. What is the average price of a home for each year the home was built,\n",
- "# that have 3 bedrooms and 3 bathrooms, rounded to two decimal places?\n",
- "b = \"\"\"\n",
- "SELECT\n",
- " YEAR(date_built) AS YEAR,\n",
- " ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM home_sales\n",
- "WHERE bedrooms = 3\n",
- "and bathrooms = 3\n",
- "GROUP BY YEAR(date_built)\n",
- "ORDER BY YEAR DESC\n",
- "\"\"\"\n",
- "spark.sql(b).show()\n"
- ],
- "metadata": {
- "id": "Yfr0e-XxqaMa"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 5. What is the average price of a home for each year the home was built,\n",
- "# that have 3 bedrooms, 3 bathrooms, with two floors,\n",
- "# and are greater than or equal to 2,000 square feet, rounded to two decimal places?\n",
- "\n",
- "\n",
- "c = \"\"\"\n",
- "SELECT\n",
- "YEAR(date_built) AS YEAR_BUILT,\n",
- "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM home_sales\n",
- "WHERE bedrooms = 3\n",
- "and bathrooms = 3\n",
- "and sqft_living >= 2000\n",
- "and floors = 2\n",
- "GROUP BY YEAR_BUILT\n",
- "ORDER BY YEAR_BUILT DESC\n",
- "\"\"\"\n",
- "spark.sql(c).show()"
- ],
- "metadata": {
- "id": "HrEbTABkqaOu"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 6. What is the average price of a home per \"view\" rating, rounded to two decimal places,\n",
- "# having an average home price greater than or equal to $350,000? Order by descending view rating.\n",
- "# Although this is a small dataset, determine the run time for this query.\n",
- "\n",
- "start_time = time.time()\n",
- "\n",
- "d = \"\"\"\n",
- "SELECT\n",
- "view,\n",
- "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM home_sales\n",
- "GROUP BY view\n",
- "HAVING AVG(price) >= 350000\n",
- "ORDER BY view desc\n",
- "\"\"\"\n",
- "spark.sql(d).show()\n",
- "\n",
- "print(\"--- %s seconds ---\" % (time.time() - start_time))"
- ],
- "metadata": {
- "id": "PpPgkgRxqzuG"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 7. Cache the temporary table home_sales.\n",
- "spark.sql('cache table home_sales')"
- ],
- "metadata": {
- "id": "IBCdFP4Eqzvi"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 8. Check if the table is cached.\n",
- "spark.catalog.isCached('home_sales')"
- ],
- "metadata": {
- "id": "AqFsOiWwqz1S"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 9. Using the cached data, run the last query above, that calculates\n",
- "# the average price of a home per \"view\" rating, rounded to two decimal places,\n",
- "# having an average home price greater than or equal to $350,000.\n",
- "# Determine the runtime and compare it to the uncached runtime.\n",
- "\n",
- "start_time = time.time()\n",
- "\n",
- "e = \"\"\"\n",
- "SELECT\n",
- "view,\n",
- "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM home_sales\n",
- "GROUP BY view\n",
- "HAVING AVG(price) >= 350000\n",
- "ORDER BY view desc\n",
- "\"\"\"\n",
- "spark.sql(e).show()\n",
- "\n",
- "\n",
- "print(\"--- %s seconds ---\" % (time.time() - start_time))\n",
- "\n",
- "# d speed = 0.7024235725402832 seconds\n",
- "# e speed = 0.5843555927276611 seconds\n",
- "# caching sped up the run time!\n"
- ],
- "metadata": {
- "id": "SJTRR5UHqz5_"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 10. Partition by the \"date_built\" field on the formatted parquet home sales data\n",
- "df.write.partitionBy('date_built').mode(\"overwrite\").parquet(\"p_home_sales\")"
- ],
- "metadata": {
- "id": "cSeQIJrFq6G4"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 11. Read the parquet formatted data.\n",
- "p_df = spark.read.parquet('p_home_sales')"
- ],
- "metadata": {
- "id": "LgELnGCNq6OJ"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 12. Create a temporary table for the parquet data.\n",
- "p_df.createOrReplaceTempView('parquet_temp_home')"
- ],
- "metadata": {
- "id": "bL8GVtuEq6Xj"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 13. Using the parquet DataFrame, run the last query above, that calculates\n",
- "# the average price of a home per \"view\" rating, rounded to two decimal places,\n",
- "# having an average home price greater than or equal to $350,000.\n",
- "# Determine the runtime and compare it to the cached runtime.\n",
- "\n",
- "start_time = time.time()\n",
- "\n",
- "f = \"\"\"\n",
- "SELECT\n",
- "view,\n",
- "ROUND(AVG(price), 2) AS AVERAGE_PRICE\n",
- "FROM parquet_temp_home\n",
- "GROUP BY view\n",
- "HAVING AVG(price) >= 350000\n",
- "ORDER BY view desc\n",
- "\"\"\"\n",
- "spark.sql(f).show()\n",
- "\n",
- "print(\"--- %s seconds ---\" % (time.time() - start_time))\n",
- "\n",
- "# d speed = 0.7024235725402832 seconds\n",
- "# e speed = 0.5843555927276611 seconds\n",
- "# caching sped up the run time!\n",
- "\n",
- "# f speed = 0.6659867763519287 seconds\n",
- "# parquet_temp_home is faster than original time, but not as fast as cached time\n"
- ],
- "metadata": {
- "id": "G5ynaoJvq6fl"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 14. Uncache the home_sales temporary table.\n",
- "spark.sql('uncache table home_sales')"
- ],
- "metadata": {
- "id": "zAI0vZamq6mq"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# 15. Check if the home_sales is no longer cached\n",
- "if spark.catalog.isCached('home_sales'):\n",
- " print('home_sales remains cached')\n",
- "else:\n",
- " print('home_sales is no longer cached. ')\n"
- ],
- "metadata": {
- "id": "Yn38FqT8rI-o"
- },
- "execution_count": null,
- "outputs": []
- }
- ]
-}
\ No newline at end of file