diff --git a/module-3/Data-Cleaning-Challenge/.ipynb_checkpoints/iris_cleaning-checkpoint.ipynb b/module-3/Data-Cleaning-Challenge/.ipynb_checkpoints/iris_cleaning-checkpoint.ipynb new file mode 100644 index 00000000..e9eebc09 --- /dev/null +++ b/module-3/Data-Cleaning-Challenge/.ipynb_checkpoints/iris_cleaning-checkpoint.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "moved-confusion", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "advanced-label", + "metadata": {}, + "outputs": [], + "source": [ + "iris_df = pd.read_csv(\"iris-data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "charitable-knife", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " class \n", + "0 Iris-setosa \n", + "1 Iris-setosa \n", + "2 Iris-setosa \n", + "3 Iris-setosa \n", + "4 Iris-setosa " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "earned-anatomy", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(150, 5)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "sorted-yemen", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cm
count150.000000150.000000150.000000145.000000
mean5.6446273.0546673.7586671.236552
std1.3127810.4331231.7644200.755058
min0.0550002.0000001.0000000.100000
25%5.1000002.8000001.6000000.400000
50%5.7000003.0000004.3500001.300000
75%6.4000003.3000005.1000001.800000
max7.9000004.4000006.9000002.500000
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm\n", + "count 150.000000 150.000000 150.000000 145.000000\n", + "mean 5.644627 3.054667 3.758667 1.236552\n", + "std 1.312781 0.433123 1.764420 0.755058\n", + "min 0.055000 2.000000 1.000000 0.100000\n", + "25% 5.100000 2.800000 1.600000 0.400000\n", + "50% 5.700000 3.000000 4.350000 1.300000\n", + "75% 6.400000 3.300000 5.100000 1.800000\n", + "max 7.900000 4.400000 6.900000 2.500000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "governing-huntington", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal_length_cm float64\n", + "sepal_width_cm float64\n", + "petal_length_cm float64\n", + "petal_width_cm float64\n", + "class object\n", + "dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "major-detroit", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',\n", + " 'Iris-virginica'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df['class'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "viral-moment", + "metadata": {}, + "outputs": [], + "source": [ + "iris_df.replace({'Iris-setossa': 'Iris-setosa','versicolor':'Iris-versicolor'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "intense-colony", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal_length_cm 0\n", + "sepal_width_cm 0\n", + "petal_length_cm 0\n", + "petal_width_cm 5\n", + "class 0\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "decimal-vacuum", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
75.03.41.5NaNIris-setosa
84.42.91.4NaNIris-setosa
94.93.11.5NaNIris-setosa
105.43.71.5NaNIris-setosa
114.83.41.6NaNIris-setosa
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "7 5.0 3.4 1.5 NaN \n", + "8 4.4 2.9 1.4 NaN \n", + "9 4.9 3.1 1.5 NaN \n", + "10 5.4 3.7 1.5 NaN \n", + "11 4.8 3.4 1.6 NaN \n", + "\n", + " class \n", + "7 Iris-setosa \n", + "8 Iris-setosa \n", + "9 Iris-setosa \n", + "10 Iris-setosa \n", + "11 Iris-setosa " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df[iris_df['petal_width_cm'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "educational-pointer", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled = iris_df.fillna(iris_df['petal_width_cm'][iris_df['class']=='Iris-setosa'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "italic-prediction", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [sepal_length_cm, sepal_width_cm, petal_length_cm, petal_width_cm, class]\n", + "Index: []" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['petal_width_cm'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "clean-bedroom", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "77 0.067\n", + "78 0.060\n", + "79 0.057\n", + "80 0.055\n", + "81 0.055\n", + "Name: sepal_length_cm, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['sepal_length_cm'] < 2]['sepal_length_cm']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "nuclear-grass", + "metadata": {}, + "outputs": [], + "source": [ + "def multiplyby100(number):\n", + " if number < 2: \n", + " return number*100\n", + " else: \n", + " return number" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "transsexual-wallace", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled['sepal_length_cm'] = iris_filled['sepal_length_cm'].apply(multiplyby100)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "painted-lingerie", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
..................
1456.73.05.22.3Iris-virginica
1466.32.55.02.3Iris-virginica
1476.53.05.22.0Iris-virginica
1486.23.45.42.3Iris-virginica
1495.93.05.11.8Iris-virginica
\n", + "

150 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + ".. ... ... ... ... \n", + "145 6.7 3.0 5.2 2.3 \n", + "146 6.3 2.5 5.0 2.3 \n", + "147 6.5 3.0 5.2 2.0 \n", + "148 6.2 3.4 5.4 2.3 \n", + "149 5.9 3.0 5.1 1.8 \n", + "\n", + " class \n", + "0 Iris-setosa \n", + "1 Iris-setosa \n", + "2 Iris-setosa \n", + "3 Iris-setosa \n", + "4 Iris-setosa \n", + ".. ... \n", + "145 Iris-virginica \n", + "146 Iris-virginica \n", + "147 Iris-virginica \n", + "148 Iris-virginica \n", + "149 Iris-virginica \n", + "\n", + "[150 rows x 5 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "experienced-insertion", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Series([], Name: sepal_length_cm, dtype: float64)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['sepal_length_cm'] < 2]['sepal_length_cm']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "typical-airfare", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled.to_csv('iris_clean.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "configured-dictionary", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/module-3/Data-Cleaning-Challenge/iris_clean.csv b/module-3/Data-Cleaning-Challenge/iris_clean.csv new file mode 100644 index 00000000..9d6aee93 --- /dev/null +++ b/module-3/Data-Cleaning-Challenge/iris_clean.csv @@ -0,0 +1,151 @@ +sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.25111111111111106,Iris-setosa +4.4,2.9,1.4,0.25111111111111106,Iris-setosa +4.9,3.1,1.5,0.25111111111111106,Iris-setosa +5.4,3.7,1.5,0.25111111111111106,Iris-setosa +4.8,3.4,1.6,0.25111111111111106,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +5.7,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.8,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +5.6,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,2.3,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica diff --git a/module-3/Data-Cleaning-Challenge/iris_clean.ipynb b/module-3/Data-Cleaning-Challenge/iris_clean.ipynb new file mode 100644 index 00000000..e9eebc09 --- /dev/null +++ b/module-3/Data-Cleaning-Challenge/iris_clean.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "moved-confusion", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "advanced-label", + "metadata": {}, + "outputs": [], + "source": [ + "iris_df = pd.read_csv(\"iris-data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "charitable-knife", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " class \n", + "0 Iris-setosa \n", + "1 Iris-setosa \n", + "2 Iris-setosa \n", + "3 Iris-setosa \n", + "4 Iris-setosa " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "earned-anatomy", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(150, 5)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "sorted-yemen", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cm
count150.000000150.000000150.000000145.000000
mean5.6446273.0546673.7586671.236552
std1.3127810.4331231.7644200.755058
min0.0550002.0000001.0000000.100000
25%5.1000002.8000001.6000000.400000
50%5.7000003.0000004.3500001.300000
75%6.4000003.3000005.1000001.800000
max7.9000004.4000006.9000002.500000
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm\n", + "count 150.000000 150.000000 150.000000 145.000000\n", + "mean 5.644627 3.054667 3.758667 1.236552\n", + "std 1.312781 0.433123 1.764420 0.755058\n", + "min 0.055000 2.000000 1.000000 0.100000\n", + "25% 5.100000 2.800000 1.600000 0.400000\n", + "50% 5.700000 3.000000 4.350000 1.300000\n", + "75% 6.400000 3.300000 5.100000 1.800000\n", + "max 7.900000 4.400000 6.900000 2.500000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "governing-huntington", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal_length_cm float64\n", + "sepal_width_cm float64\n", + "petal_length_cm float64\n", + "petal_width_cm float64\n", + "class object\n", + "dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "major-detroit", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',\n", + " 'Iris-virginica'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df['class'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "viral-moment", + "metadata": {}, + "outputs": [], + "source": [ + "iris_df.replace({'Iris-setossa': 'Iris-setosa','versicolor':'Iris-versicolor'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "intense-colony", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal_length_cm 0\n", + "sepal_width_cm 0\n", + "petal_length_cm 0\n", + "petal_width_cm 5\n", + "class 0\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "decimal-vacuum", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
75.03.41.5NaNIris-setosa
84.42.91.4NaNIris-setosa
94.93.11.5NaNIris-setosa
105.43.71.5NaNIris-setosa
114.83.41.6NaNIris-setosa
\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "7 5.0 3.4 1.5 NaN \n", + "8 4.4 2.9 1.4 NaN \n", + "9 4.9 3.1 1.5 NaN \n", + "10 5.4 3.7 1.5 NaN \n", + "11 4.8 3.4 1.6 NaN \n", + "\n", + " class \n", + "7 Iris-setosa \n", + "8 Iris-setosa \n", + "9 Iris-setosa \n", + "10 Iris-setosa \n", + "11 Iris-setosa " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_df[iris_df['petal_width_cm'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "educational-pointer", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled = iris_df.fillna(iris_df['petal_width_cm'][iris_df['class']=='Iris-setosa'].mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "italic-prediction", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [sepal_length_cm, sepal_width_cm, petal_length_cm, petal_width_cm, class]\n", + "Index: []" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['petal_width_cm'].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "clean-bedroom", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "77 0.067\n", + "78 0.060\n", + "79 0.057\n", + "80 0.055\n", + "81 0.055\n", + "Name: sepal_length_cm, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['sepal_length_cm'] < 2]['sepal_length_cm']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "nuclear-grass", + "metadata": {}, + "outputs": [], + "source": [ + "def multiplyby100(number):\n", + " if number < 2: \n", + " return number*100\n", + " else: \n", + " return number" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "transsexual-wallace", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled['sepal_length_cm'] = iris_filled['sepal_length_cm'].apply(multiplyby100)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "painted-lingerie", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal_length_cmsepal_width_cmpetal_length_cmpetal_width_cmclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
..................
1456.73.05.22.3Iris-virginica
1466.32.55.02.3Iris-virginica
1476.53.05.22.0Iris-virginica
1486.23.45.42.3Iris-virginica
1495.93.05.11.8Iris-virginica
\n", + "

150 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + ".. ... ... ... ... \n", + "145 6.7 3.0 5.2 2.3 \n", + "146 6.3 2.5 5.0 2.3 \n", + "147 6.5 3.0 5.2 2.0 \n", + "148 6.2 3.4 5.4 2.3 \n", + "149 5.9 3.0 5.1 1.8 \n", + "\n", + " class \n", + "0 Iris-setosa \n", + "1 Iris-setosa \n", + "2 Iris-setosa \n", + "3 Iris-setosa \n", + "4 Iris-setosa \n", + ".. ... \n", + "145 Iris-virginica \n", + "146 Iris-virginica \n", + "147 Iris-virginica \n", + "148 Iris-virginica \n", + "149 Iris-virginica \n", + "\n", + "[150 rows x 5 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "experienced-insertion", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Series([], Name: sepal_length_cm, dtype: float64)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris_filled[iris_filled['sepal_length_cm'] < 2]['sepal_length_cm']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "typical-airfare", + "metadata": {}, + "outputs": [], + "source": [ + "iris_filled.to_csv('iris_clean.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "configured-dictionary", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}