From bc5627f9b580d7fc89e74eb7553f497ca4953e5b Mon Sep 17 00:00:00 2001 From: datanomnom Date: Thu, 6 Jul 2017 06:32:23 +0000 Subject: [PATCH 1/3] Done --- build.py | 50 +++++++++++++++++------ build.pyc | Bin 0 -> 2979 bytes tests/__init__.pyc | Bin 0 -> 168 bytes tests/test_get_categorical_variables.pyc | Bin 0 -> 2737 bytes 4 files changed, 38 insertions(+), 12 deletions(-) create mode 100644 build.pyc create mode 100644 tests/__init__.pyc create mode 100644 tests/test_get_categorical_variables.pyc diff --git a/build.py b/build.py index 35cdd2a..90f2e7e 100644 --- a/build.py +++ b/build.py @@ -1,26 +1,52 @@ -def get_categorical_variables(df): - return [] +import numpy as np +import pandas as pd +from pandas import Series, DataFrame +import operator +import matplotlib.pyplot as plt +df = pd.read_csv('data/conversion_data.csv') -def get_numerical_variables(df): - return [] +def get_categorical_variables(df): + return df[['country','source','new_user','converted']] +def get_numerical_variables(df): + return df._get_numeric_data() def get_numerical_variables_percentile(df): - pass - + df_temp = get_numerical_variables(df) + return df_temp.describe().T def get_categorical_variables_modes(df): - pass - + dic = {'converted':0, 'country':'', 'new_user':0, 'source':''} + for col in df.mode().columns: + dic[col] = df.mode()[col][0] + return pd.DataFrame(dic.items(), columns=['var_name', 'mode']) def get_missing_values_count(df): - pass - + my_ans = pd.DataFrame(df.isnull().sum(), columns=['missing_value_count']) + my_ans.index.name = 'var_name' + return my_ans def plot_histogram_with_numerical_values(df): + fig, axes = plt.subplots(2, 2) + df1 = get_numerical_variables(df) + list_of_cols = df1.columns + axes[0,0].hist(df[list_of_cols[0]]) + axes[0,0].set_title(list_of_cols[0]) + axes[0,1].hist(df[list_of_cols[1]]) + axes[0,1].set_title(list_of_cols[1]) + axes[1,0].hist(df[list_of_cols[2]]) + axes[1,0].set_title(list_of_cols[2]) + axes[1,1].hist(df[list_of_cols[3]]) + axes[1,1].set_title(list_of_cols[3]) + plt.tight_layout() + plt.show() pass - def plot_facet_box(df): - pass + list_of_cols = df.columns + for col in list_of_cols: + plt.boxplot(df[col], 1) + plt.title(col) + plt.show() + pass diff --git a/build.pyc b/build.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ed5dcdb82057820e649041919b3ca6858c7d9dd GIT binary patch literal 2979 zcmcIm+iDy~5bd75NV}3O*|8NbI1V_5ECgwB2!enK0gI5l?87J^ve*ox>E6{Sb6HRK z>H^Y}^B4KDd`F%`a!!r*;t=P_lGLT@?wanZ)2FJt@#j|e^&fA3N!0C6fdAj4x!)lS z{G}Jq-MjOgB z)o)mIQ<;`1TgtRWxu?vUDBH@ci?X9kN0j@@bVYeUpN{$)@Gt!vKf^wK_HBqGTcy_d z4%EZg$3IkYZrx+_40?m4EY5A^(z1xy-%s3mqA|(VLVJbg9ztA8*e0OErl}@P)jR6b zV(8Z(lX6!0>PmKW<*Z7qU&nA^FQS>Vm26+{<*n&sM)kRcr<-P64{+4|2r)PO{PM^eOIA; zinUjET@7RtQ>%e6CN>F#5O|3NG`rF2TB|e7pw5CtTn*TSghS5ESeI1kiS-2IXCabA zCg*I%k+=EuuA+Q~Ek6)tG__z=;nU1+!^$-wl-{h*ju9;A>d;A>%m~o+pyulVd#$?H zRtEs$a1TRZKdDcK@iVqpXzm*bzyi3LTYwD` z^>1EUCLpHPI2)W9dD>J_i^@@-D-3HuxJ{lqmlhM`ayGM()Shq$<9$LBKA;f5*iBtA z%QC^o&2ldo7RFu*;)H&M;k3`MqPTE(g(C|uaeOZVSBJnw0`S(O^TD-M>&A-p z`obEcbL-BkHIij9(Op<$bZ)(|YK{A2F|oO@#we{BWShJYZA~=8I=lr0hABBEd6x2& z(*}ebVm*f5|8)7qit(~5OD(_sukOE$rAxvKxu%(yjCQk=X;ykC1+}Iu5AZZ~UT#5L zP?1l4W9+fVKxB%yl>JNE=a>Bk^%NB8uj_gYtXm$iC9>7VccYk?w- zFTZ8BAHI1Zghl}nPJ^qg#2~xImBb3ndRup-5bL))>R9MOVhn{8HH567Aw842ds1F9 z5-9<40^t{!TGrC=OPW987+Nhp?tA1OmK89^c+2<*4((&akq8QqZRv+-UEODm6F2e3 zz%zzn6;pXzP2s;+Y+VTt=O_GRUHcjU3>K3RH#XOILZ`4gtX%}rf*bX`k{5Cu7 IPOG!?596M12LJ#7 literal 0 HcmV?d00001 diff --git a/tests/__init__.pyc b/tests/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de27e4620161c7bd87decfb51f74c962cdb5b3c7 GIT binary patch literal 168 zcmZ9FK?(vf3`Hxt5W#!Qrse`7of!Y@W(0( literal 0 HcmV?d00001 diff --git a/tests/test_get_categorical_variables.pyc b/tests/test_get_categorical_variables.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47c1104e65660824314a960b5f76c68e1d53fb83 GIT binary patch literal 2737 zcmc&$O>fgc5FI-oZTJv{@-03DLL!$oCvFu&6oI&Ka6yP%EMsqqTgQ&P>!3>Qsr*!~ zocICo-ndOd+aM08apLjLdc5BE-psBWKQCYS{^P@^SdNYct}k%fK1_nYCz;65lfEY& zeP1$P`VC1M(r=1s$WBX4Qw|Z)R(DBETis?)1z;v#D7Hd?(e#n7mlc`?Ml_pvBpVUV;&C#k=CI>hH9;ZxQfc#8ufy^sH>R z;1fq)4b6$Xt$8Mk5h7M`b%jG>kKo#HoZ9qQpn%B`T&7`oKxC>&K$Yqn7ukn%0aDz< zTPISS0rV>FTS7P%Vj=<)YLj<2VE#!2rxk{b-9eF|`090POY?Y^_s1A;4)5W}pbm1E zW=84kvTACIInyb{S_6}T%*e>39iG;q`$^3koPB;&V=_fcc>a4%jA4X zzl}GN^kHg|#oO32vkFU#?hq&0X6(PPt#J!vo1X&{w$B3g=ueupx^*)Q!yE-0hFaDk zHK{VV0k?=?h2{ngYa`%pJ6%n+b)LIhz)ujan3fBC+v_ZM+MQOX)upRosyub1AwY-d z_CAX8B(ho|SR_RTSl=|XG*J@rONzbRX0?E7t@)k>;?((V)Ji<(Tomt$cgb&K$G+nK E20c%fkpKVy literal 0 HcmV?d00001 From af355ff19cfa039ff73f88067526815e4499171a Mon Sep 17 00:00:00 2001 From: datanomnom Date: Sun, 9 Jul 2017 10:14:16 +0000 Subject: [PATCH 2/3] Done --- build.py | 68 +++++++++++++++++++++++++++++------------------------- build.pyc | Bin 2979 -> 3192 bytes 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/build.py b/build.py index 90f2e7e..9433509 100644 --- a/build.py +++ b/build.py @@ -1,52 +1,56 @@ import numpy as np import pandas as pd -from pandas import Series, DataFrame -import operator +from scipy.stats import norm +import seaborn as sns import matplotlib.pyplot as plt -df = pd.read_csv('data/conversion_data.csv') +df = pd.read_csv("data/conversion_data.csv") def get_categorical_variables(df): - return df[['country','source','new_user','converted']] + return df[['country','new_user','source','converted']] def get_numerical_variables(df): - return df._get_numeric_data() + numerical = df._get_numeric_data().columns + return list(numerical) def get_numerical_variables_percentile(df): - df_temp = get_numerical_variables(df) - return df_temp.describe().T + final = df[get_numerical_variables(df)] + return final.describe() def get_categorical_variables_modes(df): - dic = {'converted':0, 'country':'', 'new_user':0, 'source':''} - for col in df.mode().columns: - dic[col] = df.mode()[col][0] - return pd.DataFrame(dic.items(), columns=['var_name', 'mode']) + modes = get_categorical_variables(df).mode() + return modes def get_missing_values_count(df): - my_ans = pd.DataFrame(df.isnull().sum(), columns=['missing_value_count']) - my_ans.index.name = 'var_name' - return my_ans + df1 = df.apply(lambda x: sum(x.isnull()), axis=0) + return pd.DataFrame(df1) + def plot_histogram_with_numerical_values(df): - fig, axes = plt.subplots(2, 2) - df1 = get_numerical_variables(df) - list_of_cols = df1.columns - axes[0,0].hist(df[list_of_cols[0]]) - axes[0,0].set_title(list_of_cols[0]) - axes[0,1].hist(df[list_of_cols[1]]) - axes[0,1].set_title(list_of_cols[1]) - axes[1,0].hist(df[list_of_cols[2]]) - axes[1,0].set_title(list_of_cols[2]) - axes[1,1].hist(df[list_of_cols[3]]) - axes[1,1].set_title(list_of_cols[3]) + num_cols = get_numerical_variables(df) + plt.figure(figsize=(15,6)) + plt.subplot(121) + plt.title(num_cols[0]) + sns.distplot(df[num_cols[0]], color='yellow', fit=norm, kde=False) + plt.subplot(122) + plt.title(num_cols[1]) + sns.distplot(df[num_cols[1]], color='yellow', fit=norm, kde=False) + plt.subplot(211) + plt.title(num_cols[2]) + sns.distplot(df[num_cols[2]], color='yellow', fit=norm, kde=False) + plt.subplot(212) + plt.title(num_cols[3]) + sns.distplot(df[num_cols[3]], color='yellow', fit=norm, kde=False) plt.tight_layout() plt.show() - pass def plot_facet_box(df): - list_of_cols = df.columns - for col in list_of_cols: - plt.boxplot(df[col], 1) - plt.title(col) - plt.show() - pass + plt.figure(figsize=(10,10)) + plt.subplot(221) + plt.title('Age') + sns.boxplot('converted','age',data=df) + plt.subplot(222) + plt.title('Total Pages Visited') + sns.boxplot('converted','total_pages_visited',data=df) + plt.tight_layout() + plt.show() diff --git a/build.pyc b/build.pyc index 0ed5dcdb82057820e649041919b3ca6858c7d9dd..aa64a6a9c0620345473ea039e3e9f51619cad6e5 100644 GIT binary patch literal 3192 zcmcIm-EJF26rS}@5<5Rm(yBE4h)_Tnq15VqRe?ZNuOj3i5V={cJ!5Cm@vgNqo77dg z3KzTpkHizeExZH|0N;1UiQOvn3Mbiv2t;)@58%uAwGPb=p+PeBGKlUB6zpq0U?Cw5?2Coo^~#S67KT-BP-t zeulYirJFL?QF=`V9i`W0u&eZj4EB_6$zWgUwhZpjo0IM)JhT7c=eV0-7#a|HQBM4Q zx}1fqH!SiCQ+iwEDd!J|{$j|be42Q^#^dioyb^%AMhA66jT`FVq{|h@VI6W}XV!M+@hV$}E^8e&J2%4JzyO!|5y>z}VP?bcna(qO`-zr59Ogvw<_7 zE_v>+5`oMiAA!s-0r_01E67Csl&DZsuM#HbL|i}a*0Cn+L8ROxuOYM}IUjSrwC7GXM5SfgYfQ}MN_$oam4ZKTZxw`64V79-HT-N@D(Mz(J;0^QqKIuy}*a6G;&<{YA|(0kgbCa&cwpP0YV~Z93Xt%$2hw8 zG;=uX*vW$(pM}(Av!V*pr2JWNxzwciJx&{N&8@6YeS$scUwQ^di!o{|y|m%%`c#y| zhlKwC>--^vO16^SWINeQKD$M>V;Q`53;6{fp$yQ3Nd3)7eaz)bBCN$^Q3nlm-eeI( zx`Y2$HIzzRPs)QCaXo0l$<6vU%VTN%TBH#cuPA}ThE6{Sb6HRK z>H^Y}^B4KDd`F%`a!!r*;t=P_lGLT@?wanZ)2FJt@#j|e^&fA3N!0C6fdAj4x!)lS z{G}Jq-MjOgB z)o)mIQ<;`1TgtRWxu?vUDBH@ci?X9kN0j@@bVYeUpN{$)@Gt!vKf^wK_HBqGTcy_d z4%EZg$3IkYZrx+_40?m4EY5A^(z1xy-%s3mqA|(VLVJbg9ztA8*e0OErl}@P)jR6b zV(8Z(lX6!0>PmKW<*Z7qU&nA^FQS>Vm26+{<*n&sM)kRcr<-P64{+4|2r)PO{PM^eOIA; zinUjET@7RtQ>%e6CN>F#5O|3NG`rF2TB|e7pw5CtTn*TSghS5ESeI1kiS-2IXCabA zCg*I%k+=EuuA+Q~Ek6)tG__z=;nU1+!^$-wl-{h*ju9;A>d;A>%m~o+pyulVd#$?H zRtEs$a1TRZKdDcK@iVqpXzm*bzyi3LTYwD` z^>1EUCLpHPI2)W9dD>J_i^@@-D-3HuxJ{lqmlhM`ayGM()Shq$<9$LBKA;f5*iBtA z%QC^o&2ldo7RFu*;)H&M;k3`MqPTE(g(C|uaeOZVSBJnw0`S(O^TD-M>&A-p z`obEcbL-BkHIij9(Op<$bZ)(|YK{A2F|oO@#we{BWShJYZA~=8I=lr0hABBEd6x2& z(*}ebVm*f5|8)7qit(~5OD(_sukOE$rAxvKxu%(yjCQk=X;ykC1+}Iu5AZZ~UT#5L zP?1l4W9+fVKxB%yl>JNE=a>Bk^%NB8uj_gYtXm$iC9>7VccYk?w- zFTZ8BAHI1Zghl}nPJ^qg#2~xImBb3ndRup-5bL))>R9MOVhn{8HH567Aw842ds1F9 z5-9<40^t{!TGrC=OPW987+Nhp?tA1OmK89^c+2<*4((&akq8QqZRv+-UEODm6F2e3 zz%zzn6;pXzP2s;+Y+VTt=O_GRUHcjU3>K3RH#XOILZ`4gtX%}rf*bX`k{5Cu7 IPOG!?596M12LJ#7 From 298ca5daa4d9dfec07e6131f559cbd120aaefe53 Mon Sep 17 00:00:00 2001 From: datanomnom Date: Sun, 9 Jul 2017 13:24:24 +0000 Subject: [PATCH 3/3] Done --- build.py | 8 ++++---- build.pyc | Bin 3192 -> 3192 bytes 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build.py b/build.py index 9433509..63a0c8a 100644 --- a/build.py +++ b/build.py @@ -29,16 +29,16 @@ def get_missing_values_count(df): def plot_histogram_with_numerical_values(df): num_cols = get_numerical_variables(df) plt.figure(figsize=(15,6)) - plt.subplot(121) + plt.subplot(221) plt.title(num_cols[0]) sns.distplot(df[num_cols[0]], color='yellow', fit=norm, kde=False) - plt.subplot(122) + plt.subplot(222) plt.title(num_cols[1]) sns.distplot(df[num_cols[1]], color='yellow', fit=norm, kde=False) - plt.subplot(211) + plt.subplot(223) plt.title(num_cols[2]) sns.distplot(df[num_cols[2]], color='yellow', fit=norm, kde=False) - plt.subplot(212) + plt.subplot(224) plt.title(num_cols[3]) sns.distplot(df[num_cols[3]], color='yellow', fit=norm, kde=False) plt.tight_layout() diff --git a/build.pyc b/build.pyc index aa64a6a9c0620345473ea039e3e9f51619cad6e5..4834f7f9d0fcfac2f394dc78b87f185a50870b42 100644 GIT binary patch delta 48 xcmew%@k4@*`7vuZBJP6-CJ^ypvpPp03jlhm4Uhl; delta 48 xcmew%@k4@*`7