-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataanalysis.py
More file actions
97 lines (90 loc) · 3.69 KB
/
dataanalysis.py
File metadata and controls
97 lines (90 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
# Apply the "pitaya smoothie dark" matplotlib style sheet, which is fetched
# from GitHub by URL when this module is imported.  The charts below do not
# depend on it, so if it cannot be loaded (offline, URL moved, ...) keep the
# currently active style instead of letting the import of this module fail.
try:
    plt.style.use(
        r"https://raw.githubusercontent.com/dhaitz/matplotlib-stylesheets/master/pitayasmoothie-dark.mplstyle"
    )
except OSError as err:
    logging.getLogger(__name__).warning(
        "Could not load the remote matplotlib style sheet; "
        "keeping the current style: %s",
        err,
    )
class EDA:
    """Exploratory-data-analysis charts rendered into a Streamlit app.

    Every public method builds a matplotlib/seaborn figure, hands it to
    ``st.pyplot`` and returns ``None``.  Failures are logged on
    ``self.logger`` rather than raised, so one broken chart does not abort
    the rest of the page.  Each method closes the figure it created, so
    repeated calls do not accumulate open pyplot figures.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _close(fig):
        """Release *fig* from pyplot's registry; ``None`` is ignored."""
        if fig is not None:
            plt.close(fig)

    def data_dist(self, data, col, fig_size):
        """Plot the distribution of a single column.

        Object-dtype columns are drawn as a horizontal bar chart of their
        value counts; any other dtype is drawn as a density histogram with a
        filled KDE curve on top.

        Args:
            data: Table indexed by column name.  ``data[col]`` must offer
                ``dtype`` and ``value_counts`` (presumably a pandas
                DataFrame -- confirm against callers).
            col: Name of the column to plot.
            fig_size: ``(width, height)`` passed to matplotlib as ``figsize``.

        Returns:
            None.  Errors are logged, not raised.
        """
        fig = None
        try:
            series = data[col]
            fig, ax = plt.subplots(figsize=fig_size)
            if series.dtype == object:
                counts = series.value_counts()
                sns.barplot(x=counts.values, y=counts.index,
                            width=0.3, ax=ax)
            else:
                sns.histplot(series, color='green', stat="density", ax=ax)
                # ``fill`` is the supported spelling of the deprecated
                # ``shade`` keyword.
                sns.kdeplot(series, ax=ax, color='red', fill=True)
            st.pyplot(fig)
        except Exception as e:  # UI boundary: log and keep the app running
            self.logger.exception("An error occurred: %s", e)
        finally:
            self._close(fig)
        return None

    def null_dist(self, data, explode):
        """Plot how many values are missing, overall and per column.

        Draws a 2x2 grid: a pie and a bar chart comparing non-null cells with
        null cells over the whole table, then a pie and a bar chart of the
        null count of every column that has at least one null.  When no
        column has nulls, the two per-column panels show a
        "No missing values" note instead.

        Args:
            data: Table offering ``size`` and ``isna().sum()`` (presumably a
                pandas DataFrame -- confirm against callers).
            explode: Offset applied to every wedge of the per-column pie.

        Returns:
            None.  An empty table is skipped with a warning; errors are
            logged, not raised.
        """
        fig = None
        try:
            if data.size == 0:
                self.logger.warning(
                    "null_dist: data is empty; nothing to plot")
                return None

            null_counts = data.isna().sum()
            total_nulls = null_counts.sum()
            cols_with_nulls = null_counts[null_counts != 0]
            # Compare cells with cells: the null total counts cells, so the
            # "Data" share must be the non-null cells, not the row count.
            overall = [data.size - total_nulls, total_nulls]
            overall_labels = ["Data", "Null"]

            fig, axes = plt.subplots(2, 2, figsize=(15, 10))
            (ax_pie_all, ax_bar_all), (ax_pie_col, ax_bar_col) = axes

            ax_pie_all.set_title("Based On Whole Data")
            ax_pie_all.pie(x=overall, labels=overall_labels,
                           autopct="%.2f%%")
            sns.barplot(y=overall, x=overall_labels, ax=ax_bar_all)

            ax_pie_col.set_title("Based On Specific Features")
            if cols_with_nulls.empty:
                # Nothing to break down per column.
                for ax in (ax_pie_col, ax_bar_col):
                    ax.text(0.5, 0.5, "No missing values",
                            ha="center", va="center",
                            transform=ax.transAxes)
                    ax.axis("off")
            else:
                ax_pie_col.pie(x=cols_with_nulls,
                               labels=cols_with_nulls.index,
                               autopct="%.2f%%",
                               explode=[explode] * len(cols_with_nulls))
                sns.barplot(y=cols_with_nulls,
                            x=cols_with_nulls.index,
                            ax=ax_bar_col)
            st.pyplot(fig)
        except Exception as e:  # UI boundary: log and keep the app running
            self.logger.exception("An error occurred: %s", e)
        finally:
            self._close(fig)
        return None

    def show_corr(self, data):
        """Plot a heatmap of the pairwise correlation of numeric columns.

        Only numeric and boolean columns are correlated.  The upper triangle,
        diagonal included, is masked so each pair is shown once.

        Args:
            data: Table offering ``select_dtypes`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            None.  With fewer than two numeric columns nothing is drawn and a
            warning is logged; errors are logged, not raised.
        """
        fig = None
        try:
            numeric = data.select_dtypes(include=["number", "bool"])
            if numeric.shape[1] < 2:
                # With the diagonal masked, a single column leaves no cell.
                self.logger.warning(
                    "show_corr: need at least two numeric columns, got %d",
                    numeric.shape[1])
                return None

            corr = numeric.corr()
            mask = np.triu(np.ones(corr.shape, dtype=bool))
            fig, ax = plt.subplots(figsize=(15, 10))
            sns.heatmap(corr, mask=mask, cmap="hot", annot=True, ax=ax)
            st.pyplot(fig)
        except Exception as e:  # UI boundary: log and keep the app running
            self.logger.exception("An error occurred: %s", e)
        finally:
            self._close(fig)
        return None

    def feature_relation(self, data, feature1,
                         feature2, based_feature=None):
        """Scatter-plot one column against another.

        Args:
            data: Table indexed by column name.
            feature1: Column plotted on the x axis.
            feature2: Column plotted on the y axis.
            based_feature: Optional column used to colour the points.  Any
                falsy value (``None``, ``""``) means no colouring.

        Returns:
            None.  Errors are logged, not raised.
        """
        fig = None
        try:
            hue = data[based_feature] if based_feature else None
            fig, ax = plt.subplots(figsize=(15, 10))
            sns.scatterplot(x=data[feature1], y=data[feature2],
                            hue=hue, ax=ax)
            st.pyplot(fig)
        except Exception as e:  # UI boundary: log and keep the app running
            self.logger.exception("An error occurred: %s", e)
        finally:
            self._close(fig)
        return None