-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
59 lines (45 loc) · 1.61 KB
/
data_preprocessing.py
File metadata and controls
59 lines (45 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
"""Data Preprocessing.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1n4DDN-lgGvEAHp-oyICct_VkkrzFaR31
"""
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Read the COVID-19 statistics CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/content/covid19_global_statistics_2026.csv',
)
df.head(n=5)
# Count the missing values in each column.
df.isnull().sum(axis=0)
# Handling missing values.
# For each listed column, replace NaN entries with that column's median.
# Columns that contain no NaN are left untouched.
for col in [
    'population', 'new_cases', 'active_cases', 'cases_per_million',
    'new_deaths', 'deaths_per_million', 'total_deaths', 'tests_per_million',
    'total_tests',
]:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Replace missing continents with the most frequent value. mode() returns an
# empty Series when every entry is NaN, so only fill when a mode exists
# instead of failing on the lookup of its first element.
if df['continent'].isnull().any():
    continent_modes = df['continent'].mode()
    if not continent_modes.empty:
        df['continent'] = df['continent'].fillna(continent_modes.iloc[0])

# Convert the 'date' column with pd.to_datetime.
df['date'] = pd.to_datetime(df['date'])

# Re-check the per-column missing-value counts after filling.
df.isnull().sum()
# Histograms of new and active cases, drawn on the same axes (20 bins each).
for column, colour, legend_label in (
    ('new_cases', 'purple', 'New Cases'),
    ('active_cases', 'pink', 'Active Cases'),
):
    plt.hist(df[column], bins=20, color=colour, label=legend_label)
plt.title('Distribution of New Cases and Active Cases')
plt.xlabel('Count')
plt.ylabel('Frequency')
plt.legend()
plt.show()
# Heatmap of the pairwise correlations between the numeric columns, with the
# coefficient written in each cell (annot=True).
sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show must be called with parentheses; a bare `plt.show` only references
# the function and never renders the figure.
plt.show()
# One-hot encode the DataFrame with pd.get_dummies; drop_first=True drops the
# first level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=5)
# Splitting the data and scaling the features (no model is fitted in this script).
# Features are every column except the target 'total_deaths' and 'date'.
X = df.drop(columns=['total_deaths', 'date'])
y = df['total_deaths']
# Hold out 20% of the rows for testing; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)
scaler = StandardScaler()
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split so both are on the same scale without
# using test-set statistics.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)