-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path574_Project_v2.Rmd
More file actions
193 lines (121 loc) · 6.63 KB
/
574_Project_v2.Rmd
File metadata and controls
193 lines (121 loc) · 6.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
---
title: "574 Project"
author: "Malvika Mohan"
date: "3/3/2020"
output: pdf_document
---
```{r setup, include=FALSE}
library(tidyverse)
library(MASS)
library(e1071)
library(corrplot)
library(stats)
library(caret)
library(pROC)
```
```{r}
# Load the weatherAUS data set and keep only the complete rows.
#
# The CSV location defaults to the original hard-coded path; set the
# WEATHER_AUS_CSV environment variable to knit the document on another machine.
weather_csv_path <- Sys.getenv(
  "WEATHER_AUS_CSV",
  unset = "C:/RP/datasets/weatherAUS.csv"
)
weather_data <- read.csv(weather_csv_path)

# Drop every row that contains at least one NA. na.omit() keeps the original
# row labels, so reset them to the automatic 1..n sequence afterwards
# (assigning NULL is also safe when no rows survive, unlike 1:nrow()).
weather_data <- na.omit(weather_data)
rownames(weather_data) <- NULL

# Preview only: View() is meant for interactive sessions, and printing the
# whole data frame would dump every row into the knitted PDF.
head(weather_data)
```
```{r }
# Pairwise Pearson correlations between the numeric weather measurements.

# Keep the RainToday labels aside; the classification set-up chunk reads `temp`.
temp <- weather_data[["RainToday"]]

# Numeric columns only, with RISK_MM excluded.
weather_data2 <- dplyr::select(
  dplyr::select_if(weather_data, is.numeric),
  -RISK_MM
)

# Correlation matrix of every numeric column against every other one,
# drawn as a grid of coloured squares.
bivrel <- cor(
  weather_data2,
  y = NULL,
  use = "everything",
  method = "pearson"
)
corrplot(bivrel, method = "square")
```
```{r }
# Yearly rainfall, temperature, humidity and sunshine trends for three cities:
# 1) Melbourne, 2) Sydney, 3) Canberra.
#
# The original chunks wrapped the daily columns in
# ts(..., start = 2007, end = 2017, frequency = 1), which keeps only the first
# 11 daily rows and labels them as years, and computed the "average"
# temperature with mean(MinTemp, MaxTemp), where MaxTemp is taken as mean()'s
# `trim` argument. plot_city_trends() aggregates by calendar year instead and
# is shared by all three cities so they are summarised identically.

#' Plot yearly weather trends for one city
#'
#' Keeps the rows of `data` whose `Location` equals `city`, averages the daily
#' observations within each calendar year and draws four line charts
#' (rainfall, average temperature, 3pm humidity, sunshine) on the current
#' graphics device.
#'
#' @param data Data frame with the columns Date, Location, Rainfall, MinTemp,
#'   MaxTemp, Humidity3pm and Sunshine.
#' @param city Single `Location` value to plot.
#' @return Invisibly, the daily rows for `city` with two added columns: `Year`
#'   and `Avgtemp` (the midpoint of MinTemp and MaxTemp).
plot_city_trends <- function(data, city) {
  city_data <- data %>%
    dplyr::filter(Location == city) %>%
    dplyr::mutate(
      # NOTE(review): assumes Date is stored as YYYY-MM-DD text - confirm
      # against the CSV.
      Year = as.integer(format(as.Date(as.character(Date)), "%Y")),
      Avgtemp = (MinTemp + MaxTemp) / 2
    )

  if (nrow(city_data) == 0) {
    stop("No rows found for Location == '", city, "'", call. = FALSE)
  }

  # One row per calendar year. Means rather than totals are used so that
  # years with fewer complete rows are not understated.
  yearly <- city_data %>%
    dplyr::group_by(Year) %>%
    dplyr::summarise(
      Rainfall = mean(Rainfall, na.rm = TRUE),
      Avgtemp = mean(Avgtemp, na.rm = TRUE),
      Humidity3pm = mean(Humidity3pm, na.rm = TRUE),
      Sunshine = mean(Sunshine, na.rm = TRUE)
    ) %>%
    dplyr::ungroup()

  # NOTE(review): the weatherAUS documentation gives Rainfall in millimetres
  # (compare the RISK_MM column); the original axis labels said inches.
  plot(
    yearly$Year, yearly$Rainfall,
    type = "b", col = "blue",
    xlab = "Year", ylab = "Mean daily rainfall (mm)",
    main = paste("Yearly precipitation trend of", city)
  )
  plot(
    yearly$Year, yearly$Avgtemp,
    type = "b", col = "red",
    xlab = "Year", ylab = "Average temperature in celsius",
    main = paste("Yearly temperature trend of", city)
  )
  plot(
    yearly$Year, yearly$Humidity3pm,
    type = "b", col = "green",
    xlab = "Year", ylab = "Mean humidity at 3pm (%)",
    main = paste("Yearly humidity at 3pm in", city)
  )
  plot(
    yearly$Year, yearly$Sunshine,
    type = "b", col = "orange",
    xlab = "Year", ylab = "Mean hours of sunshine per day",
    main = paste("Yearly sunshine in", city)
  )

  invisible(city_data)
}

# 1) Melbourne. No dev.new() here: opening a separate device would divert the
# plots away from the knitted document.
weather_dataMel <- plot_city_trends(weather_data, "Melbourne")
```
```{r }
# 2) Sydney
weather_dataSyd <- plot_city_trends(weather_data, "Sydney")
```
```{r }
# 3) Canberra
weather_dataCanberra <- plot_city_trends(weather_data, "Canberra")
```
```{r }
# Stepwise AIC search for the predictors of Rainfall in a linear model.
# Both bounding fits use the first 1000 rows of the numeric data only.

# Largest candidate: Rainfall regressed on every other numeric column.
upper_regression_limit <- lm(Rainfall ~ ., data = head(weather_data2, 1000))

# Smallest candidate: intercept only. The search starts from this fit.
lower_regression_limit <- lm(Rainfall ~ 1, data = head(weather_data2, 1000))

# Terms may be added or dropped at each step, within the two bounds above.
step_model <- stepAIC(
  lower_regression_limit,
  scope = list(
    lower = lower_regression_limit,
    upper = upper_regression_limit
  ),
  direction = "both",
  trace = FALSE
)
summary(step_model)
```
The classification models below retain the predictors selected by the stepwise AIC search over least-squares linear models in the previous chunk.
```{r }
# Setting up the data for classification: the retained numeric predictors plus
# the RainToday label, split 75/25 into training and test sets.
weather_data3 <- weather_data2 %>%
  dplyr::select(
    Humidity9am, Sunshine, WindSpeed9am, Temp3pm, MaxTemp, WindGustSpeed
  ) %>%
  # weather_data2 is a column subset of weather_data, so the rows line up.
  # Reading the label from weather_data directly avoids depending on a
  # temporary set in another chunk; as.factor() makes the class labels
  # explicit whether read.csv() produced characters or factors.
  cbind(Rain_today = as.factor(weather_data$RainToday))

set.seed(32)
n_rows <- nrow(weather_data3)
# floor() spells out the truncation sample() would otherwise do silently.
split_ind <- sample(n_rows, size = floor(0.75 * n_rows), replace = FALSE)
train <- weather_data3[split_ind, ]
test <- weather_data3[-split_ind, ]

# Preview only: printing the full training set would flood the knitted PDF.
# attach(train) was dropped because every model below receives its data
# explicitly, and attach() leaves the columns on the search path where they
# can shadow other objects.
head(train)
```
```{r}
# Logistic regression: fitted on the training split, evaluated on the test
# split.
model_logistic <- glm(
  Rain_today ~ .,
  family = binomial(link = "logit"),
  data = train
)
summary(model_logistic)

# Predicted probabilities for the test rows. For a factor response glm()
# models the probability of not being the first level, i.e. of "Yes" when the
# levels are No/Yes.
predict_logistic <- predict(model_logistic, test, type = "response")

# Confusion matrix at a threshold of 0.5
op_logistic <- data.frame(test$Rain_today, predict_logistic)
op_logistic$classification <- if_else(op_logistic$predict_logistic > 0.5, 1, 0)
op_logistic$test.Rain_today <- if_else(
  op_logistic$test.Rain_today == "Yes", 1, 0
)
# Levels are pinned so the table keeps both classes even if only one is
# predicted, and "1" (rain) is named as the positive class; by default
# confusionMatrix() would treat the first level, "0", as positive.
confusionMatrix(
  factor(op_logistic$classification, levels = c(0, 1)),
  factor(op_logistic$test.Rain_today, levels = c(0, 1)),
  positive = "1"
)

# ROC on the held-out test rows, so the AUC describes the same data as the
# confusion matrix above rather than the rows the model was fitted on.
par(pty = "s")
roc(
  test$Rain_today, predict_logistic,
  plot = TRUE, legacy.axes = TRUE,
  col = "#ff5252", lwd = 4, print.auc = TRUE
)
```
```{r}
# Support vector machine classifier: fitted on the training split, evaluated
# on the test split.
model_svm <- svm(Rain_today ~ ., train)
summary(model_svm)

# Class counts of the fitted training labels (the full vector is too long to
# print in the knitted document).
table(model_svm$fitted)

# Predict the test rows once, asking for the decision values as well: they
# are the continuous scores needed for a ROC curve. predict.svm() has no
# `type` argument, so the original type = 'response' was silently ignored.
svm_test_prediction <- predict(model_svm, test, decision.values = TRUE)
svm_scores <- as.numeric(
  attr(svm_test_prediction, "decision.values")[, 1]
)

# Plain predicted classes, without the decision-value attribute.
predict_svm <- svm_test_prediction
attr(predict_svm, "decision.values") <- NULL

# Confusion matrix. Both factors share the training label levels so the table
# keeps both classes, and "Yes" (rain) is named as the positive class.
op_svm <- data.frame(test$Rain_today, predict_svm)
class_levels <- levels(factor(train$Rain_today))
confusionMatrix(
  factor(op_svm$predict_svm, levels = class_levels),
  factor(op_svm$test.Rain_today, levels = class_levels),
  positive = "Yes"
)
#svm_tune <- tune(svm, y~x, data = data, ranges=list(epsilon=seq(0,1,0.1), cost=seq(1,10,1)))

# ROC on the held-out test rows using the continuous decision values; hard
# 0/1 labels would give a curve with a single operating point. roc() picks
# the comparison direction itself, so the sign convention of the decision
# values does not matter.
par(pty = "s")
roc(
  test$Rain_today, svm_scores,
  plot = TRUE, legacy.axes = TRUE,
  col = "#ff5252", lwd = 4, print.auc = TRUE
)
```
```{r}
# ROC comparison of the two classifiers on the held-out test rows.
# Both curves use continuous scores: predicted probabilities for the logistic
# model and decision values for the SVM. Hard 0/1 labels would reduce the SVM
# curve to a single operating point, and training-set curves would overstate
# both AUCs.
svm_test_scores <- as.numeric(
  attr(
    predict(model_svm, test, decision.values = TRUE),
    "decision.values"
  )[, 1]
)

par(pty = "s")
# Purple curve: logistic regression model
roc(
  test$Rain_today, predict_logistic,
  plot = TRUE, legacy.axes = TRUE,
  col = "#9c88ff", lwd = 4,
  print.auc = TRUE, print.auc.y = 0.3
)
# Green curve: SVM
roc(
  test$Rain_today, svm_test_scores,
  plot = TRUE, legacy.axes = TRUE,
  col = "#4cd137", lwd = 4,
  print.auc = TRUE, print.auc.y = 0.2,
  add = TRUE
)
```