# dataMining.R
# load twitter library - the rtweet library is recommended now over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)
# plotting packages
library(igraph)
library(ggraph)
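# NOTE: search_tweets() requires Twitter API credentials. A minimal,
# commented-out sketch -- every value here is a placeholder, and in
# rtweet >= 1.0 you can run auth_setup_default() instead:
# token <- create_token(
#   app = "my_twitter_app",                  # hypothetical app name
#   consumer_key = "YOUR_CONSUMER_KEY",
#   consumer_secret = "YOUR_CONSUMER_SECRET",
#   access_token = "YOUR_ACCESS_TOKEN",
#   access_secret = "YOUR_ACCESS_SECRET"
# )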
# search for up to 10,000 recent english-language tweets mentioning
# hadestown, excluding retweets
hades_tweets <- search_tweets(q = "hadestown", n = 10000,
                              lang = "en",
                              include_rts = FALSE)
head(hades_tweets$text)
# removing urls -- match only the url itself (http or https, up to the next
# whitespace); the broader pattern "http.*" would also delete any text
# that follows the link
hades_tweets$stripped_text <- gsub("http[s]?://\\S+", "", hades_tweets$text)
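# quick illustration of the pattern on a made-up string (not real tweet data):
gsub("http[s]?://\\S+", "", "loved the show http://example.com/tix so much")
# [1] "loved the show  so much"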
# note which strings R treats as unique -- case matters, and punctuation counts
a_list_of_words <- c("Dog", "dog", "dog", "cat", "cat", ",")
unique(a_list_of_words)
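# lowercasing first collapses the case variants into a single token;
# unnest_tokens() below performs this normalization for us
unique(tolower(a_list_of_words))
# [1] "dog" "cat" ","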
# install dplyr only if it is not already present
if ("dplyr" %in% installed.packages()[, "Package"]) {
  cat("'dplyr' is installed.")
} else {
  install.packages("dplyr", dependencies = TRUE)
}
library(dplyr)
# tokenize the tweets into individual words; unnest_tokens() also strips
# punctuation and converts everything to lowercase
hades_tweets_clean <- hades_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
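# inspect the result: one row per token
head(hades_tweets_clean)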
# plot the top 15 words -- notice any issues?
hades_tweets_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Count of unique words found in tweets")
# load the list of stop words - from the tidytext package
data("stop_words")
# view first 6 words
head(stop_words)
# row count before removing stop words
nrow(hades_tweets_clean)
# anti_join() drops every token that appears in the stop word list (joins on "word")
cleaned_tweets_words <- hades_tweets_clean %>%
  anti_join(stop_words)
# row count after
nrow(cleaned_tweets_words)
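# the difference between the two counts is the number of stop-word tokens dropped
nrow(hades_tweets_clean) - nrow(cleaned_tweets_words)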
# plot the top 15 words again, now that stop words are removed
cleaned_tweets_words %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Count of unique words found in tweets",
       subtitle = "Stop words removed from the list")
# widyr is also on CRAN (install.packages("widyr")); install_github() grabs
# the development version, and the check avoids reinstalling on every run
if (!"widyr" %in% installed.packages()[, "Package"]) {
  library(devtools)
  install_github("dgrtwo/widyr")
}
library(widyr)
# tokenize into bigrams: pairs of adjacent words from each tweet
hades_tweets_paired_words <- hades_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
hades_tweets_paired_words %>%
  count(paired_words, sort = TRUE)
library(tidyr)
# split each bigram into two columns so stop words can be filtered word by word
hades_tweets_separated_words <- hades_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")
# drop any bigram in which either word is a stop word
hades_tweets_filtered <- hades_tweets_separated_words %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
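# optional: unite() (the inverse of separate()) re-joins the filtered pairs
# into a single bigram column -- a sketch, not used by the steps below
hades_tweets_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  head()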
# new bigram counts:
hades_tweets_counts <- hades_tweets_filtered %>%
  count(word1, word2, sort = TRUE)
head(hades_tweets_counts)
# plot the hadestown word network (igraph and ggraph are already loaded above)
hades_tweets_counts %>%
  filter(n >= 24) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Word Network: Tweets mentioning Hadestown",
       subtitle = "Text mining twitter data",
       x = "", y = "")
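# save the network figure to disk (example filename; adjust as needed)
ggsave("hadestown_word_network.png", width = 8, height = 6, dpi = 300)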