-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtext_cleaning.py
More file actions
147 lines (91 loc) · 3.63 KB
/
text_cleaning.py
File metadata and controls
147 lines (91 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Import libraries
import pandas as pd
import numpy as np
import json
import yfinance as yf
import nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
def get_tweet_data():
# Download and save tweet data
with open('tweets.json', 'rb') as f:
data = json.load(f)
df = pd.DataFrame(data)
df.to_excel('tweets.xls', index=False)
return df
df1 = get_tweet_data()
df1
df1['text']
# Add word_count column
df1['word_count'] = df1['text'].apply(lambda x: len(str(x).split(" ")))
df1[['text', 'word_count']]
# Word count statistical description
df1[['text', 'word_count']].describe()
# 1-word tweets, usually hhtp links, # tags or @ tags)
df1[['text', 'word_count']][df1['word_count'] == 1].head()
df1[['text', 'word_count']][df1['word_count'] == 1].count()
# 20 most common words
freq_common = pd.Series(' '.join(df1['text']).split()).value_counts()[:20]
freq_common
# 20 less common words
freq_uncommon = pd.Series(' '.join(df1['text']).split()).value_counts()[-20:]
freq_uncommon
# Remove 'RT + source account' from tweets
retweets = df1["text"][df1['is_retweet'] == True]
removeRT = retweets.str[3:]
cleanretweets = removeRT.str.split(n=1).str[1]
df1["text"][df1['is_retweet'] == True] = cleanretweets
df1["text"][df1['is_retweet'] == True]
# Setting frequent words without contextual meaning, 'stop_words' is built-in, 'new_words' is chosen.
stop_words = set(stopwords.words('english'))
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)
# Removing links from tweets
#the old code
#df1["cleantext1"] = df1['text'].str.replace(
# 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', "")
r = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
df1["cleantext1"] = df1['text'].str.replace(r, '')
df1["cleantext1"]
# Remove punctuation
df1["cleantext2"] = df1["cleantext1"].str.replace('[^\w\s]', '')
df1["cleantext2"]
# Remove special characters and digits
df1["cleantext3"] = df1["cleantext2"].str.replace("(\\d|\\W)+", " ")
df1["cleantext3"]
# Put all words in lower case
df1["cleantext4"] = df1["cleantext3"].str.lower()
df1["cleantext4"]
# 20 most common words after cleaning
freq_common = pd.Series(' '.join(df1['cleantext4']).split()).value_counts()[:20]
freq_common
# 20 less common words after cleaning
freq_uncommon = pd.Series(' '.join(df1['cleantext4']).split()).value_counts()[-20:]
freq_uncommon
# Combine all the tweets into a corpus
# e.g. corpus = combine_to_corpus(df1['cleantext4'])
def combine_to_corpus(tweets):
return ' | '.join(tweets)
# Split the corpus into individual tweets
# e.g. series = split_to_tweets(corpus)
def split_to_tweets(corpus):
tweets = corpus.split(' | ')
res = pd.Series(tweets)
return res
# Examples:
corpus = combine_to_corpus(df1['cleantext4'])
corpus = corpus.upper()
df1['testtext'] = split_to_tweets(corpus)
'''
Ruixu:
1. Do we need to remove tags all together? @realDonaldTrump for example?
If we dont do so we will end up with someting as weird as m_forese.
2. We failed to remove links such as www.youtube.com/user/mattressserta (fixed)
3. We failed to remove punctuations such as '_' probably from hashtags like 'suffolk_sheriff'.
Do we need to replace '_' with a space then?
4. What does 'Remove 'RT + source account' from tweets' mean?
'''