-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
267 lines (227 loc) · 10.9 KB
/
classifier.py
File metadata and controls
267 lines (227 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# -------------------------------------------------------
# Assignment (2)
# Written by (Mohamed Hefny, 40033382)
# For COMP 472 Section (ABIX) – Summer 2020
# --------------------------------------------------------
import pandas
import matplotlib.pyplot as plt
from sklearn import metrics
import nltk_functions
import json
import math
import vocab
import const
import time
# Column names assigned to the whitespace-separated model file when it is loaded.
INPUT_COLUMNS = [
    "counter",
    "word",
    "freq_story",
    "prob_story",
    "freq_ask",
    "prob_ask",
    "freq_show",
    "prob_show",
    "freq_poll",
    "prob_poll",
]

# Initialize the x/y buffers used for drawing the frequency graphs.
# classify() appends one point per frequency-filter run; draw_graph() plots them.
# Series filled when top_freq is False.
xticks = []
# Series filled when top_freq is True.
xticks_top = []
yticks_fmeasure = []
yticks_accuracy = []
yticks_recall = []
yticks_precision = []
yticks_top_fmeasure = []
yticks_top_accuracy = []
yticks_top_recall = []
yticks_top_precision = []
def draw_graph():
    """
    draws the performance metrics collected in the module-level tick lists
    as two side-by-side subplots and shows the figure:
    left  -> xticks against the four yticks_* series
    right -> xticks_top against the four yticks_top_* series
    """
    figure, (left_ax, right_ax) = plt.subplots(1, 2)

    # (axis, x values, [(y values, legend label, marker), ...]) per subplot
    panels = (
        (left_ax, xticks, (
            (yticks_fmeasure, "F-measure", 'x'),
            (yticks_accuracy, "accuracy", 'D'),
            (yticks_precision, "precision", '*'),
            (yticks_recall, "recall", 'P'),
        )),
        (right_ax, xticks_top, (
            (yticks_top_fmeasure, "F-measure", 'x'),
            (yticks_top_accuracy, "accuracy", 'D'),
            (yticks_top_precision, "precision", '*'),
            (yticks_top_recall, "recall", 'P'),
        )),
    )
    for axis, x_values, series in panels:
        for y_values, legend_label, marker_symbol in series:
            axis.plot(x_values, y_values, label=legend_label, marker=marker_symbol)

    # titles, shared x-axis caption and legends
    plt.setp(left_ax, ylabel='F-measure')
    left_ax.set_title('Removal of low frequency\n By number')
    right_ax.set_title('Removal of high frequency\n By percentage')
    figure.text(0.5, 0.04, 'Words left in vocab', ha='center', va='center')
    for axis in (left_ax, right_ax):
        axis.legend()
    plt.show()
def load(input):
    """
    loads the data model text
    param input: path of the whitespace-separated model file to be loaded
    return: pandas data frame with one row per line of the file; the file has
            no header row, so the columns are named after INPUT_COLUMNS
    """
    # raw string: "\s" is not a valid escape sequence in a normal string literal
    return pandas.read_csv(input, sep=r"\s+", header=None, names=INPUT_COLUMNS)
def load_test(input, year=2019):
    """
    loads the data csv file to get only the testing data
    param input: csv file; must contain the "Created At" and "Title" columns
    param year: calendar year of "Created At" whose rows form the test set
                (defaults to 2019)
    return: independent data frame holding only the rows of the given year,
            with an added "Year" column and a normalized "Title" column
    """
    data = pandas.read_csv(input)
    data["Created At"] = pandas.to_datetime(data["Created At"])
    data["Year"] = data["Created At"].dt.year
    # normalize titles: collapse whitespace runs to one space, lowercase, trim
    # (raw string: "\s" is not a valid escape sequence in a normal literal)
    data["Title"] = data["Title"].replace(r'\s+', ' ', regex=True).str.lower()
    data["Title"] = data["Title"].str.strip()
    # return a copy so callers can add columns without pandas'
    # SettingWithCopyWarning on a slice of the full frame
    return data[data["Year"] == year].copy()
def classify(csv, input, output, stop_words, frequency_filter=False, word_length_filter=False, top_freq=False, baseline=False):
    """
    classifies the testing data
    param csv: csv file to extract the test data from (read with load_test)
    param input: model file to classify upon (read with load)
    param output: file to write the classification to; only opened and
                  written when frequency_filter is False
    param stop_words: set of stop words passed to the lemmatizer
    param frequency_filter: if frequency filter experiment; no result file is
                            written, instead the metrics are appended to the
                            module-level graph lists
    param word_length_filter: if word length filter experiment
    param top_freq: if top frequency from the infrequent filter experiment;
                    selects which set of graph lists receives the metrics
    param baseline: if baseline experiment
    return: None
    """
    #load the model data to classify based on
    model_data = load(input)
    print('processing, classification files')
    #load the test data
    test_set = load_test(csv)
    #lemmatize all the test data titles
    test_set['Titlelemma'] = test_set.apply(
        lambda row: nltk_functions.corpus_lemmatization(
            corpus_string=row['Title'], stopwords=stop_words,
            word_length_filtering=word_length_filter, baseline=baseline),
        axis=1)

    voc_size = len(model_data)
    with open("output/post_count.json") as json_file:
        all_post = json.load(json_file)
    #number of training posts per type, in the same order as `types`
    total_post = all_post["total"]
    class_counts = (all_post["story"], all_post["ask"], all_post["show"], all_post["poll"])
    types = ["story", "ask_hn", "show_hn", "poll"]
    #log10 priors never change between titles, so compute them once
    priors = [_log10_prior(class_count, total_post) for class_count in class_counts]
    #a type without training posts keeps its -inf prior and ignores word evidence
    has_posts = [class_count != 0 for class_count in class_counts]
    #one dictionary lookup per word instead of filtering the data frame per title
    word_probabilities = _index_word_probabilities(model_data)

    #get a list of the actual post types
    true_list = test_set["Post Type"].tolist()
    prediction_list = []
    out_file = None if frequency_filter else open(output, "w+", encoding="utf-8")
    try:
        #iterate through the test data titles
        for count, (_, row) in enumerate(test_set.iterrows(), start=1):
            #presumably a list of word tokens produced by the lemmatizer - TODO confirm
            title = row["Titlelemma"]
            title_plain = row["Title"]
            post_type = row["Post Type"]
            scores = list(priors)
            #heuristic: pin the score of a type when the title names it
            #NOTE(review): the condition also reads post_type, i.e. the true
            #label of the row being classified, so the label leaks into the
            #prediction - confirm this is intended
            if const.HEURSTIC:
                if post_type == "ask_hn" and ("ask hn" in title_plain or "ask_hn" in title_plain):
                    scores[1] = 10
                if post_type == "show_hn" and ("show hn" in title_plain or "show_hn" in title_plain):
                    scores[2] = 10
                if post_type == "poll" and "poll" in title:
                    scores[3] = 10
            #add the log10 probability of every title word known to the model
            for word in title:
                probabilities = word_probabilities.get(word)
                if probabilities is None:
                    continue
                for position, probability in enumerate(probabilities):
                    if has_posts[position]:
                        scores[position] += math.log(probability, 10)
            #get the most probable type (first one wins on a tie)
            prediction = types[scores.index(max(scores))]
            if out_file is not None:
                #check if the prediction matches the actual type
                result = "right" if prediction == post_type else "wrong"
                line = "%d %s %s %f %f %f %f %s %s\n" % (count, title_plain, prediction,
                                                         scores[0], scores[1], scores[2], scores[3],
                                                         post_type, result)
                out_file.write(line)
            prediction_list.append(prediction)
    finally:
        #always release the result file, even when a row fails
        if out_file is not None:
            out_file.close()

    if frequency_filter:
        #get performance metrics for the frequency filter classification
        fScore, accuracy, recall, precision = get_metrics(true_list, prediction_list)
        #if generating the percentage frequency graph
        if top_freq:
            xticks_top.append(voc_size)
            yticks_top_fmeasure.append(fScore)
            yticks_top_accuracy.append(accuracy)
            yticks_top_precision.append(precision)
            yticks_top_recall.append(recall)
        #if generating the numerical frequency graph
        else:
            xticks.append(voc_size)
            yticks_fmeasure.append(fScore)
            yticks_accuracy.append(accuracy)
            yticks_precision.append(precision)
            yticks_recall.append(recall)


def _log10_prior(class_count, total_count):
    """
    returns log10(class_count / total_count), or -inf when class_count is 0
    """
    if class_count == 0:
        return float("-inf")
    return math.log(class_count / total_count, 10)


def _index_word_probabilities(model_data):
    """
    maps every word of the model data frame to its
    (prob_story, prob_ask, prob_show, prob_poll) tuple;
    the first row wins when a word is listed more than once
    """
    table = {}
    rows = zip(model_data["word"], model_data["prob_story"], model_data["prob_ask"],
               model_data["prob_show"], model_data["prob_poll"])
    for word, prob_story, prob_ask, prob_show, prob_poll in rows:
        table.setdefault(word, (prob_story, prob_ask, prob_show, prob_poll))
    return table
def get_metrics(true_list,prediction_list):
    """
    computes the performance metrics for a pair of label lists
    true_list: list of the true classifications
    prediction_list: list of the predicted classifications
    returns the tuple (f-score, accuracy, recall, precision); f-score,
    recall and precision are requested with the "weighted" average
    """
    label_pair = (true_list, prediction_list)
    f_measure = metrics.f1_score(*label_pair, average="weighted")
    hit_rate = metrics.accuracy_score(*label_pair, normalize=True)
    weighted_recall = metrics.recall_score(*label_pair, average='weighted')
    weighted_precision = metrics.precision_score(*label_pair, average='weighted')
    return (f_measure, hit_rate, weighted_recall, weighted_precision)
def predict_classifier(stopword_removal=False, word_length_filter=False, frequency_filter=False, input_filename="", output_filename="",top_freq=False,baseline=True):
    """
    picks the model file, prediction file and stop words for the requested
    experiment and runs classify on const.INPUT_TEST with them
    param stopword_removal: if stopword experiment
    param word_length_filter: if word length filter experiment
    param frequency_filter: if frequency filter experiment
    param input_filename: model file to analyze (used by the frequency filter experiment)
    param output_filename: classification file to write (used by the frequency filter experiment)
    param top_freq: if using the infrequent word filtering experiment and analyzing the top % frequency
    param baseline: if using the baseline experiment
    """
    print("Classifying...")
    # baseline settings; each enabled experiment below overrides the previous
    # choice, so the last enabled flag decides what classify receives
    model_path, result_path, stops = const.MODEL_FILE, const.BASELINE_RESULT, set()

    # experiment 1: stop-word removal
    if stopword_removal:
        print("Classifying using experiment 1 (stop-word)")
        model_path, result_path = const.STOPWORD_MODEL, const.STOPWORD_RESULT
        stops = vocab.get_stopwords(const.STOPWORDS)

    # experiment 2: word-length filtering (runs without stop words)
    if word_length_filter:
        print("classifying using experiment 2 (word-length)")
        model_path, result_path, stops = const.WORDLENGTH_MODEL, const.WORDLENGTH_RESULT, set()

    # experiment 3: frequency filtering on caller-supplied files (runs without stop words)
    if frequency_filter:
        print("clasifying using experiment 3 (frequency-filter)")
        model_path, result_path, stops = input_filename, output_filename, set()

    classify(
        const.INPUT_TEST,
        model_path,
        result_path,
        frequency_filter=frequency_filter,
        word_length_filter=word_length_filter,
        stop_words=stops,
        top_freq=top_freq,
        baseline=baseline,
    )