""" import statements """
import json
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from AbstractFilePreprocessing import AbstractFilePreprocessing
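# Note: word_tokenize, pos_tag, WordNetLemmatizer, and the stopword list rely on
# NLTK data packages; if they are missing, they can typically be fetched once via, e.g.:
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')
#   nltk.download('wordnet')
#   nltk.download('stopwords')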


class LemmatizationFilePreprocessing(AbstractFilePreprocessing):
    """Implements lemmatization-based preprocessing; specializes the abstract base class AbstractFilePreprocessing."""

    @staticmethod
    def string_transformation(input_string):
        """Return a list containing the (lemmatized) words of the document.

        This method has to be used for query preprocessing as well.
        """
        transformed_input = []
        # tokenize words
        tokenized_string = nltk.word_tokenize(input_string)
        # POS tagging
        pos_tagged_tokens = nltk.pos_tag(tokenized_string)
        # initialize lemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        # lemmatize
        for pos_tagged_token in pos_tagged_tokens:
            word, part_of_speech = pos_tagged_token
            if str(part_of_speech).startswith("N"):
                # token is a noun
                transformed_input.append(wordnet_lemmatizer.lemmatize(word, pos=wordnet.NOUN))
            elif str(part_of_speech).startswith("V"):
                # token is a verb
                transformed_input.append(wordnet_lemmatizer.lemmatize(word, pos=wordnet.VERB))
            elif str(part_of_speech).startswith("J"):
                # token is an adjective
                transformed_input.append(wordnet_lemmatizer.lemmatize(word, pos=wordnet.ADJ))
            elif str(part_of_speech).startswith("R"):
                # token is an adverb
                # note: the lemmatizer does not handle adverbs, so use the pertainym
                # of the first WordNet sense as the base form
                extended_adverb = word + ".r.1"
                # handle the exception that no lemma is found
                try:
                    lemmas = wordnet.synset(extended_adverb).lemmas()
                    lemmatized_adverb = lemmas[0].pertainyms()[0].name()
                except (IndexError, AttributeError, nltk.corpus.reader.wordnet.WordNetError):
                    lemmatized_adverb = word
                # add base form
                transformed_input.append(lemmatized_adverb)
            else:
                # token is not tagged as noun/verb/adjective/adverb -> simply add the token
                transformed_input.append(word)
        # initialize return structure
        return_structure = []
        # delete everything except characters
        for token in transformed_input:
            # replace anything that is not a (lowercase or accented) letter with an empty string
            return_structure.append(re.sub("[^a-züäößáàéè]", "", str(token).lower()))
        # remove empty entries
        return_structure = [x for x in return_structure if x]
        # english stopword list
        stopword_list = set(stopwords.words('english'))
        # stopword removal
        return_structure = [i for i in return_structure if i not in stopword_list]
        return return_structure
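    # Illustrative sketch of the expected behaviour (exact output depends on the
    # installed NLTK tagger/WordNet data):
    #   string_transformation("The cats were running quickly")
    #   would yield roughly ['cat', 'run', 'quick'] after lemmatization, cleanup,
    #   and stopword removal ("the" and the verb lemma "be" are stopwords).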

    @staticmethod
    def save_bag_of_words(path_to_corpus, name_of_target_file):
        """Read the 20 newsgroups corpus, process it, and write the BOW (bag of words) to the specified file."""
        # dictionary for the data structure: file path -> BOW
        all_files = AbstractFilePreprocessing.__get_paths_to_resource_files__(path_to_corpus)
        collection = {}
        # loop over all files
        for file_path in all_files:
            # open file, process content and add to dictionary
            with open(file_path, 'rt', encoding='utf-8', errors='replace') as file:
                print('Processing File ' + file_path)
                if path_to_corpus.endswith("test"):
                    # FOR TEST DATA
                    # selects the file name after 20news-bydate-test in the file path
                    regex_to_get_new_file_name = "(?<=20news-bydate-test\\/)(.*)"
                elif path_to_corpus.endswith("train"):
                    # FOR TRAIN DATA
                    # selects the file name after 20news-bydate-train in the file path;
                    # unescaped: (?<=20news-bydate-train\/)(.*)
                    regex_to_get_new_file_name = "(?<=20news-bydate-train\\/)(.*)"
                else:
                    # guard against an undefined regex further down
                    raise ValueError("path_to_corpus is expected to end with 'test' or 'train'")
                regexer = re.search(regex_to_get_new_file_name, file_path)
                key = regexer.group(0)  # get file name
                data = file.read()
                value = LemmatizationFilePreprocessing.string_transformation(data)
                collection[key] = value  # write file name (key) and BOW (value) into the collection
        # save the dictionary
        with open(name_of_target_file, 'w+') as outfile:
            json.dump(collection, outfile)
        print("Number of files processed: " + str(len(collection)))
        print("Result saved in " + name_of_target_file)
        return collection
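

# Minimal usage sketch (the corpus and output paths below are placeholders, not
# taken from the original module):
#
#   LemmatizationFilePreprocessing.save_bag_of_words(
#       "20news-bydate/20news-bydate-train", "bow_train.json")
#   LemmatizationFilePreprocessing.save_bag_of_words(
#       "20news-bydate/20news-bydate-test", "bow_test.json")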