-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
113 lines (93 loc) · 2.89 KB
/
preprocessing.py
File metadata and controls
113 lines (93 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
from ekphrasis.classes.segmenter import Segmenter
from abbreviations import abbreviations
from nltk.stem import WordNetLemmatizer
#from nltk.stem import PorterStemmer
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
class Preprocessing:
    """Text preprocessing pipeline for predicates/literals.

    Segments concatenated predicates (e.g. ``wasbornin`` -> ``was born in``),
    expands abbreviations via the project's ``abbreviations`` dictionary, and
    lemmatizes tokens (verbs as verbs, everything else as nouns).
    """

    # Compiled once for the class: matches runs of characters that are not
    # ASCII letters or digits (same pattern the original inlined per call).
    _NON_ALNUM = re.compile('[^A-Za-z0-9]+')

    def __init__(self, segmenter):
        """
        Args:
            segmenter: an ekphrasis ``Segmenter`` (word statistics from
                Wikipedia) used to split concatenated predicates into words.
        """
        self.seg = segmenter
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # Materialized as a set for O(1) membership tests in __remove_stopwords.
        self.en_stopwords = set(stopwords.words('english'))

    def __check_for_abbreviations(self, text):
        """
        Replace abbreviations using the ``abbreviations`` dictionary.

        Args:
            text(list): list of words in a predicate
        Returns:
            list of the same words, with each abbreviation replaced by the
            word(s) of its expansion (an expansion may span several words)
        """
        full_words = []
        for word in text:
            if word in abbreviations:
                full_words.extend(abbreviations[word].split())
            else:
                full_words.append(word)
        return full_words

    def __remove_special_characters_from_list(self, literals):
        """
        Strip non-alphanumeric characters from every string in the list.

        NOTE: the list is modified in place and also returned (matches the
        original contract).

        Args:
            literals(list): list of predicates/literals
        Returns:
            the same list, each entry stripped of special characters
        """
        literals[:] = [self.__remove_special_characters(s) for s in literals]
        return literals

    def __remove_special_characters(self, string):
        """
        Strip non-alphanumeric characters from a single string.

        Args:
            string(str): predicate/literal
        Returns:
            the string with every character outside [A-Za-z0-9] removed
        """
        return self._NON_ALNUM.sub('', string)

    def __remove_stopwords(self, predicate):
        """
        Drop English stopwords from a tokenized predicate.

        Args:
            predicate(list): words of a predicate/literal
        Returns:
            the same words with English stopwords removed
        """
        return [word for word in predicate if word not in self.en_stopwords]

    def __segment(self, text):
        """
        Split a concatenated predicate into words: wasbornin -> "was born in".

        Args:
            text(str): a single concatenated predicate/literal
        Returns:
            the segmented text as one space-separated string
        """
        return self.seg.segment(text)

    def pre_process_text(self, text):
        """
        Full preprocessing for a single predicate: segment, expand
        abbreviations, POS-tag, then lemmatize each token.

        Args:
            text(str): a single predicate, possibly concatenated
                (e.g. ``actorhasmembersofproject``)
        Returns:
            list of lemmatized tokens
        """
        # Map Penn Treebank tag initials to WordNet POS: 'V*' tags lemmatize
        # as verbs, every other tag falls back to noun.
        tag_map = defaultdict(lambda: wordnet.NOUN)
        tag_map['V'] = wordnet.VERB
        tokens = self.__check_for_abbreviations(self.__segment(text).split())
        predicate = []
        for token, tag in pos_tag(tokens):
            if token == 'as':
                # Special-cased so the lemmatizer does not alter 'as'
                # (presumably it would reduce it as a plural noun); kept
                # verbatim to preserve the original behavior.
                predicate.append(token)
                continue
            predicate.append(self.wordnet_lemmatizer.lemmatize(token, tag_map[tag[0]]))
        return predicate
#from ekphrasis.classes.segmenter import Segmenter
#segmenter = Segmenter(corpus="english")
#test = Preprocessing(segmenter)
#print(test.pre_process_text('actorhasmembersofproject'))