"""function.py — text-cleaning helpers: contraction expansion, tokenization,
stop-word removal, and POS-aware lemmatization built on NLTK."""
import re
import string

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus.reader.wordnet import ADJ, ADV, NOUN, VERB
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time corpus downloads; quiet=True suppresses progress output on
# repeat runs. (Newer NLTK releases may additionally require 'punkt_tab'.)
for resource in ("stopwords", "punkt", "averaged_perceptron_tagger", "wordnet"):
    nltk.download(resource, quiet=True)

class TextCleaner:
    """Cleans raw text into a list of lemmatized, stop-word-free tokens."""

    # Contractions are expanded before punctuation is stripped; otherwise
    # "won't" would collapse to "wont" and never match a stop word.
    CONTRACTIONS = {
        r"i'm": "i am",
        r"he's": "he is",
        r"she's": "she is",
        r"that's": "that is",
        r"what's": "what is",
        r"where's": "where is",
        r"'ll": " will",
        r"'ve": " have",
        r"'re": " are",
        r"'d": " would",
        r"won't": "will not",
        r"can't": "cannot",
        r"don't": "do not",
    }

    def __init__(self):
        self.stop_words = set(stopwords.words("english"))
        self.punctuations = set(string.punctuation)
        # Map Penn Treebank tags onto the four WordNet POS classes the
        # lemmatizer understands.
        self.pos_tags = {
            NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
            VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            ADJ: ['JJ', 'JJR', 'JJS'],
            ADV: ['RB', 'RBR', 'RBS', 'WRB'],
        }

    def _remove_stop_words(self, words):
        return [w for w in words if w not in self.stop_words]

    def _remove_regex(self):
        """Lower-case, expand contractions, drop hashtags and punctuation."""
        # Lower-case the sentence as a whole. (The original joined the
        # *characters* of the string with spaces, garbling the input.)
        self.input_sent = self.input_sent.lower()
        for pattern, replacement in self.CONTRACTIONS.items():
            self.input_sent = re.sub(pattern, replacement, self.input_sent)
        # Remove hashtags in a single pass. Re-substituting each matched
        # string as a new pattern (as before) breaks on regex metacharacters.
        self.input_sent = re.sub(r"#\w*", "", self.input_sent)
        self.input_sent = "".join(
            ch for ch in self.input_sent if ch not in self.punctuations
        )

    def _tokenize(self):
        return word_tokenize(self.input_sent)

    def _process_content_for_pos(self, words):
        """POS-tag words and map each Penn Treebank tag to a WordNet POS."""
        pos_words = []
        for token, tag in pos_tag(words):
            for wordnet_pos, treebank_tags in self.pos_tags.items():
                if tag in treebank_tags:
                    pos_words.append((token, wordnet_pos))
                    break
            else:
                # Unmapped tags fall back to NOUN, the lemmatizer's default.
                pos_words.append((token, NOUN))
        return pos_words

    def _remove_noise(self):
        self._remove_regex()
        words = self._tokenize()
        return self._remove_stop_words(words)

    def _normalize_text(self, words):
        lem = WordNetLemmatizer()
        pos_words = self._process_content_for_pos(words)
        return [lem.lemmatize(w, pos=p) for w, p in pos_words]

    def clean_up(self, input_sent):
        """Return the cleaned, lemmatized tokens of input_sent."""
        self.input_sent = input_sent
        cleaned_words = self._remove_noise()
        return self._normalize_text(cleaned_words)
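
# A minimal usage sketch, not part of the original module; the sample
# sentence and the expected output below are illustrative assumptions.
if __name__ == "__main__":
    cleaner = TextCleaner()
    sample = "She's reading three books, and she won't stop #reading!"
    print(cleaner.clean_up(sample))
    # Prints lemmatized, stop-word-free tokens, roughly:
    # ['read', 'three', 'book', 'stop']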