-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatautils.py
More file actions
80 lines (55 loc) · 2.28 KB
/
datautils.py
File metadata and controls
80 lines (55 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
data loading and preprocessing scripts
"""
from nltk.tokenize import sent_tokenize
def documents(path="./data/Summarization-master/data"):
    """Lazily yield ``(file_object, filename)`` for each ``.txt`` file in *path*.

    The directory is listed when iteration starts and entries are visited in
    whatever order ``os.listdir`` reports them.  Only names ending in
    ``.txt`` are opened (default text read mode).

    The yielded file objects are NOT closed by this generator; the consumer
    owns each handle and is responsible for closing it.
    """
    import os

    text_names = (name for name in os.listdir(path) if name.endswith(".txt"))
    for name in text_names:
        handle = open(os.path.join(path, name))
        yield (handle, name)
def load_clusters():
    """Read every document yielded by ``documents()`` into a sentence list.

    Each file is treated as one cluster: every line is stripped, split into
    sentences with ``sent_tokenize`` and all sentences of the file are
    appended to a single flat list.  Blank lines and lines consisting only
    of ``#`` are skipped.

    Returns:
        dict mapping each filename to the list of sentences found in it.
    """
    clusters = {}
    for file_handler, filename in documents():
        cluster = []
        try:
            for line in file_handler:
                text = line.strip()
                # The previous test (`line.strip() != "#" or " "`) was always
                # true because the non-empty literal " " is truthy, so "#"
                # lines leaked into the clusters as sentences.
                if not text or text == "#":
                    continue
                # Append all the sentences of the file to one flat cluster.
                cluster.extend(sent_tokenize(text))
        finally:
            # The handle is handed over open; close it once it has been
            # read so descriptors are not leaked across many files.
            file_handler.close()
        clusters[filename] = cluster
    return clusters
def nlp_preprocessing():
    """Normalise every sentence of every cluster from ``load_clusters()``.

    Each sentence is tokenised with NLTK's ``word_tokenize``; tokens are
    lower-cased, stripped of all ``string.punctuation`` characters, and
    empty tokens are dropped.  The surviving tokens are re-joined with single
    spaces and written back in place of the original sentence.  A sentence
    that normalises to the empty string is left as it was.

    Stop words are deliberately NOT removed here: the wording is kept for
    BLEU scoring against human-written summaries, and stop words can be
    dropped later at vectorisation / embedding time.

    A failure on one sentence is reported on stdout and that sentence is
    left unchanged; processing continues with the next one.

    Returns:
        dict mapping each filename to its list of (normalised) sentences.
    """
    import re
    import string

    from nltk.tokenize import word_tokenize

    clusters = load_clusters()
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    # .items() (not .iteritems()) so the loop runs on Python 2 and 3 alike.
    for filename, sentences in clusters.items():
        for index, sentence in enumerate(sentences):
            try:
                # Lower-case first, then strip punctuation.  token.lower()
                # works for both byte and unicode tokens, unlike str.lower.
                cleaned = [
                    punctuation.sub('', token.lower())
                    for token in word_tokenize(sentence)
                ]
                # Build a real list and join once: lazy map/filter objects
                # would be exhausted by a first join on Python 3.
                normalised = " ".join(token for token in cleaned if token != '')
                if normalised != "":
                    sentences[index] = normalised
            except Exception as e:
                # Exception rather than BaseException so Ctrl-C and
                # SystemExit still stop the run.
                print("error %s %s %s %s" % (filename, index, sentence, e))
    return clusters
# Run the preprocessing pipeline; the returned clusters are discarded here.
# NOTE(review): this call appears to be unguarded at module level, so it
# presumably also runs whenever the module is imported - confirm whether an
# `if __name__ == "__main__":` guard is wanted.
nlp_preprocessing()