# Adapted from http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
def extract_candidate_features(candidates, doc_text, doc_excerpt, doc_title):
    import collections, math, nltk, re

    candidate_scores = collections.OrderedDict()

    # get word counts for document
    doc_word_counts = collections.Counter(word.lower()
                                          for sent in nltk.sent_tokenize(doc_text)
                                          for word in nltk.word_tokenize(sent))

    for candidate in candidates:

        pattern = re.compile(r'\b' + re.escape(candidate) + r'(\b|[,;.!?]|\s)', re.IGNORECASE)

        # frequency-based
        # number of times candidate appears in document
        cand_doc_count = len(pattern.findall(doc_text))
        # the count can be zero if tokenization or chunking produced a candidate
        # that no longer matches the raw text; skip it in this simplified example
        if not cand_doc_count:
            print('**WARNING:', candidate, 'not found!')
            continue

        # statistical
        candidate_words = candidate.split()
        max_word_length = max(len(w) for w in candidate_words)
        term_length = len(candidate_words)
        # get frequencies for term and constituent words
        sum_doc_word_counts = float(sum(doc_word_counts[w] for w in candidate_words))
        try:
            # lexical cohesion doesn't make sense for 1-word terms
            if term_length == 1:
                lexical_cohesion = 0.0
            else:
                lexical_cohesion = term_length * (1 + math.log(cand_doc_count, 10)) * cand_doc_count / sum_doc_word_counts
        except (ValueError, ZeroDivisionError):
            lexical_cohesion = 0.0

        # positional
        # found in title, key excerpt
        in_title = 1 if pattern.search(doc_title) else 0
        in_excerpt = 1 if pattern.search(doc_excerpt) else 0
        # first/last position, difference between them (spread)
        doc_text_length = float(len(doc_text))
        first_match = pattern.search(doc_text)
        abs_first_occurrence = first_match.start() / doc_text_length
        if cand_doc_count == 1:
            spread = 0.0
            abs_last_occurrence = abs_first_occurrence
        else:
            for last_match in pattern.finditer(doc_text):
                pass
            abs_last_occurrence = last_match.start() / doc_text_length
            spread = abs_last_occurrence - abs_first_occurrence

        candidate_scores[candidate] = {'term_count': cand_doc_count,
                                       'term_length': term_length, 'max_word_length': max_word_length,
                                       'spread': spread, 'lexical_cohesion': lexical_cohesion,
                                       'in_excerpt': in_excerpt, 'in_title': in_title,
                                       'abs_first_occurrence': abs_first_occurrence,
                                       'abs_last_occurrence': abs_last_occurrence}

    return candidate_scores

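# Illustrative call (a sketch, not part of the original post): the candidates
# would typically come from extract_candidate_chunks() or extract_candidate_words()
# defined below, and doc_excerpt / doc_title are whatever abstract and title text
# is available for the document, e.g.:
#
#   candidates = extract_candidate_chunks(doc_text)
#   features = extract_candidate_features(candidates, doc_text, doc_excerpt, doc_title)
#   # features['some candidate'] -> {'term_count': ..., 'spread': ..., 'lexical_cohesion': ..., ...}
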
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk

    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))

    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)

    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))

    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:n_keywords]}
    keywords = set(word_ranks.keys())

    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i + 10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)

    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)

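# Illustrative call (a sketch): `text` is any plain-text string. A float
# n_keywords in (0, 1) is treated as the fraction of candidate words to keep;
# an int keeps that many. The return value is a list of (keyphrase, average
# PageRank score) tuples, highest-scoring first, e.g.:
#
#   score_keyphrases_by_textrank(doc_text, n_keywords=0.05)
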
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk

    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    else:
        raise ValueError("candidates must be 'chunks' or 'words'")
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf, dictionary

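# Illustrative use (a sketch): corpus_tfidf is a gensim streamed corpus aligned
# with `texts`; each document is a list of (term_id, tfidf_weight) pairs, and
# `dictionary` maps the ids back to candidate strings, e.g.:
#
#   corpus_tfidf, dictionary = score_keyphrases_by_tfidf(texts)
#   first_doc = next(iter(corpus_tfidf))
#   sorted(((dictionary[term_id], weight) for term_id, weight in first_doc),
#          key=lambda pair: pair[1], reverse=True)[:10]
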
def extract_candidate_words(text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
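

# ----------------------------------------------------------------------------
# Minimal end-to-end sketch (not part of the original blog snippet). It assumes
# nltk, networkx, and gensim are installed and that the standard NLTK tokenizer,
# POS-tagger, and stopwords data have already been downloaded; the sample
# strings below are placeholders for real documents.
if __name__ == '__main__':
    sample_docs = [
        "Automatic keyphrase extraction selects the most important terms in a "
        "document. Candidate phrases are scored with statistical and positional features.",
        "Graph-based ranking methods such as TextRank score candidate words by "
        "running PageRank over a word co-occurrence graph.",
    ]

    # candidate extraction on a single document
    print(extract_candidate_chunks(sample_docs[0]))
    print(extract_candidate_words(sample_docs[0]))

    # unsupervised keyphrase scoring with TextRank
    print(score_keyphrases_by_textrank(sample_docs[0], n_keywords=5))

    # corpus-level tf*idf weighting of candidate chunks
    corpus_tfidf, dictionary = score_keyphrases_by_tfidf(sample_docs)
    for doc in corpus_tfidf:
        print(sorted(((dictionary[term_id], weight) for term_id, weight in doc),
                     key=lambda pair: pair[1], reverse=True))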