-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTexualFeatureExtractor.py
More file actions
72 lines (60 loc) · 2.09 KB
/
TexualFeatureExtractor.py
File metadata and controls
72 lines (60 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#Count number of words which are in the WordNet dictionary
def extractFormality(tokens):
    """Return the fraction of tokens that appear in the WordNet dictionary.

    tokens: iterable of word strings (as produced by extractTokens).
    Returns a float in [0, 1]; 0.0 for an empty token list.
    """
    # Guard: the original divided by len(tokens) unconditionally and
    # raised ZeroDivisionError on an empty post.
    if not tokens:
        return 0.0
    # wordnet.synsets(token) returns a non-empty list when the word is known.
    known = sum(1 for token in tokens if wordnet.synsets(token))
    # True division already yields a float; the original's triple float()
    # cast was redundant.
    return known / len(tokens)
#Extract tokens using nltk tool
def extractTokens(text):
    """Tokenize `text` with NLTK's word_tokenize and return the token list."""
    return word_tokenize(text)
#Extract number of tokens which are repeated during posts
def extractRepeatation(tokens, text):
    """Return the average number of occurrences of each token in `text`.

    tokens: iterable of word strings; text: the full post content.
    Returns total occurrence count divided by the number of tokens,
    or 0.0 for an empty token list.

    Bug fix: the original did `number = getAllTheMatch(word, text)` inside
    the loop, overwriting the running total each iteration, so only the
    LAST token's count was ever divided by len(tokens).
    """
    # Guard against ZeroDivisionError on an empty post.
    if not tokens:
        return 0.0
    total = 0
    for word in tokens:
        total += getAllTheMatch(word, text)
    return total / len(tokens)
#Find one match of `search` in `text`: (None, 0) means no match; otherwise
#return the text remaining after the match together with the flag 1.
def countMatch(search, text):
    """Locate the first occurrence of `search` in `text`.

    Returns (remaining_text, 1) when found, where remaining_text starts
    just past the end of the match; returns (None, 0) when absent.
    """
    position = text.find(search)
    if position == -1:
        return None, 0
    return text[position + len(search):], 1
#Count number of matches of a token in whole text content
def getAllTheMatch(search, text):
    """Count non-overlapping occurrences of `search` in `text`.

    After each hit the scan resumes just past the end of the match,
    so overlapping occurrences are not double-counted.
    """
    total = 0
    position = text.find(search)
    while position != -1:
        total += 1
        # Drop everything up to and including this match, then rescan.
        text = text[position + len(search):]
        position = text.find(search)
    return total
#Extract capital letters and words
def ExtractCapital(tokens):
    """Count fully-uppercase words and individual uppercase letters.

    Returns (capitalWord, capitalLetter): the number of tokens whose
    characters are all uppercase, and the total count of uppercase
    characters across every token.
    """
    word_count = 0
    letter_count = 0
    for token in tokens:
        uppers_in_token = sum(1 for ch in token if ch.isupper())
        letter_count += uppers_in_token
        # A token counts as a capital word when every character is upper
        # (note: this also counts an empty token, matching len comparison).
        if uppers_in_token == len(token):
            word_count += 1
    return word_count, letter_count
#Remove stop words from the text
# This function is language dependent, therefore it is needed to detect the
# language first and then run this function on that particular language.
# This function is for English.
def removeStopWords(text):
    """Tokenize `text` and return the tokens with English stop words removed.

    Performance fix: the original evaluated stopwords.words('english')
    inside the comprehension condition — once PER TOKEN, each call hitting
    the NLTK corpus reader. Build the stop-word set once; set membership
    is also O(1) versus O(n) list scans.
    """
    stop_set = set(stopwords.words('english'))
    tokens = extractTokens(text)
    return [w for w in tokens if w not in stop_set]