# FakeReviewDetector/processText.py at master · artik008/FakeReviewDetector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
# import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import regexp_tokenize

import pymorphy2

import sys

def sent_tokenizer(s):
  """Split the text `s` into sentences using NLTK's `sent_tokenize`."""
  return sent_tokenize(s)


def punkt_tokenizer(s):
  """Tokenize `s`, treating ASCII letters, digits and whitespace as gaps.

  With `gaps=True` the pattern describes the separators, so the tokens are
  the runs of characters that the pattern does not match.
  """
  return regexp_tokenize(s, r'[a-zA-Z0-9\s]', gaps=True)


def punkt_tokenizer_2(s):
  """Return every single punctuation character of `s` matched by the pattern.

  With `gaps=False` the pattern describes the tokens themselves, so each
  occurrence of a listed punctuation mark becomes one token.
  """
  return regexp_tokenize(s, r'[,\.\?!"$%<>/\-\–]', gaps=False)


# Morphological analyzer, created once at module load and reused for all words.
morph = pymorphy2.MorphAnalyzer()


def only_word_tokenizer(s):
  """Split `s` into words, using punctuation, digits and whitespace as gaps."""
  return regexp_tokenize(s, r'[,\.\?!"\s0-9$%\(\)\-\–]', gaps=True)


# The text to analyse is passed as the first command-line argument.
# Exit with a usage message instead of a bare IndexError traceback when it is
# missing; sys.exit() with a string prints it to stderr and exits with status 1.
if len(sys.argv) < 2:
  sys.exit("usage: processText.py <text>")

text = sys.argv[1]

words = only_word_tokenizer(text)   # word tokens (separators removed)
sents = sent_tokenize(text)         # sentences
puncts = punkt_tokenizer_2(text)    # individual punctuation marks

# Tally how many words fall under each part-of-speech label.
speechParts = {}
for token in words:
  # Only the first parse returned by the analyzer is used. Because of str(),
  # a POS of None would be counted under the key "None".
  pos_label = str(morph.parse(token)[0].tag.POS)
  speechParts[pos_label] = speechParts.get(pos_label, 0) + 1

# Count the occurrences of each punctuation mark found in the text.
punctCount = {}
for mark in puncts:
  # A double quote is stored pre-escaped, since every key is later wrapped in
  # double quotes when the result string is assembled.
  key = '\\"' if mark == '"' else mark
  punctCount[key] = punctCount.get(key, 0) + 1

# Aggregate figures used for the averages in the final report.
words_count = len(words)
sent_count = len(sents)
total_word_length = sum(len(w) for w in words)  # combined length of all words

def _ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero.

  Guards the averages against ZeroDivisionError for input in which no
  sentences or no words were found (e.g. empty or punctuation-only text).
  """
  return numerator / denominator if denominator else 0.0


def _format_pairs(counts):
  """Render each (key, count) item of the dict `counts` as the text ("key",count)."""
  return ['("' + str(key) + '",' + str(count) + ')' for key, count in counts.items()]


punctPairs = _format_pairs(punctCount)
speechPartsPairs = _format_pairs(speechParts)

# Fields of the single-line record, in the order they are emitted.
record_fields = [
  "ptAvgWordsInSent = " + str(_ratio(words_count, sent_count)),
  "ptWordCount = " + str(words_count),
  "ptAvgWordLength = " + str(_ratio(total_word_length, words_count)),
  "ptPunctuation = [" + ','.join(punctPairs) + "]",
  "ptSpeechParts = [" + ','.join(speechPartsPairs) + "]",
]

result = "ProcessedTextInfo {" + ",".join(record_fields) + "}"

# No trailing newline: the record is the entire stdout of the script.
print(result, end='')