-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathContentExtractor.py
More file actions
62 lines (56 loc) · 2.46 KB
/
ContentExtractor.py
File metadata and controls
62 lines (56 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from sklearn.externals import joblib
from sklearn.svm import SVC
class ContentExtractor:
    """Pick title, text and image content out of a list of page blocks.

    Extraction runs in two stages. A first classifier labels every block;
    blocks labelled ``1`` are kept as content candidates. A second classifier
    then labels those candidates; candidates labelled ``1`` are titles, the
    rest are sorted into text or images by their ``type`` attribute.

    Each classifier has a companion transformer (``sc`` / ``sc_title``) whose
    ``transform`` is applied to the feature rows before ``predict``.
    NOTE(review): judging by the file names these are presumably SVM models
    and feature scalers -- confirm against the training code.

    NOTE(review): ``joblib.load`` unpickles the model files, so they must
    only ever come from a trusted location.
    """

    # Directory (relative to the current working directory) holding the models.
    _MODEL_DIR = "model"

    def __init__(self, mode):
        """Select the model set for ``mode`` and try to load it.

        Args:
            mode: ``1`` selects the "news" models, ``2`` the "blog" models
                and ``3`` the "shop" models. Any other value selects nothing.

        Loading is best-effort: if the mode is unknown or any model file
        cannot be loaded, a "No learned model" message is printed and the
        models that were not loaded stay ``None``. No exception is raised.
        """
        # Local import so this class does not depend on the module header.
        import os

        self.mode = mode
        # Start from a fully defined state so later attribute access never
        # fails just because loading did not succeed.
        self.BlockList = []
        self.classifier = None
        self.classifier_title = None
        self.sc = None
        self.sc_title = None
        self.classifier_path = None
        self.classifier_title_path = None
        self.sc_path = None
        self.sc_title_path = None

        if mode == 1:
            tag = "news"
        elif mode == 2:
            tag = "blog"
        elif mode == 3:
            tag = "shop"
        else:
            tag = None

        if tag is None:
            print("!!!\n\tNo learned model\n!!!")
            return

        # os.path.join yields the same backslash paths as before on Windows
        # and valid paths elsewhere; it also avoids the invalid "\s" escape
        # the old string literals relied on.
        self.classifier_path = os.path.join(
            self._MODEL_DIR, "svm_{0}.pkl".format(tag))
        self.classifier_title_path = os.path.join(
            self._MODEL_DIR, "svm_{0}_title.pkl".format(tag))
        self.sc_path = os.path.join(
            self._MODEL_DIR, "sc_{0}.pkl".format(tag))
        self.sc_title_path = os.path.join(
            self._MODEL_DIR, "sc_{0}_title.pkl".format(tag))

        try:
            self.classifier = joblib.load(self.classifier_path)
            self.classifier_title = joblib.load(self.classifier_title_path)
            self.sc = joblib.load(self.sc_path)
            self.sc_title = joblib.load(self.sc_title_path)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt / SystemExit like a bare ``except`` would.
            print("!!!\n\tNo learned model\n!!!")

    @staticmethod
    def _features(block):
        """Return the feature row ``[x, y, w, h, fontsize]`` for ``block``."""
        return [block.x, block.y, block.w, block.h, block.fontsize]

    def setblocklist(self, blocklist):
        """Store the blocks that ``extractcontent`` will classify.

        Args:
            blocklist: sequence of objects exposing ``x``, ``y``, ``w``,
                ``h``, ``fontsize``, ``type`` and ``content`` attributes.
        """
        self.BlockList = blocklist

    def extractcontent(self):
        """Classify the stored blocks and return the extracted content.

        Returns:
            A tuple ``(title, text, image)`` of lists holding the ``content``
            of the blocks recognised as titles, as ``"text"`` blocks and as
            ``"img"`` blocks. Candidates of any other type are dropped.

        The intermediate results are kept on the instance as ``inputs``,
        ``pred``, ``title_inputs``, ``contents`` and ``pred_title``.
        If there are no blocks, or no block passes the first stage, the
        classifiers are not called and three empty lists are returned.
        """
        self.inputs = [self._features(block) for block in self.BlockList]
        self.title_inputs = []
        self.contents = []
        self.pred = []
        self.pred_title = []
        self.title = []
        self.image = []
        self.text = []

        # Nothing to classify: skip transform/predict on an empty batch.
        if not self.inputs:
            return self.title, self.text, self.image

        # Stage 1: keep only the blocks labelled 1 as content candidates.
        self.pred = self.classifier.predict(self.sc.transform(self.inputs))
        for block, row, label in zip(self.BlockList, self.inputs, self.pred):
            if label == 1:
                self.title_inputs.append(list(row))
                self.contents.append([block.type, block.content])

        # No candidate survived stage 1: skip the second empty batch too.
        if not self.title_inputs:
            return self.title, self.text, self.image

        # Stage 2: among the candidates, label 1 marks a title.
        self.pred_title = self.classifier_title.predict(
            self.sc_title.transform(self.title_inputs))
        for (kind, payload), label in zip(self.contents, self.pred_title):
            if label == 1:
                self.title.append(payload)
            elif kind == "text":
                self.text.append(payload)
            elif kind == "img":
                self.image.append(payload)
        return self.title, self.text, self.image