-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
111 lines (90 loc) · 3.16 KB
/
main.py
File metadata and controls
111 lines (90 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#! /usr/bin/env python3
"""
This module creates a bag of words (BOW) for each existing newspaper article in table NewsArticles.
Each BOW is saved in table NewsArticlesBOW.
A delta mechanism is implemented, i.e., BOWs are created only for new articles.
"""
import configparser
import sys
import os
import logging
from DatabaseHandler import DatabaseHandler
from LemmatizationFilePreprocessing import LemmatizationFilePreprocessing
import argparse
def parse_args(argv=None):
    """
    Build the command-line parser and return the config file path.

    :param argv: optional list of argument strings to parse; when ``None``
        (the default) argparse falls back to ``sys.argv[1:]``.
    :return: the string passed to the mandatory ``-c`` option.

    argparse prints a usage message and raises ``SystemExit`` when ``-c``
    or its value is missing.
    """
    # NOTE(review): description/prog mention a crawler although this script
    # builds bags of words -- possibly copied from another tool; confirm
    # before renaming, since both strings appear in the --help output.
    parser = argparse.ArgumentParser(
        description='Crawls newspages', prog='crawler')
    # Deliberately no nargs='?': with it, a bare "-c" (no value) was accepted
    # and this function returned None instead of a path.
    parser.add_argument(
        '-c',
        type=str,
        dest='config',
        required=True,
        help='path to configfile')
    args = parser.parse_args(argv)
    return args.config
def load_config(config_file):
    """
    Load the configuration from ``config_file`` and return the parser.

    :param config_file: path of the INI-style file to read.
    :return: a ``configparser.ConfigParser`` filled with the file's content.

    Prints the problem and exits the process with status 1 when the file
    cannot be read or parsed.
    """
    cparser = configparser.ConfigParser()
    try:
        files_read = cparser.read(config_file)
    except Exception as exc:
        # Parsing problems (and unusable path values) end up here.
        print(exc)
        sys.exit(1)
    # ConfigParser.read() does not raise for a missing/unopenable file; it
    # skips it and reports only the files it actually parsed. An empty list
    # therefore means nothing was loaded.
    if not files_read:
        print("Could not read config file: " + str(config_file))
        sys.exit(1)
    return cparser
def insert_bow_of_new_articles(limit=10000000):
    """
    Create and store the bag of words (BOW) for articles that have none yet.

    Selects URI and text of every row in NewsArticles whose source_uri is
    not present in NewsArticlesBOW, lemmatizes the text and inserts
    (source_uri, bow) into NewsArticlesBOW.

    The database settings are read from the [DATABASE] section of the
    config file given on the command line (``-c``).

    :param limit: stop after this many articles have been inserted. The
        check is an equality test made after each insert, so a value
        below 1 never matches and all new articles are processed.
    :return: None
    """
    # Switch to the directory of this script; relative paths (including a
    # relative config path given on the command line) resolve from here.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    config_file = parse_args()
    conf = load_config(config_file)

    handler = DatabaseHandler(
        host=conf['DATABASE']['Host'],
        user=conf['DATABASE']['User'],
        password=conf['DATABASE']['Password'],
        db_name=conf['DATABASE']['DB']
    )

    total_number_of_articles = handler.execute(
        """
        SELECT count(*)
        FROM NewsArticles
        """)
    print("There are " + str(total_number_of_articles[0]['count(*)']) + " articles.")

    # Find new URIs for which no BOW exists and get the text along with the URI.
    new_uris = handler.execute(
        """
        SELECT source_uri as 'uri', text
        FROM NewsArticles
        WHERE source_uri
        NOT IN (SELECT source_uri FROM NewsArticlesBOW);
        """)
    number_of_new_articles = len(new_uris)
    print("There are " + str(number_of_new_articles) + " new articles.")

    for counter, entry in enumerate(new_uris, start=1):
        print("Processing article " + str(counter) + " of " + str(number_of_new_articles) + ".")
        print("Processing: " + entry['uri'])

        transformed_result = LemmatizationFilePreprocessing.string_transformation(str(entry['text']))
        bow = ' '.join(transformed_result)

        # Both values are concatenated into the statement as quoted SQL
        # literals, so single quotes in either of them must be doubled;
        # previously only the URI was escaped and a quote inside the BOW
        # broke the INSERT.
        # NOTE(review): if DatabaseHandler.execute supports bound parameters,
        # prefer them over string building -- confirm against its API.
        escaped_uri = entry['uri'].replace("'", "''")
        escaped_bow = bow.replace("'", "''")
        sql_insert_command = "INSERT INTO NewsArticlesBOW (source_uri, bow)" \
                             " VALUES ('" + escaped_uri + "', '" + escaped_bow + "');"
        handler.execute(sql_insert_command)

        if counter == limit:
            return
def main():
    """Set the root logging level to DEBUG and run one BOW insertion pass."""
    logging.basicConfig(level=logging.DEBUG)
    insert_bow_of_new_articles()


# Usage: a config.ini must exist, then run in a console:
#     python main.py -c "config.ini"
if __name__ == "__main__":
    main()