-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.py
More file actions
206 lines (183 loc) · 6.76 KB
/
spider.py
File metadata and controls
206 lines (183 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import sys
import time
import pymysql
import json
import traceback
import requests
from selenium.webdriver import Chrome, ChromeOptions
def get_tencent_data():
    """Fetch COVID-19 statistics from Tencent's news API.

    Returns:
        tuple: ``(history, details)`` where
            history: dict keyed by date string ``"YYYY-MM-DD"``, each value
                holding cumulative totals (confirm/suspect/heal/dead) merged
                with that day's increments (``*_add`` keys).
            details: list of per-city rows
                ``[update_time, province, city, confirm, confirm_add, heal, dead]``.
    """
    url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.122 Safari/537.36 "
    }
    # BUG FIX: the second positional parameter of requests.get is `params`,
    # not `headers` — the original call never actually sent the User-Agent.
    r1 = requests.get(url1, headers=headers)
    r2 = requests.get(url2, headers=headers)
    res1 = json.loads(r1.text)
    res2 = json.loads(r2.text)
    # The "data" field is itself a JSON-encoded string, hence the second decode.
    data_all1 = json.loads(res1["data"])
    data_all2 = json.loads(res2["data"])
    history = {}
    for i in data_all2["chinaDayList"]:
        ds = "2020." + i["date"]  # API dates omit the year; assumes 2020
        tup = time.strptime(ds, "%Y.%m.%d")  # parse the date
        ds = time.strftime("%Y-%m-%d", tup)  # reformat as YYYY-MM-DD
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
    for i in data_all2["chinaDayAddList"]:
        ds = "2020." + i["date"]
        tup = time.strptime(ds, "%Y.%m.%d")  # parse the date
        ds = time.strftime("%Y-%m-%d", tup)  # reformat as YYYY-MM-DD
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        # Merge the daily increments into the cumulative entry for the same day.
        history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})
    details = []
    update_time = data_all1["lastUpdateTime"]
    data_country = data_all1["areaTree"]
    # areaTree[0] is China; its children are provinces, whose children are cities.
    data_province = data_country[0]["children"]
    for pro_infos in data_province:
        province = pro_infos["name"]
        for city_infos in pro_infos["children"]:
            city = city_infos["name"]
            confirm = city_infos["total"]["confirm"]
            confirm_add = city_infos["today"]["confirm"]
            heal = city_infos["total"]["heal"]
            dead = city_infos["total"]["dead"]
            details.append([update_time, province, city, confirm, confirm_add, heal, dead])
    return history, details
def get_conn():
    """Open a connection to the local `cov` MySQL database.

    Returns:
        tuple: ``(connection, cursor)``; the caller must close both
        (see :func:`close_conn`).
    """
    connection = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="123456",
        db="cov",
        charset="utf8",
    )
    return connection, connection.cursor()
def close_conn(conn, cursor):
    """Release the cursor and then the connection, tolerating None for either."""
    # Cursor first, then connection — mirrors the acquisition order in reverse.
    for resource in (cursor, conn):
        if resource:
            resource.close()
# Update the per-city detail table with the newest scrape.
def update_details():
    """Insert the latest per-city rows into `details`, skipping when current.

    Compares the incoming batch's timestamp against the newest row already
    stored and only inserts when the data is genuinely newer.
    """
    cursor = None
    conn = None
    try:
        li = get_tencent_data()[1]  # index 1 = latest per-city detail rows
        conn, cursor = get_conn()
        sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s," \
              "%s) "
        # Yields 1 when the incoming timestamp equals the stored maximum.
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, li[0][0])
        if not cursor.fetchone()[0]:
            print(f"{time.asctime()}开始更新数据")
            for item in li:
                cursor.execute(sql, item)
            conn.commit()
            print(f"{time.asctime()}更新到最新数据")
        else:
            print(f"{time.asctime()}已是最新数据!")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
# Bulk-insert the full history (first-time load).
def insert_history():
    """Insert every scraped day into `history` unconditionally.

    Intended for the initial load; use :func:`update_history` afterwards to
    add only missing days.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]  # index 0 = history dict keyed by date
        print(f"{time.asctime()}开始插入历史数据")
        conn, cursor = get_conn()
        sql = "insert into history values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        for k, v in dic.items():
            # .get() tolerates days missing the *_add increments (stores NULL).
            cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
                                 v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
                                 v.get("dead"), v.get("dead_add")])
        conn.commit()
        print(f"{time.asctime()}插入历史数据完毕")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
# Incrementally update the history table.
def update_history():
    """Insert only the days not yet present in `history`.

    Probes each date with a SELECT and inserts the row only when the probe
    returns no match.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]  # index 0 = history dict keyed by date
        print(f"{time.asctime()}开始更新历史数据")
        conn, cursor = get_conn()
        sql = "insert into history values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        sql_query = "select confirm from history where ds=%s"
        for k, v in dic.items():
            # execute() returns the matched row count; 0 means this day is new.
            if not cursor.execute(sql_query, k):
                cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
                                     v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
                                     v.get("dead"), v.get("dead_add")])
        conn.commit()
        print(f"{time.asctime()}历史数据更新完毕")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
# Scrape Baidu's virus-related hot-search list.
def get_baidu_hot():
    """Scrape hot-search entries from Baidu's virus page via headless Chrome.

    Returns:
        list[str]: the text of each hot-search entry.
    """
    option = ChromeOptions()
    option.add_argument("--headless")  # run without a visible browser window
    # BUG FIX: was "--no--sandbox" (doubled dash) — an unknown flag Chrome
    # silently ignores, so sandboxing was never actually disabled.
    option.add_argument("--no-sandbox")
    browser = Chrome(options=option, executable_path="chromedriver-dev.exe")
    url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"
    browser.get(url)
    # Click the "load more" button so all entries are rendered.
    but = browser.find_element_by_css_selector(
        '#ptab-0 > div > div.VirusHot_1-5-3_32AY4F.VirusHot_1-5-3_2RnRvg > section > div')
    but.click()
    # Anti-scraping measure: pause 1s to mimic a human and let content load.
    time.sleep(1)
    c = browser.find_elements_by_xpath('//*[@id="ptab-0"]/div/div[2]/section/a/div/span[2]')
    context = [i.text for i in c]
    browser.close()
    return context
def update_hotsearch():
    """Store the current Baidu hot-search entries in the `hotsearch` table.

    Every run inserts a fresh batch stamped with the current local time.
    """
    cursor = None
    conn = None
    try:
        context = get_baidu_hot()
        print(f"{time.asctime()}开始更新数据")
        conn, cursor = get_conn()
        sql = "insert into hotsearch(dt,content) values(%s,%s)"
        ts = time.strftime("%Y-%m-%d %X")  # one timestamp shared by the batch
        for i in context:
            cursor.execute(sql, (ts, i))
        conn.commit()
        print(f"{time.asctime()}数据更新完毕")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
if __name__ == "__main__":
    # Command-line dispatch: map each sub-command to its updater.
    actions = {
        "up_his": update_history,
        "up_det": update_details,
        "up_hot": update_hotsearch,
    }
    if len(sys.argv) == 1:
        usage = """
        请输入参数
        参数说明,
        up_his 更新历史记录表
        up_hot 更新实时热搜
        up_det 更新详细表
        """
        print(usage)
    else:
        handler = actions.get(sys.argv[1])
        if handler is not None:
            handler()