# -*- coding:utf-8 -*-
import os
import sys
# Make this file's directory and up to three ancestor directories importable.
_here = os.path.dirname(os.path.abspath(__file__))
for _ in range(4):
    sys.path.append(_here)
    _here = os.path.dirname(_here)
import scrapy
import re
import requests
from scrapy.http.response.text import TextResponse
from scrapy.http.response import Response
import urllib.parse
from user_agent import generate_user_agent
from myselector import Selector as S
import json
import logging
from math import ceil
import time
from RedisHelp import _Request
from functools import wraps
from json import JSONDecodeError
import execjs
conn_flag = False  # False: use the local database; True: use the company database
start_time = time.time()
proxy_flag = True
# `_configs` is the unified configuration for the fields scraped from each page.
# Sleep helper; may be called from inside check_response.
def sleep_flag():
    # During the last 5 minutes of every 20-minute window, pause for 5 seconds.
    if (time.time() - start_time) % (20 * 60) > (15 * 60):
        print('start_sleep')
        time.sleep(5)
        print('continue')
    else:
        print('pass')
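# A worked example of the window above (an illustration, assuming start_time = 0):
# at t = 1000 s, (t - start_time) % 1200 = 1000 > 900, so the 5-second sleep fires;
# at t = 1300 s, 1300 % 1200 = 100 <= 900, so the call just prints 'pass'.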
# Module logger.
logger = logging.getLogger(__name__)
# Extract the bare IP from a proxy URL (strips scheme and port).
_ip = re.compile(r'//(.*?):')
# Global page size used by some listing pages.
PAGESIZE = 100
# Aliases so that JS-style JSON text (bare true/false) can be evaluated.
true = 'true'
false = 'false'
# Maximum number of retries per request.
MAX_TIMES = 10
# Remove an unusable proxy from the proxy pool.
def delete_proxy(response: Response):
ip = _ip.search(response.meta['proxy']).group(1)
requests.get(
"http://10.1.18.35:8000/delete?ip={}".format(ip),
allow_redirects=False)
requests.get(
"http://10.1.18.35:8000/delete?types=2", allow_redirects=False)
# Item wrapping a scraped record together with its persistence metadata.
class Item(scrapy.Item):
    # define the fields for your item here like:
    result = scrapy.Field()  # the scraped result dict
keys = scrapy.Field()
db = scrapy.Field()
conn = scrapy.Field()
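# A sketch of a populated Item (field names as assigned in item_db_parse below;
# the concrete values here are hypothetical):
#   item = Item()
#   item['result'] = {'title': '...', 'url': '...'}  # parsed field -> value
#   item['db'] = 'target_table'                      # destination table
#   item['keys'] = ['url']                           # dedup / primary keys
#   item['conn'] = False                             # which connection to use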
class SpiderHelp(object):
    # Regex of body fragments that mark a response as failed.
    faile_respon = r'seturl|没有查询到相关结果[\s\S]*没有查询到相关结果|The proxy server received an invalid|UncategorizedSQLException|访问过于频繁'
def __init__(self,**kwargs):
pass
@property
def default_header(self)->dict:
return {
'User-Agent': generate_user_agent(os=('win',),navigator=('chrome', 'firefox',)),
'Content-Type': 'application/x-www-form-urlencoded',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*'
}
def header_update(self,headers,**kwargs):
headers.update(kwargs)
return headers
    # Default JSON request headers.
@property
def default_jheader(self)->dict:
return {
'User-Agent': generate_user_agent(os=('win',),navigator=('chrome', 'firefox',)),
'Content-Type': 'application/json',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*'
}
    # Remove the proxy bound to this response from the pool.
    def delete_proxy(self, response: Response):
        '''
        @params response
        Purpose: delete the proxy recorded in response.meta from the pool.
        @output None
        '''
if response.meta.get('proxy'):
ip = _ip.search(response.meta['proxy']).group(1)
requests.get("http://10.1.18.35:8000/delete?ip={}".format(ip))
    # Type-based argument lookup.
    def get_instance(self, args: tuple or list, cls):
        '''
        @params args: argument list
        @params cls: class
        @output i -> the first i with isinstance(i, cls) == True
        '''
        if args:
            for i in args:
                if isinstance(i, cls):
                    return i
    def _get_instance(self, kwargs: dict, cls):
        '''
        @params kwargs: argument dict
        @params cls: class
        @output the first value v in kwargs.values() with isinstance(v, cls) == True
        '''
        if kwargs:
            _list = list(kwargs.values())
            return self.get_instance(_list, cls)
def getfinstance(self, args, kwargs, cls):
'''
@params cls -->type or tuple of types
'''
flag = self.get_instance(args, cls)
return flag if flag else self._get_instance(kwargs, cls)
    # Decide whether a response should be treated as a failed page.
    def response_failed(self, response: Response) -> bool:
        '''
        @output True --> resend the request
        @output False --> valid response, passed on to parse
        '''
        if response.status in [301, 403, 404, 502, 500, 504, 407]:
            self.delete_proxy(response)
            return True
        elif re.compile(self.faile_respon, re.I | re.S).search(response.text):
            return True
        else:
            return False
    def set_faile_reason(self, value=r'seturl|没有查询到相关结果[\s\S]*没有查询到相关结果|The proxy server received an invalid|UncategorizedSQLException|访问过于频繁'):
        self.faile_respon = value
    # Rebuild the request for a retry, giving up after MAX_TIMES attempts.
    def new_request(self, response):
__meta = response.meta.copy()
try_time = __meta.get('try_time', 0) + 1
if try_time > MAX_TIMES:
logger.warning('TRY MORE THAN %s TIMES ON %s' % (try_time,response.url))
else:
__meta['try_time'] = try_time
logger.debug('RESPONSE IS INVALID,TRY %s TIME ON %s' % (try_time - 1,response.url))
yield response.request.replace(dont_filter=True, meta=__meta)
def response_try(self, response):
if self.response_failed(response):
return self.new_request(response)
    # Logging / retry decorator for parse callbacks.
    @staticmethod
    def check_response(func: callable):
        '''
        @params func: the decorated callback
        '''
        @wraps(func)
        def decorate(self, *args, **kwargs):
            if not callable(func):
                raise TypeError('<class %s is not a callback func>' %
                                (func.__class__))
            response = self.getfinstance(args, kwargs,
                                         (TextResponse, Response))
            logger.info('<url:%s body:%s status:%s proxy:%s>' %
                        (response.url, response.request.body, response.status,
                         response.meta.get('proxy')))
            if self.response_failed(response):
                # new_request is a generator function: re-yield its retry
                # requests instead of returning the generator object, which
                # scrapy would silently discard.
                yield from self.new_request(response)
                return
            result = func(self, *args, **kwargs)
            if result:
                for i in result:
                    yield i
        return decorate
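    # A minimal usage sketch (hypothetical subclass, not part of this module):
    #   class MySpider(SpiderHelp, scrapy.Spider):
    #       @SpiderHelp.check_response
    #       def parse(self, response):
    #           yield from self.item_parse(self._configs, response)
    # check_response logs every response, retries failed ones via new_request,
    # and otherwise forwards the response to the wrapped callback.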
    def item_parse(self, _configs: list, response, response1=None) -> dict:
        '''
        @params _configs: list of field-extraction configs
        @params response: Response
        @output --> dict mapping field name to extracted value
        '''
if hasattr(response,'url'):
response1 = response
for configs in _configs:
response_change = self.change_response_f_type(configs, response)
if configs['list']['v']:
_response_copy = S.select_content(response_change,
configs['list'], response1) or []
else:
if isinstance(response_change, list):
_response_copy = response_change
else:
_response_copy = [response_change]
for _response in _response_copy:
if not _response:
return
result = dict()
for config in configs['data']:
result[config['En']] = S.select_content(
_response, config, response1)
result[config['En']] = S.replace_all(result[config['En']])
item = self.item_db_parse(configs, result)
if item:
                    # Keep a persistent count of emitted items.
self.state['items_count'] = self.state.get(
'items_count', 0) + 1
yield item
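    # A sketch of the `_configs` shape consumed above, inferred from the access
    # patterns in item_parse/item_db_parse (key names are real, values hypothetical):
    #   _configs = [{
    #       'response_type': 'json',  # optional; switches to JSON decoding
    #       'list': {'v': ..., 'check': 'title', 'db': 'tb', 'keys': ['url']},
    #       'data': [{'En': 'title', ...}, {'En': 'url', ...}],
    #   }]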
    def item_db_parse(self, configs, result):
        '''
        @params configs: dict carrying keys, db and check; check must be a key of result
        @params result: dict of parsed field values
        '''
        check = configs['list']['check']
        if not result[check]:  # required-field check: bail out on None or ''
            return
item = Item()
item['result'] = result
item['db'] = configs['list']['db']
item['keys'] = configs['list']['keys']
item['conn'] = configs['list'].get('conn',False)
return item
    def change_response_f_type(self, configs, response):
        if configs.get('response_type') == 'json':
            try:
                try:
                    v = json.loads(response.text)
                except:
                    # Fall back to execjs for JS-style (unquoted) JSON.
                    v = execjs.eval(response.text)
                return v
            except BaseException as e:
                logger.info('jsonDecoder error %r' % e)
        else:
            return response
    # Pagination helper.
    def scrapy_page_help(self,
                         response: Response,
                         config: dict = None,
                         callback: callable = None,
                         headers: dict = None,
                         urlfunc: callable = None,
                         bodyfunc: callable = None,
                         divmod: int = 1,
                         response_type: 'xpath' or 'json' = 'xpath',
                         method: 'GET' or 'POST' = 'GET',
                         flag=False,  # True: next-page paging; False: generate all pages up front
                         pagestart=1,  # starting page number
                         redis_flag=False,
                         redis_conn=None,
                         errback=None,
                         cookies=None,
                         offset=1,
                         meta={},
                         readpage=128,  # number of pages generated per batch
                         connect_type: 'urlencode' or 'json' = 'urlencode') -> scrapy.Request:
        '''
        @params response: the response passed to parse
        @params config: how to extract the total count, via S.select_content
        @callback: request callback
        @headers: defaults to urlencoded headers
        @urlfunc: usually a lambda building the page URL
        @connect_type: decides how the body is encoded
        @response_type: decides how parameters are extracted
        @method: request method
        @divmod: divisor applied to the total to compute totalpage
        @bodyfunc: usually a lambda building the request body
        return [Requests]
        '''
_pagestart = response.meta.get('pagestart') or pagestart
_offset = response.meta.get('offset') or offset
page = response.meta.get('page') or 1
dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
        if not response.meta.get('totalpage'):
            if response_type.lower() == 'json':
                try:
                    JS_response = json.loads(response.text)
                except:
                    if hasattr(response, 'text'):
                        # Fall back to execjs for JS-style JSON.
                        JS_response = execjs.eval(response.text)
                    elif isinstance(response, (dict, list)):
                        JS_response = response
                    else:
                        JS_response = {}
            else:
                JS_response = response
        else:
            JS_response = response
reqs = set()
        logger.debug('paging %s' % response.url)
        # Read the max page count and generate requests in batches of
        # `readpage` pages at a time, to keep memory bounded.
        if not flag:
            if response.meta.get('totalpage'):
                totalpage = response.meta['totalpage']
            else:
                total = S.select_content(JS_response, config, response)
                totalpage = ceil(int(total) / divmod) if total else 1
if page < totalpage and not flag:
_readpage = readpage * _offset
pagestart = _pagestart % _readpage
if page % _readpage == pagestart:
minpage = min(page + _readpage,totalpage)
logger.info('from %s to %s,totalpage is %s' % (page+1,minpage,totalpage))
for page in range(page + _offset, minpage + _offset, _offset):
if callable(bodyfunc):
body = bodyfunc(page, response=response)
if isinstance(body, str):
pass
else:
body = dataencode(body)
else:
body = None
if callable(urlfunc):
url = urlfunc(page,response=response)
else:
url = response.url
_meta = response.meta.copy()
_meta.update({'page': page,
'pagestart':_pagestart,
'totalpage': totalpage,
'offset':_offset})
req = _Request(
url,
method=method,
body=body,
headers=headers,
redis_flag=redis_flag,
redis_conn=redis_conn,
errback=errback,
cookies=cookies,
meta=_meta,
callback=callback)
reqs.add(req)
            # Overshoot: the current page exceeds totalpage, so backfill the
            # missing pages (mirror of the branch above).
            elif page > totalpage and not flag:
_readpage = readpage * _offset
pagestart = _pagestart % _readpage
if page % _readpage == pagestart:
minpage = max(page-_readpage,totalpage)
logger.info('from %s to %s,totalpage is %s' % (page,minpage,totalpage))
for page in range(minpage, page):
if callable(bodyfunc):
body = bodyfunc(page, response=response)
if isinstance(body, str):
pass
else:
body = dataencode(body)
else:
body = None
if callable(urlfunc):
url = urlfunc(page,response=response)
else:
url = response.url
_meta = response.meta.copy()
_meta.update({'page': page,
'pagestart':_pagestart,
'totalpage': totalpage,
'offset':_offset})
req = _Request(
url,
method=method,
body=body,
headers=headers,
redis_flag=redis_flag,
redis_conn=redis_conn,
errback=errback,
cookies=cookies,
meta=_meta,
callback=callback)
reqs.add(req)
        # Next-page mode: generate the next batch of pages while a next-page marker exists.
elif flag:
if S.select_content(JS_response, config):
_readpage = readpage * _offset
pagestart = _pagestart % _readpage
if page % _readpage == pagestart:
                    logger.info('from %s to %s, totalpage is undefined' % (page + 1, page + _readpage))
for _page in range(page + 1, page+_readpage+1):
if callable(urlfunc):
url = urlfunc(_page,response=response)
else:
url = response.url
if callable(bodyfunc):
body = bodyfunc(_page, response=response)
if isinstance(body, str):
pass
else:
body = dataencode(body)
else:
body = None
_meta = response.meta.copy()
_meta.update({'page': _page,
'pagestart':_pagestart,
'offset':_offset})
req = _Request(
url,
method=method,
body=body,
headers=headers,
meta=_meta,
redis_flag=redis_flag,
redis_conn=redis_conn,
callback=callback,
errback=errback)
reqs.add(req)
            else:
                # No next-page marker found: dump the body for offline debugging.
                with open('1.html', 'wb') as f:
                    f.write(response.body)
return reqs
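    # A minimal call sketch (hypothetical URL; the exact config shape depends
    # on S.select_content) for a JSON listing API that reports a total count
    # and serves 20 records per page:
    #   yield from self.scrapy_page_help(
    #       response,
    #       config=...,  # selector config pointing at the total count
    #       callback=self.parse,
    #       urlfunc=lambda page, response=None: 'http://example.com/list?page=%s' % page,
    #       divmod=20,
    #       response_type='json')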
    # Generate detail-page requests from URLs extracted via config.
    def scrapy_info_url_help(self,
                             response: Response,
                             config: dict = None,
                             callback: callable = None,
                             errback=None,
                             headers: dict = None,
                             urlfunc: callable = None,
                             bodyfunc: callable = None,
                             divmod: int = 1,
                             meta=None,
                             priority=100,
                             redis_flag=False,
                             redis_conn=None,
                             dont_filter=False,
                             response_type: 'xpath' or 'json' = 'xpath',
                             method: 'GET' or 'POST' = 'GET',
                             flag=False,  # True: next-page paging; False: generate all pages up front
                             pagestart=1,  # starting page number
                             connect_type: 'urlencode' or 'json' = 'urlencode') -> scrapy.Request:
        '''
        @params response: the response passed to parse
        @params config: how to extract the target URLs, via S.select_content
        @callback: request callback
        @headers: defaults to urlencoded headers
        @urlfunc: usually a lambda building the URL
        @connect_type: decides how the body is encoded
        @response_type: decides how parameters are extracted
        @method: request method
        @divmod: divisor applied to the total to compute totalpage
        @bodyfunc: usually a lambda building the request body
        return [Requests]
        '''
dataencode = urllib.parse.urlencode if connect_type == 'urlencode' else json.dumps
if response_type.lower() == 'json':
try:
JS_response = json.loads(response.text)
except:
JS_response = execjs.eval(response.text)
else:
JS_response = response
reqs = set()
urls = S.select_content(JS_response, config, response)
if isinstance(urls, list):
pass
else:
urls = [urls]
for page in urls:
if not page:
return []
if callable(bodyfunc):
body = bodyfunc(page, response=response)
if isinstance(body, str):
pass
else:
body = dataencode(body)
else:
body = None
if callable(urlfunc):
if isinstance(page,tuple):
url = urlfunc(*page,response=response)
else:
url = urlfunc(page,response=response)
else:
url = response.url
_meta = response.meta.copy()
meta = meta if meta else {}
_meta.update(meta)
req = _Request(
url,
method=method,
body=body,
headers=headers,
meta=_meta,
priority=priority,
                redis_flag=redis_flag,
                redis_conn=redis_conn,
dont_filter=dont_filter,
callback=callback,
errback=errback)
reqs.add(req)
return reqs
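    # A minimal call sketch (hypothetical selector and callback; the exact
    # config shape depends on S.select_content): extract each detail URL from
    # the listing page and request it.
    #   yield from self.scrapy_info_url_help(
    #       response,
    #       config=...,  # selector config pointing at the detail URLs
    #       callback=self.parse_detail,
    #       urlfunc=lambda href, response=None: urllib.parse.urljoin(response.url, href))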
    def request(self, url, callback=None, dont_filter=False,
                method='GET', cookies=None,
                headers=None, priority=0, meta=None, encoding='utf-8', body=None,
                redis_flag=False, redis_conn=None):
        callback = callback if callback else self.parse
        headers = headers if headers else self.default_header
        if redis_flag:
            return _Request(url, callback=callback, dont_filter=dont_filter, body=body, method=method, cookies=cookies,
                            headers=headers, priority=priority, meta=meta, encoding=encoding, redis_flag=redis_flag, redis_conn=self.r)
        else:
            return _Request(url, callback=callback, dont_filter=dont_filter, body=body, method=method, cookies=cookies,
                            headers=headers, priority=priority, meta=meta, encoding=encoding)
    @classmethod
    def put_redis(cls, *args, **kwargs):
        if hasattr(cls, '_start_requests'):
            reqs = cls(*args, **kwargs)._start_requests()
            # Exhaust the generator so every request gets pushed into redis.
            for req in reqs:
                pass
        else:
            raise AttributeError('<class object %s> has no attribute _start_requests' % cls.__name__)
def errbackparse(self, failure):
logger.error(failure)
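if __name__ == '__main__':
    # Small smoke test (a sketch: exercises only the pure-Python pieces, not a crawl).
    helper = SpiderHelp()
    print(helper.default_header)                       # randomized desktop UA headers
    print(_ip.search('http://1.2.3.4:8888').group(1))  # -> '1.2.3.4'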