-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathget_ryf.py
More file actions
64 lines (46 loc) · 1.2 KB
/
get_ryf.py
File metadata and controls
64 lines (46 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding:utf-8-*-
import BeautifulSoup
import time
import urllib2
import sys
# Python 2 only: make UTF-8 the interpreter-wide default encoding.
# reload(sys) is called first so that sys.setdefaultencoding is available.
# NOTE(review): presumably needed so the implicit unicode -> str conversions
# performed when writing ryf.md do not raise on non-ASCII text -- confirm.
reload(sys)
sys.setdefaultencoding('utf8')
# First article of the crawl; later pages are reached by following each
# page's <link rel="next"> element.
start_url = 'http://www.ruanyifeng.com/blog/2011/03/china_celebrity_gossips_pre-1949.html'
def get_soup(url):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup.BeautifulSoup`` parse tree built from the page.

    Any error raised by ``urllib2.urlopen`` or by reading the response
    propagates to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        # Close the response explicitly: the crawl opens one per article, and
        # relying on garbage collection would leave connections dangling.
        page = response.read()
    finally:
        response.close()
    return BeautifulSoup.BeautifulSoup(page)
def get_next_url(soup):
    """Return the address of the page that follows the one in *soup*.

    Args:
        soup: Parsed page; must provide ``findAll(name, **attrs)``.

    Returns:
        The ``href`` of the first ``<link rel="next">`` element, or ``None``
        when the page has no such element (i.e. it is the last page).
    """
    links = soup.findAll('link', rel='next')
    if not links:
        # The last article has no "next" link; report that with None instead
        # of raising IndexError so callers can stop cleanly.
        return None
    return links[0].get('href')
def save_content(url):
    """Fetch the article at *url*, append it to ``ryf.md``, return the next URL.

    The page title and the ``<div id="main-content">`` element are written to
    the file in that order, each followed by a ``---`` marker.

    Args:
        url: Address of the article to store.

    Returns:
        Whatever ``get_next_url`` reports for the fetched page.
    """
    page = get_soup(url)
    heading = page.html.title.text
    body = page.findAll('div', id='main-content')[0]
    with open('ryf.md', 'a') as out:
        emit = out.write
        emit(heading)
        emit('---')
        emit(str(body))
        emit('---')
    return get_next_url(page)
def get_ryf(url, delay=5):
    """Crawl forward from *url*, saving every article along the way.

    Each page is stored through ``save_content``; the crawl follows the URL
    that call returns and stops as soon as it returns a falsy value.

    Args:
        url: Address of the first article to save.
        delay: Seconds to wait before each follow-up request (default 5).
    """
    next_url = save_content(url)
    while next_url:
        # Pause before every follow-up request, not just the first one, so
        # the site is not hit with back-to-back downloads.
        time.sleep(delay)
        next_url = save_content(next_url)
        if next_url:
            print(next_url)
    # Reached only once no further page is reported, so the loop always ends.
    print('finished')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_ryf(start_url)
# while next_url:
# time.sleep(5)
# start_url = next_url
# start_page = urllib2.urlopen(start_url).read()
# soup = BeautifulSoup.BeautifulSoup(start_page)
# title = soup.html.title.text
# print title
# next_url = soup.findAll('link', rel='next')[0].get('href')