-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdump_community_urls.py
More file actions
executable file
·50 lines (46 loc) · 1.33 KB
/
dump_community_urls.py
File metadata and controls
executable file
·50 lines (46 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
import sys
import psycopg2
from bs4 import BeautifulSoup # type: ignore
from oil import oil
from weaver import WebScraper
import weaver.enc as enc
def getLastPage(db: 'psycopg2.connection', scraper: WebScraper, url: str
		) -> int:
	"""Return the number of listing pages for a fanfiction.net community.

	Scrapes ``url`` via the (rate-limited) ``scraper``, decodes the response,
	and inspects the pagination links inside the ``lc-wrapper`` div. The
	explicit 'Last' link is authoritative when present; otherwise the highest
	page number seen among same-community links is returned. Returns 1 when
	there is no pagination block at all (single-page community).

	``db`` is currently unused here but kept so the signature matches the
	caller's (db, scraper, url) convention.

	Raises Exception when the response body cannot be decoded.
	"""
	w = scraper.softScrape(url)
	dec = enc.decode(w.response, url)
	if dec is None:
		raise Exception(f"uhoh {w.url}")
	html = dec[1]
	soup = BeautifulSoup(html, 'html5lib')
	lcWrap = soup.find('div', { 'class': 'lc-wrapper' })
	if lcWrap is None:
		# No pagination widget: everything fits on one page.
		return 1
	maxSeen = 1
	# Path prefix shared by this community's pagination links, rebuilt from
	# the input url's first three path segments, e.g. '/community/<name>/<id>/'.
	stub = '/'.join([''] + url.split('/')[3:6] + [''])
	for a in lcWrap.findAll('a'):
		href = a.get('href')
		if href is None:
			# Anchor without a target (original code crashed here if such an
			# anchor was also labeled 'Last'); nothing usable, skip it.
			continue
		if href.startswith(stub):
			# Page number is the second-to-last path segment; guard against
			# non-numeric segments instead of crashing in int().
			seg = href.split('/')[-2]
			if seg.isdigit():
				maxSeen = max(maxSeen, int(seg))
		if a.getText().strip() != 'Last':
			continue
		seg = href.split('/')[-2]
		if seg.isdigit():
			return int(seg)
	return maxSeen
# Driver: read community index URLs on stdin, emit one URL per listing page.
with oil.open() as db:
	fetcher = WebScraper(db)
	fetcher.baseDelay = 30  # be polite: 30s base delay between requests
	for rawLine in sys.stdin:
		target = rawLine.strip()
		# Accept only fanfiction.net community URLs, excluding the
		# 'general' category.
		isCommunity = target.startswith('https://www.fanfiction.net/communities/')
		isGeneral = target.startswith('https://www.fanfiction.net/communities/general/')
		if not isCommunity or isGeneral:
			continue
		pageCount = getLastPage(db, fetcher, target)
		# Drop the trailing page-number segment, then re-append each page.
		base = target.split('/')[:-2]
		for pageNo in range(1, pageCount + 1):
			print('/'.join(base + [str(pageNo), '']))