-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprescrape_ffn_users.py
More file actions
executable file
·136 lines (109 loc) · 3.29 KB
/
prescrape_ffn_users.py
File metadata and controls
executable file
·136 lines (109 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
import sys
import time
import random
import psycopg2
from typing import List, Set
from oil import oil
from weaver import WebScraper, Web
import weaver.enc as enc
from minerva import FFNUser
def plog(msg: str, fname: str = "./pffnu.log") -> None:
    """Persist a progress message: append it to the log file, then echo to stdout."""
    with open(fname, 'a') as logFile:
        logFile.write(f"{msg}\n")
    print(msg)
def extractFFNUserDeathCode(html: str) -> int:
    """Classify a scraped FFN user page.

    Returns 0 for a live profile (the content wrapper is present), a
    death code (0 = probably never created, 1 = probably deleted) when a
    known warning message is found, and -1 when the page is
    unrecognizable and should be retried later.
    """
    # Cheap substring check first; avoids parsing for the common live case.
    if 'id=content_wrapper' in html:
        return 0
    deadMarkers = [
        # probably never created
        ('User does not exist or is no longer an active member.', 0),
        # probably deleted
        ('User is no longer an active member.', 1),
    ]
    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(html, 'html5lib')
    if soup.find(id='content_wrapper') is not None:
        return 0
    # story might've been deleted
    for warning in soup.find_all('span', {'class': 'gui_warning'}):
        for markerText, deathCode in deadMarkers:
            if warning.get_text() == markerText:
                return deathCode
    # might be in the abbreviated page message box
    for note in soup.find_all('span', {'class': 'gui_normal'}):
        for markerText, deathCode in deadMarkers:
            if markerText in note.get_text():
                return deathCode
    return -1  # retry later
def getUrl(uid: int) -> str:
    """Return the public fanfiction.net profile URL for a user id."""
    return "https://www.fanfiction.net/u/" + str(uid)
def prescrapeUid(db: 'psycopg2.connection', scraper: WebScraper, uid: int
		) -> None:
	"""Scrape a single FFN user profile page and record its death code.

	Fetches (or reuses a cached fetch of) the user's page, decodes it,
	and -- when the page shows the user is gone (code != 0) -- buries the
	user via FFNUser.bury with the scrape timestamp. Pages with unknown
	encoding are logged and skipped.
	"""
	plog(f"prescraping uid {uid}")
	url = getUrl(uid)
	w = scraper.softScrape(url)
	dec = enc.decode(w.response, url)
	if dec is None:
		# fix: this message was missing the f-prefix and logged the
		# literal text "{uid}" instead of the actual user id
		plog(f" {uid} has unknown encoding")
		return
	html = dec[1]
	code = extractFFNUserDeathCode(html)
	if code != 0:
		plog(f" {uid} is freshly dead: {code}")
		FFNUser.bury(db, uid, code, w.created)
def prescrapeUidBlock(db: 'psycopg2.connection', scraper: WebScraper,
		start: int, end: int, stripeCount: int, stripe: int,
		minId: int, maxId: int) -> None:
	"""Prescrape every uid in [start, end) that belongs to this stripe.

	Consults the web cache up front: when every stripe uid in the block
	is already cached the whole block is skipped. Otherwise the uids are
	visited in random order, clamped to [minId, maxId], and only
	uncached ones are scraped.
	"""
	uids = [u for u in range(start, end) if u % stripeCount == stripe]
	cached = Web.wcache(db, [getUrl(u) for u in uids])
	random.shuffle(uids)
	if all(getUrl(u) in cached for u in uids):
		plog(f"skipping block [{start}, {end})")
		return
	plog(f"prescraping block [{start}, {end})")
	for u in uids:
		if not (minId <= u <= maxId):
			continue
		if getUrl(u) not in cached:
			prescrapeUid(db, scraper, u)
# 2019-08-27
# Rough bounds of the FFN user id space as of the date above.
minId = 1
maxId = 12677500 # roughly
blockSize = 1000
# Striping splits the id space across concurrent instances: this process
# only handles uids where uid % stripeCount == stripe.
stripeCount = None
stripe = None
if len(sys.argv) > 1:
	stripeCount = int(sys.argv[1])
if len(sys.argv) > 2:
	stripe = int(sys.argv[2])
# Both positional args are mandatory.
if stripe is None or stripeCount is None:
	raise Exception("expected stripeCount stripe")
stripe %= stripeCount
# Scale the block span so each block still yields ~1000 uids per stripe.
blockSize *= stripeCount
plog(f"stripeCount: {stripeCount}")
plog(f"stripe: {stripe}")
# Block indices covering [minId, maxId]; starts up to 20 blocks before
# minId's block (clamped at 0) -- presumably deliberate slack below the
# minimum id; confirm against how minId is maintained.
blockIdxs = [idx for idx in range(max(int(minId / blockSize) - 20, 0), int(maxId / blockSize) + 1)]
# Randomize visiting order so parallel runs don't pile up on the same range.
random.shuffle(blockIdxs)
plog(f"block size: {blockSize}")
plog(f"total blocks: {len(blockIdxs)}")
with oil.open() as db:
	scraper = WebScraper(db)
	plog('==========')
	plog(f"source: {scraper.source.__dict__}")
	for idx in blockIdxs:
		prescrapeUidBlock(db, scraper, idx * blockSize, (idx + 1) * blockSize,
			stripeCount, stripe, minId, maxId)
plog("prescraped all of our blocks")
# Idle forever after finishing -- NOTE(review): presumably to keep a
# process supervisor from restarting the scrape; confirm.
while True:
	time.sleep(600)