-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_story_content.py
More file actions
executable file
·154 lines (127 loc) · 3.85 KB
/
process_story_content.py
File metadata and controls
executable file
·154 lines (127 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
import sys
import math
import traceback
import psycopg2
from typing import Any, List
from bs4 import BeautifulSoup # type: ignore
from oil import oil
import oil.util as util
from weaver import Web
import weaver.enc as enc
from minerva import FFNFicContent, extractFFNDeathCode
# Kind of processing this script performs; it is embedded in the log file name.
processType = 'content'

# Only pages whose URL starts with this prefix are treated as story chapters.
storyUrlPrefix = 'https://www.fanfiction.net/s/'

# Default log path; main() swaps in a per-stripe name when striping is used.
logFileName = './process_story_' + processType + '.log'
def plog(msg: str) -> None:
    """Print ``msg`` and pass it to ``util.logMessage`` for the current log.

    The target file is whatever the module-level ``logFileName`` holds at the
    time of the call, so a later reassignment of that name is picked up.
    """
    print(msg)
    util.logMessage(msg, logDir='./', fname=logFileName)
def extractContent(html: str) -> str:
    """Extract the raw story markup from a chapter page.

    The page is cut into fragments (one per line, with a break after every
    ``>``). Fragments are collected starting at the one that carries the
    ``storytext`` id and stopping at the chapter selector or the first
    ``<script`` tag. Trailing navigation-button fragments are dropped and the
    rest is joined with single spaces.

    Args:
        html: Decoded HTML of the chapter page.

    Returns:
        The collected fragments joined by single spaces, or an empty string
        when no ``storytext`` fragment is present.

    Raises:
        Exception: If the page contains "chapter not found." and has no
            ``storytext`` marker at all.
    """
    lowered = html.lower()
    # Recognise both quoting styles, exactly like the scan below does. With
    # only the single-quoted form checked, a valid double-quoted page whose
    # story text happened to contain "chapter not found." was rejected.
    hasStoryText = "id='storytext" in lowered or 'id="storytext' in lowered
    if 'chapter not found.' in lowered and not hasStoryText:
        raise Exception('unable to find chapter content')

    fragments = html.replace('\r', '\n').replace('>', '>\n').split('\n')
    parts: List[str] = []
    inStory = False
    for line in fragments:
        if "id='storytext'" in line or 'id="storytext"' in line:
            inStory = True
        if inStory:
            # The chapter selector or a script tag marks the end of the story.
            if 'SELECT id=chap_select' in line or '<script' in line.lower():
                break
            parts.append(line)

    # Strip navigation buttons that trail the story text.
    trailingButtons = ('< Prev</button', '<button class=btn TYPE=BUTTON')
    while parts and parts[-1].startswith(trailingButtons):
        parts.pop()
    return ' '.join(parts)
def handleStoryPage(db: 'psycopg2.connection', w: Web, stripeCount: int,
        stripe: int) -> None:
    """Extract and store the chapter content of one scraped story page.

    The page is skipped silently when its URL is not under
    ``storyUrlPrefix``, when its story id does not fall into ``stripe``, or
    when ``extractFFNDeathCode`` reports a non-zero code for it.

    Args:
        db: Database handle passed through to ``FFNFicContent.upsert``.
        w: Scraped page; its ``url``, ``created`` and ``id`` must not be None.
        stripeCount: Number of stripes the story ids are divided into.
        stripe: Stripe handled by this run; only stories with
            ``fid % stripeCount == stripe`` are processed.

    Raises:
        ValueError, IndexError: If the URL does not carry integer story and
            chapter ids after the prefix.
        Exception: If the response cannot be decoded, or re-raised from
            extraction/upsert after the page was dumped to
            ``./edump_<fid>_<cid>.html``.
    """
    assert(w.url is not None and w.created is not None and w.id is not None)
    if not w.url.startswith(storyUrlPrefix):
        return

    # The part after the prefix starts with <story id>/<chapter id>.
    pathParts = w.url[len(storyUrlPrefix):].split('/')
    fid = int(pathParts[0])
    cid = int(pathParts[1])
    if fid % stripeCount != stripe:
        return

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    # A non-zero death code means there is no content to store for this page.
    if extractFFNDeathCode(html) != 0:
        return

    try:
        # try to grab just the story content
        content = extractContent(html)
        FFNFicContent.upsert(db, fid, cid, w.id, content, stripe)
    except Exception:
        plog(f"{w.url} is broken")
        # Log the traceback before dumping so that a failing dump cannot
        # hide the original error.
        plog(traceback.format_exc())
        # Explicit encoding keeps the dump from failing on non-UTF-8 locales.
        with open(f"./edump_{fid}_{cid}.html", 'w', encoding='utf-8',
                errors='replace') as f:
            f.write(html)
        raise
def handlePage(db: 'psycopg2.connection', w: Web, stripeCount: int,
        stripe: int) -> None:
    """Route a scraped page to the handler that matches its URL.

    Only story pages (URLs under ``storyUrlPrefix``) have a handler; any
    other page is ignored.
    """
    assert(w.url is not None)
    isStoryPage = w.url.startswith(storyUrlPrefix)
    if not isStoryPage:
        return
    handleStoryPage(db, w, stripeCount, stripe)
def main(db: 'psycopg2.connection') -> None:
    """Walk the stored pages in id order and persist story chapter content.

    Command line: ``[start [stripeCount stripe]]``. ``start`` is the first
    page id to look at (default 0). With ``stripeCount``/``stripe`` given,
    only stories whose id modulo ``stripeCount`` equals ``stripe`` are
    handled and a stripe-specific log file is used.

    Args:
        db: Database handle used for all queries and writes.

    Raises:
        SystemExit: With status 1 on a wrong argument count or an invalid
            stripe specification.
        ValueError: If a numeric argument is not an integer.
    """
    global logFileName

    argc = len(sys.argv)
    if argc not in {1, 2, 4}:
        print(f"usage: {sys.argv[0]} [start [stripeCount stripe]]")
        sys.exit(1)

    start = int(sys.argv[1]) if argc >= 2 else 0
    stripeCount = 1
    stripe = 0
    if argc == 4:
        stripeCount = int(sys.argv[2])
        stripe = int(sys.argv[3])
        logFileName = f"./process_story_{processType}_{sys.argv[2]}_{sys.argv[3]}.log"

    # A stripeCount below 1 gives a zero/negative block size (endless loop or
    # division by zero); a stripe outside [0, stripeCount) matches no story.
    if stripeCount < 1 or not 0 <= stripe < stripeCount:
        print(f"invalid stripe {stripe} for stripeCount {stripeCount}")
        print(f"usage: {sys.argv[0]} [start [stripeCount stripe]]")
        sys.exit(1)

    plog(f"using log {logFileName}")

    maxId = Web.maxId(db)
    plog(f"maxId: {maxId}")

    # Round up to the next multiple of roundTo that is strictly greater than
    # maxId, using integer arithmetic. The ranges are logged as half-open, so
    # an end equal to maxId would leave the newest page out.
    # NOTE(review): assumes Web.fetchIdRange_g excludes its upper bound as the
    # log line suggests; the larger end is harmless if it is inclusive.
    roundTo = 100
    end = (int(maxId) // roundTo + 1) * roundTo

    plog(f"stripe: {stripe}")
    plog(f"stripeCount: {stripeCount}")
    plog(f"from {start} to {end}")

    blockSize = 1000 * stripeCount

    FFNFicContent.createStripeTable(db, stripe)

    # range() stops before end, so no trailing empty/inverted block is issued.
    for fidx in range(start, end, blockSize):
        eidx = min(fidx + blockSize, end)
        plog(f" doing ids [{fidx}, {eidx})")

        try:
            # One `with db:` per block; presumably this scopes a transaction
            # to the block (psycopg2 connection semantics) - confirm against
            # what oil.open() returns.
            with db:
                for s in Web.fetchIdRange_g(db, fidx, eidx,
                        ulike='https://www.fanfiction.net/s/%/%'):
                    # Pages without a stored response have nothing to parse.
                    if s.response is None or len(s.response) < 1:
                        continue
                    handlePage(db, s, stripeCount, stripe)
        except SystemExit:
            raise
        except BaseException:
            # Record which block failed (also on interrupt) so that a later
            # run can resume from it, then let the error propagate.
            plog(f" trouble in ids [{fidx}, {eidx})")
            raise
# Script entry point: obtain a database handle from oil and run the processor.
if __name__ == '__main__':
    with oil.open() as db:
        main(db)
    # Reaching this point means main() returned normally; exit with status 0.
    sys.exit(0)