-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_data.py
More file actions
144 lines (123 loc) · 4.43 KB
/
get_data.py
File metadata and controls
144 lines (123 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
import argparse
import logging
import json
from pathlib import Path
import csv
import unicodedata
import re
import requests
from bs4 import BeautifulSoup
import tinysegmenter
# Tokenizer used by main() to derive a default short name from an item name.
segmenter = tinysegmenter.TinySegmenter()

logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# JSON exports for items and CEs (概念礼装) that main() downloads.
item_url = "https://api.atlasacademy.io/export/JP/nice_item.json"
ce_url = "https://api.atlasacademy.io/export/JP/nice_equip.json"

# All data files sit in the same directory as this script.
_script_dir = Path(__file__).resolve().parent
Item_blacklist_file = _script_dir / "item_bl.txt"
shortname_file = _script_dir / "shortname.csv"
CE_blacklist_file = _script_dir / "ce_bl.txt"
CE_gacha_file = _script_dir / "ce_gacha.txt"

# Names scraped from the news site; assigned in main(), read by is_gachaCe().
ces = []
def parse_page(load_url):
    """Collect CE names listed on a single news page.

    Downloads ``load_url``. When the page ``<title>`` contains
    "ピックアップ召喚", the text of every element matching the
    ``.gainen_ttl`` CSS selector is NFKC-normalized, stripped, and has one
    trailing parenthesized suffix removed.

    Args:
        load_url: URL of the news page to scrape.

    Returns:
        A list of cleaned name strings. The list is empty when the page has
        no ``<title>`` or the title does not contain "ピックアップ召喚".

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps a stalled connection from blocking the run forever.
    response = requests.get(load_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    page_title = soup.find('title')
    # find() returns None when the tag is missing; treat that as "not a match".
    if page_title is None or "ピックアップ召喚" not in page_title.get_text():
        return []

    # Raw string: "\(" inside a plain literal is an invalid escape sequence.
    trailing_paren = re.compile(r"\([^\(\)]*\)$")

    names = []
    for element in soup.select(".gainen_ttl"):
        # NFKC maps compatibility characters such as full-width parentheses
        # to their ASCII forms, so the ASCII-only pattern can match them.
        text = unicodedata.normalize('NFKC', element.get_text())
        names.append(trailing_paren.sub("", text.strip()).strip())
    return names
def get_pages():
    """Gather CE names from every article linked on the news index page.

    Fetches ``https://news.fate-go.jp``, follows each anchor matched by the
    ``ul.list_news li a`` selector, and concatenates the lists returned by
    ``parse_page`` for each linked page. A failure while processing one
    linked page is logged with its traceback and that page is skipped.

    Returns:
        A list of name strings in page order; it may contain duplicates.

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched or the request times out.
    """
    # Imported here so the block brings its own dependency into scope.
    from urllib.parse import urljoin

    base_url = "https://news.fate-go.jp"
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    names = []
    for tag in soup.select('ul.list_news li a'):
        href = tag.get("href")
        if not href:
            # An anchor without href has nothing to follow.
            continue
        # urljoin resolves root-relative, relative and absolute hrefs alike,
        # where plain concatenation only works for root-relative ones.
        load_url = urljoin(base_url, href)
        logger.debug(load_url)
        try:
            names.extend(parse_page(load_url))
        except Exception:
            # Best effort: one broken article must not abort the whole scan.
            logger.exception("failed to parse %s", load_url)
    return names
def is_gachaCe(ce):
    """Return whether ``ce`` was found on a pickup-summon news page.

    Args:
        ce: CE name to look up.

    Returns:
        True when ``ce`` is present in the module-level ``ces`` list
        (assigned by ``main`` from ``get_pages``), False otherwise.
    """
    return ce in ces
def _fetch_json(url):
    """Download ``url`` and return its decoded JSON body."""
    response = requests.get(url, timeout=30)
    # Fail on an HTTP error instead of parsing an error payload as data.
    response.raise_for_status()
    return response.json()


def _read_lines(path):
    """Return every line of the UTF-8 text file ``path``, whitespace-stripped."""
    with open(path, encoding='UTF-8') as f:
        return [line.strip() for line in f]


def main(args):
    """Refresh ``shortname.csv`` and ``ce_gacha.txt`` from online data.

    Steps:
      1. Scrape the news site via ``get_pages`` and store the names in the
         module-level ``ces`` list used by ``is_gachaCe``.
      2. Download the item export and add a short name (last token of the
         tokenized name) for every not-yet-known, non-blacklisted item of
         the selected types.
      3. Download the CE export and add an empty short name for each CE that
         passes the rarity/stat filter and is neither blacklisted nor a
         gacha CE. CEs newly recognized by ``is_gachaCe`` are appended to
         the gacha list instead.
      4. Rewrite ``shortname_file`` and ``CE_gacha_file``.

    Args:
        args: Parsed command-line namespace; not read by this function.

    Raises:
        requests.RequestException: If a download fails, times out, or
            returns an HTTP error status.
        OSError: If one of the data files cannot be read or written.
    """
    global ces
    ces = get_pages()

    # Append new items (id, name) that do not exist in phash yet
    # (shortname.csv, name_alias.csv).
    item_list = _fetch_json(item_url)
    # newline='' is what the csv module documentation asks for on csv files.
    with open(shortname_file, encoding='UTF-8', newline='') as f:
        name2shortname = {
            row["name"]: row["shortname"] for row in csv.DictReader(f)
        }
    bl_item = set(_read_lines(Item_blacklist_file))

    wanted_types = {
        "qp", "questRewardQp", "skillLvUp",
        "tdLvUp", "eventItem", "eventPoint", "dice",
    }
    for item in item_list:
        if item["type"] not in wanted_types:
            continue
        item_name = item["name"]
        if item_name in bl_item or item_name in name2shortname:
            continue
        tokens = segmenter.tokenize(item_name)
        # Guard against an empty token list so [-1] cannot raise IndexError.
        name2shortname[item_name] = tokens[-1] if tokens else ""

    # CEs (概念礼装)
    ce_list = _fetch_json(ce_url)
    bl_ces = _read_lines(CE_blacklist_file)
    gacha_ces = _read_lines(CE_gacha_file)
    # Set mirror of both exclusion lists: constant-time lookups in the loop,
    # while gacha_ces itself keeps the order that is written back to disk.
    excluded = set(bl_ces) | set(gacha_ces)

    for ce in ce_list:
        if ce["rarity"] <= 2:
            continue
        name = ce["name"]
        stat_growth = (
            ce["atkMax"] - ce["atkBase"] + ce["hpMax"] - ce["hpBase"]
        )
        # Skip CEs with zero combined ATK/HP growth, except EXP cards.
        if stat_growth == 0 and not name.startswith("概念礼装EXPカード:"):
            continue
        # Excluded CEs are not loaded.
        if name in excluded:
            continue
        # Exclude CEs that appear in the official site's pickup-summon news.
        if is_gachaCe(name):
            gacha_ces.append(name)
            excluded.add(name)
            continue
        if name not in name2shortname:
            name2shortname[name] = ""

    with open(shortname_file, "w", encoding='UTF-8', newline='') as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["name", "shortname"])
        writer.writerows(name2shortname.items())
    with open(CE_gacha_file, "w", encoding='UTF-8') as f:
        f.write('\n'.join(gacha_ces))
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` whose ``loglevel`` attribute is one of
        'DEBUG', 'INFO' or 'WARNING' (default 'WARNING').
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--loglevel',
        # Upper-case first so that e.g. "--loglevel debug" is accepted too.
        type=str.upper,
        choices=('DEBUG', 'INFO', 'WARNING'),
        default='WARNING',
        help='loglevel [default: WARNING]',
    )
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: apply the requested log level, then run the update.
    cli_args = parse_args()
    logger.setLevel(cli_args.loglevel)
    main(cli_args)