fgoscdata/get_data.py at master · fgosc/fgoscdata · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
import argparse
import logging
import json
from pathlib import Path
import csv
import unicodedata
import re

import requests
from bs4 import BeautifulSoup
import tinysegmenter
# Module-wide TinySegmenter instance; main() takes the last token it yields
# for an item name as that item's default short name.
segmenter = tinysegmenter.TinySegmenter()

# Logging set-up: INFO threshold and a "[LEVEL] message" line format.
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# JSON exports downloaded by main(): items and craft essences (JP data).
item_url = "https://api.atlasacademy.io/export/JP/nice_item.json"
ce_url = "https://api.atlasacademy.io/export/JP/nice_equip.json"

# The data files sit in the same directory as this script.
_SCRIPT_DIR = Path(__file__).resolve().parent
Item_blacklist_file = _SCRIPT_DIR / "item_bl.txt"
shortname_file = _SCRIPT_DIR / "shortname.csv"
CE_blacklist_file = _SCRIPT_DIR / "ce_bl.txt"
CE_gacha_file = _SCRIPT_DIR / "ce_gacha.txt"

# Craft-essence names scraped by get_pages(); main() rebinds this list and
# is_gachaCe() reads it.
ces = []

def parse_page(load_url):
    """Collect craft-essence names from a single news page.

    Only pages whose <title> contains "ピックアップ召喚" are scraped; any
    other page (including one without a <title>) yields an empty list.

    Args:
        load_url: URL of the news page to download.

    Returns:
        list[str]: The text of every ``.gainen_ttl`` element, NFKC-normalized,
        stripped, and with one trailing parenthesized suffix removed.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(load_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    page_title = soup.find('title')
    # find() returns None when the page has no <title>; treat that the same
    # as a page that is not a pickup-summon announcement.
    if page_title is None or "ピックアップ召喚" not in page_title.get_text():
        return []
    # Raw string: "\(" inside a normal literal is an invalid escape sequence.
    # NFKC normalization below turns full-width parentheses into ASCII ones,
    # so this ASCII-only pattern covers both forms.
    trailing_parens = re.compile(r"\([^\(\)]*\)$")
    names = []
    for element in soup.select(".gainen_ttl"):
        text = unicodedata.normalize('NFKC', element.get_text()).strip()
        names.append(trailing_parens.sub("", text).strip())
    return names


def get_pages():
    """Scrape the news index and gather craft-essence names from linked pages.

    Every link matched by ``ul.list_news li a`` on the index page is handed
    to parse_page(). A page that fails to download or parse is logged and
    skipped so that the remaining pages are still processed.

    Returns:
        list[str]: Names from all pages in link order; duplicates are kept.

    Raises:
        requests.RequestException: If the index page itself cannot be
            downloaded (connection error or timeout).
    """
    from urllib.parse import urljoin

    base_url = "https://news.fate-go.jp"
    response = requests.get(base_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    ces = []

    for tag in soup.select('ul.list_news li a'):
        href = tag.get("href")
        if not href:
            # An anchor without href cannot be turned into a URL; skip it
            # instead of failing outside the per-page error handling below.
            continue
        # urljoin resolves root-relative, relative and absolute hrefs alike.
        load_url = urljoin(base_url, href)
        logger.debug(load_url)
        try:
            ces.extend(parse_page(load_url))
        except Exception as e:
            # Best effort: one broken page must not abort the whole scrape.
            logger.exception(e)
    return ces

def is_gachaCe(ce):
    """Tell whether a craft essence is a gacha one.

    Args:
        ce: Craft-essence name to look up.

    Returns:
        bool: True when ``ce`` is contained in the module-level ``ces``.
    """
    return ce in ces


def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with open(path, encoding='UTF-8') as f:
        return [line.strip() for line in f]


def main(args):
    """Update shortname.csv and ce_gacha.txt from the downloaded JSON exports.

    Steps:
      1. Scrape the news site for craft-essence names (stored in the
         module-level ``ces`` that is_gachaCe() reads).
      2. Download the item list and give every not-yet-known, non-blacklisted
         item of a tracked type a default short name (the last token the
         segmenter yields for its name).
      3. Download the craft-essence list; names found by step 1 are appended
         to the gacha list, the remaining eligible ones get an empty short
         name if they have none yet.
      4. Rewrite shortname.csv and ce_gacha.txt.

    Args:
        args: Parsed command-line arguments; not used by this function.

    Raises:
        requests.RequestException: If a download fails, times out, or
            returns an HTTP error status.
    """
    global ces
    ces = get_pages()

    # Items: add (new) items that do not have a short name yet.
    response = requests.get(item_url, timeout=60)
    # Fail on an HTTP error status instead of parsing an error body.
    response.raise_for_status()
    item_list = response.json()

    # The csv module expects its files to be opened with newline="".
    with open(shortname_file, encoding='UTF-8', newline="") as f:
        name2shortname = {row["name"]: row["shortname"]
                          for row in csv.DictReader(f)}

    bl_item = set(_read_stripped_lines(Item_blacklist_file))
    item_types = {"qp", "questRewardQp", "skillLvUp",
                  "tdLvUp", "eventItem", "eventPoint", "dice"}

    for item in item_list:
        if item["type"] not in item_types:
            continue
        name = item["name"]
        if name in bl_item or name in name2shortname:
            continue
        tokens = segmenter.tokenize(name)
        # Fall back to the name itself when the segmenter yields no tokens,
        # so indexing an empty token list cannot raise IndexError.
        name2shortname[name] = tokens[-1] if tokens else name

    # Craft essences
    response = requests.get(ce_url, timeout=60)
    response.raise_for_status()
    ce_list = response.json()

    bl_ces = _read_stripped_lines(CE_blacklist_file)
    gacha_ces = _read_stripped_lines(CE_gacha_file)
    # One set for membership tests instead of concatenating both lists on
    # every iteration; gacha_ces stays a list so the file order is preserved.
    excluded = set(bl_ces) | set(gacha_ces)

    for ce in ce_list:
        if ce["rarity"] <= 2:
            continue
        name = ce["name"]
        stat_growth = ce["atkMax"] - ce["atkBase"] + ce["hpMax"] - ce["hpBase"]
        # Skip entries whose ATK/HP do not grow, except the EXP cards.
        if stat_growth == 0 and not name.startswith("概念礼装EXPカード："):
            continue
        # Excluded craft essences are not loaded.
        if name in excluded:
            continue
        # Exclude craft essences listed in the official site's pickup-summon
        # news, and remember them for the next run.
        if is_gachaCe(name):
            gacha_ces.append(name)
            excluded.add(name)
            continue
        if name not in name2shortname:
            name2shortname[name] = ""

    # newline="" lets the writer's lineterminator ("\n") reach the file as-is.
    with open(shortname_file, "w", encoding='UTF-8', newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["name", "shortname"])
        writer.writerows(name2shortname.items())
    with open(CE_gacha_file, "w", encoding='UTF-8') as f:
        f.write('\n'.join(gacha_ces))


def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Argument strings to parse. None (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()``
            call does.

    Returns:
        argparse.Namespace: Carries a ``loglevel`` attribute that is one of
        'DEBUG', 'INFO' or 'WARNING' ('WARNING' when the option is omitted).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--loglevel',
        choices=('DEBUG', 'INFO', 'WARNING'),
        default='WARNING',
        help='loglevel [default: WARNING]',
    )
    return parser.parse_args(argv)


if __name__ == '__main__':
    # Script entry point: apply the requested log level, then run the update.
    cli_args = parse_args()
    logger.setLevel(cli_args.loglevel)
    main(cli_args)