-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
116 lines (89 loc) · 3.7 KB
/
data.py
File metadata and controls
116 lines (89 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os, io
import requests
import polars as pl
import json
# Dataset info ----
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
REPO = "MGFPKU/target_dataset"
ASSET_NAME = "dataset.xlsx"
LOCAL_DATA: bool = os.getenv("LOCAL_DATA", "FALSE").upper() == "TRUE"
WANTED_COLS = ["Announcement_Year", "Target_Category"]
def promote_second_row_to_header(df: pl.DataFrame) -> pl.DataFrame:
"""Use the first row of `df` as column names and drop that row.
Polars by default uses the first Excel row as column names. If the actual
headers live in the second Excel row (which becomes the first row of the
DataFrame), promote that row to be the DataFrame columns and remove it.
"""
if df.height == 0:
return df
header_vals = [str(v) if v is not None else "" for v in df.row(0)]
df = df.slice(1)
df.columns = header_vals
return df
def fetch_raw_data() -> io.BytesIO:
if LOCAL_DATA:
with open("../CHINA'S NATIONAL CLIMATE TARGETS DATABASE.xlsx", "rb") as f:
print("Using local data...")
return io.BytesIO(f.read())
headers = {
"Authorization": f"Bearer {GITHUB_TOKEN}",
"Accept": "application/vnd.github+json",
}
# 1️⃣ Get latest release metadata
latest_url = f"https://api.github.com/repos/{REPO}/releases/latest"
res = requests.get(latest_url, headers=headers)
if res.status_code != 200:
raise RuntimeError(f"Failed to fetch file: {res.status_code}\n{res.text}")
release = res.json()
# 2️⃣ Find the asset
asset = next((a for a in release["assets"] if a["name"] == ASSET_NAME), None)
if asset is None:
raise RuntimeError("dataset.xlsx not found in latest release.")
asset_id = asset["id"]
# 3️⃣ Download asset binary
download_headers = {
"Authorization": f"Bearer {GITHUB_TOKEN}",
"Accept": "application/octet-stream",
}
download_url = f"https://api.github.com/repos/{REPO}/releases/assets/{asset_id}"
file_res = requests.get(download_url, headers=download_headers)
if file_res.status_code != 200:
raise RuntimeError(f"Failed to download file:\n{file_res.text}")
# 4️⃣ Load Excel into Polars
return io.BytesIO(file_res.content)
def get_sheet_names() -> tuple[list[str], str]:
with open("sheets.json", "r", encoding="utf-8") as f:
dicts: dict = json.load(f)
sheet_names: list[str] = dicts.get("sheets", [])[0]
if not sheet_names:
raise RuntimeError("No sheet names found in sheets.json")
source_sheet: str = dicts.get("source", "")
if source_sheet == "":
raise RuntimeError("No source sheet specified in sheets.json")
return sheet_names, source_sheet
def get_data() -> pl.DataFrame:
raw_xlsx = fetch_raw_data()
sheet_names, source_sheet = get_sheet_names()
source_sheet = pl.read_excel(raw_xlsx, sheet_name=source_sheet)
source_sheet = promote_second_row_to_header(source_sheet).select(
["code", "doc_name_en", "doc_name_zh"]
)
combined_sheet: pl.DataFrame | None = None
for sheet_name in sheet_names:
sheet = (
pl.read_excel(raw_xlsx, sheet_name=sheet_name)
.with_columns(pl.all().cast(pl.Utf8))
.filter(pl.col("Count") != "r")
)
sheet = sheet.with_columns(
pl.col("Target_Category").str.replace(r"\s*target$", "", literal=False)
)
sheet = sheet.select(WANTED_COLS)
combined_sheet = (
sheet if combined_sheet is None else pl.concat([combined_sheet, sheet])
)
if combined_sheet is None:
raise RuntimeError(
"No sheets were processed. Check sheets.json and dataset.xlsx"
)
return combined_sheet.fill_null("N/A")