Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions analysis/gold_mlp_prediction_2026-02-25.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"setup": {
"seq_len": 2,
"hidden": 128,
"dropout": 0.2,
"train_frac": 0.7,
"val_frac": 0.1,
"batch_size": 64,
"epochs": 80,
"patience": 20,
"lr": 0.001,
"weight_decay": 0.0001,
"include_indicators": true,
"seed": 42
},
"test_metrics": {
"rmse_price": 0.24380004778602185,
"mae_price": 0.19273792207241058,
"mse_price": 0.05943846330046654,
"r2": -0.3544858694076538,
"direction_accuracy": 0.5454545454545454
},
"baseline_metrics": {
"rmse_price": 6.5826802233882455,
"mae_price": 6.579346187018833,
"mse_price": 43.331678923386725
},
"prediction": {
"predicted_log_close": 7.6717987060546875,
"predicted_close": 2146.939687700816,
"prediction_for_date": "2026-02-25",
"based_on_date": "2026-02-24"
},
"model_path": "analysis/models/gold_mlp_seq2_ep80_20260224.pt"
}
40 changes: 40 additions & 0 deletions analysis/gold_transformer_prediction_2026-02-25.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"setup": {
"seq_len": 3,
"train_frac": 0.7,
"val_frac": 0.1,
"d_model": 60,
"nhead": 5,
"layers": 1,
"dim_ff": 256,
"dropout": 0.2,
"input_dropout": 0.1,
"batch_size": 32,
"epochs": 100,
"patience": 100,
"lr": 0.001,
"weight_decay": 0.0001,
"include_indicators": true,
"seed": 42,
"train_end_date": "2026-02-13",
"prediction_base_date": "2026-02-24"
},
"test_metrics": {
"rmse_price": 0.5291360702329879,
"mae_price": 0.4818355441093445,
"mse_price": 0.2799849808216095,
"r2": -5.383824825286865,
"direction_accuracy": 0.4874141876430206
},
"baseline_metrics": {
"rmse_price": 6.5820665192088015,
"mae_price": 6.578734009235542,
"mse_price": 43.323599663289464
},
"prediction": {
"predicted_log_close": 6.082147121429443,
"predicted_close": 437.96855756983575,
"prediction_for_date": "2026-02-25",
"based_on_date": "2026-02-24"
}
}
7 changes: 6 additions & 1 deletion analysis/ui_mlp_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,12 @@ def predict_with_bundle(bundle: TrainedBundle, datasets: List[Dict[str, Any]]) -
with tempfile.TemporaryDirectory() as tmp_dir:
raw_dir = Path(tmp_dir)
_prepare_raw_files(datasets, raw_dir)
panel = run_prep(raw_dir=raw_dir, out_path=None, include_indicators=bundle.include_indicators)
panel = run_prep(
raw_dir=raw_dir,
out_path=None,
include_indicators=bundle.include_indicators,
drop_last_unlabeled=False,
)

feature_cols = bundle.feature_cols
if "label" not in panel.columns:
Expand Down
23 changes: 15 additions & 8 deletions prep_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _load_price(path: Path, tz: str, close_hour: int) -> pd.Series:
return pd.Series(grouped.values, index=grouped.index, name="close")


def _build_with_indicators(comex_path: Path, sh_path: Path) -> pd.DataFrame:
def _build_with_indicators(comex_path: Path, sh_path: Path, drop_last_unlabeled: bool = True) -> pd.DataFrame:
if compute_technical_indicators is None:
raise RuntimeError("compute_technical_indicators unavailable; set include_indicators=False")

Expand All @@ -56,11 +56,12 @@ def _build_with_indicators(comex_path: Path, sh_path: Path) -> pd.DataFrame:
df = base.join(comex_ind, how="inner").join(sh_ind, how="inner")
df = df.dropna().sort_index()
df["label"] = df["log_close_sh"].shift(-1)
df = df.dropna()
if drop_last_unlabeled:
df = df.dropna()
return df


def _build_minimal(comex_path: Path, sh_path: Path) -> pd.DataFrame:
def _build_minimal(comex_path: Path, sh_path: Path, drop_last_unlabeled: bool = True) -> pd.DataFrame:
comex = _load_price(comex_path, tz="America/New_York", close_hour=17)
sh = _load_price(sh_path, tz="Asia/Shanghai", close_hour=15)

Expand All @@ -69,11 +70,17 @@ def _build_minimal(comex_path: Path, sh_path: Path) -> pd.DataFrame:
df = pd.concat([comex_log, sh_log], axis=1, join="inner")
df = df.dropna().sort_index()
df["label"] = df["log_close_sh"].shift(-1)
df = df.dropna()
if drop_last_unlabeled:
df = df.dropna()
return df


def run_prep(raw_dir: Path, out_path: Optional[Path] = None, include_indicators: bool = False) -> pd.DataFrame:
def run_prep(
raw_dir: Path,
out_path: Optional[Path] = None,
include_indicators: bool = False,
drop_last_unlabeled: bool = True,
) -> pd.DataFrame:
"""Prepare feature table with optional technical indicators and label = next-day log_close_sh.

Default keeps indicators off to remain robust for very short synthetic datasets (tests).
Expand All @@ -84,12 +91,12 @@ def run_prep(raw_dir: Path, out_path: Optional[Path] = None, include_indicators:
sh_path = raw_dir / "shanghai_gold_9999.csv"

if include_indicators and compute_technical_indicators is not None:
df = _build_with_indicators(comex_path, sh_path)
df = _build_with_indicators(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled)
if len(df) < 3:
# Fallback when rolling-window indicators wipe out short samples
df = _build_minimal(comex_path, sh_path)
df = _build_minimal(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled)
else:
df = _build_minimal(comex_path, sh_path)
df = _build_minimal(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled)

if out_path is not None:
out_path = Path(out_path)
Expand Down
28 changes: 28 additions & 0 deletions tests/test_ui_mlp_service.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

import pandas as pd
import pytest

from analysis.ui_mlp_service import predict_with_bundle, train_from_uploaded_data
Expand Down Expand Up @@ -59,3 +60,30 @@ def test_train_fails_on_missing_required_columns(tmp_path: Path) -> None:
datasets = [{"name": "broken.csv", "data": [{"date": "2024-01-01", "open": 1.0, "close": 1.0}]}]
with pytest.raises(ValueError, match="missing required columns"):
train_from_uploaded_data(datasets=datasets, params={"epochs": 1}, save_path=tmp_path / "m.pth")


def test_predict_uses_latest_available_date(tmp_path: Path) -> None:
    """Prediction metadata must track the newest row of the uploaded data.

    The last row's date is bumped to 2024-02-24; after a short training run,
    the service should report that date as ``based_on_date`` and the next
    business day as ``prediction_for_date``.
    """
    uploaded_rows = _make_rows(1.1, n=24)
    uploaded_rows[-1]["date"] = "2024-02-24"
    uploads = [{"name": "shanghai.csv", "data": uploaded_rows}]

    # Deliberately tiny settings: the assertions concern date bookkeeping,
    # not model quality, so training only needs to complete quickly.
    training_params = {
        "seq_len": 2,
        "train_frac": 0.7,
        "val_frac": 0.1,
        "epochs": 3,
        "patience": 2,
        "batch_size": 8,
        "lr": 1e-3,
        "weight_decay": 1e-4,
        "dropout": 0.1,
        "hidden": 32,
        "seed": 42,
        "include_indicators": False,
    }

    trained = train_from_uploaded_data(
        datasets=uploads, params=training_params, save_path=tmp_path / "model.pth"
    )
    result = predict_with_bundle(trained, uploads)

    assert result["based_on_date"] == "2024-02-24"
    next_business_day = pd.Timestamp("2024-02-24") + pd.tseries.offsets.BDay(1)
    assert result["prediction_for_date"] == next_business_day.strftime("%Y-%m-%d")