diff --git a/analysis/gold_mlp_prediction_2026-02-25.json b/analysis/gold_mlp_prediction_2026-02-25.json new file mode 100644 index 0000000..e046fcc --- /dev/null +++ b/analysis/gold_mlp_prediction_2026-02-25.json @@ -0,0 +1,35 @@ +{ + "setup": { + "seq_len": 2, + "hidden": 128, + "dropout": 0.2, + "train_frac": 0.7, + "val_frac": 0.1, + "batch_size": 64, + "epochs": 80, + "patience": 20, + "lr": 0.001, + "weight_decay": 0.0001, + "include_indicators": true, + "seed": 42 + }, + "test_metrics": { + "rmse_price": 0.24380004778602185, + "mae_price": 0.19273792207241058, + "mse_price": 0.05943846330046654, + "r2": -0.3544858694076538, + "direction_accuracy": 0.5454545454545454 + }, + "baseline_metrics": { + "rmse_price": 6.5826802233882455, + "mae_price": 6.579346187018833, + "mse_price": 43.331678923386725 + }, + "prediction": { + "predicted_log_close": 7.6717987060546875, + "predicted_close": 2146.939687700816, + "prediction_for_date": "2026-02-25", + "based_on_date": "2026-02-24" + }, + "model_path": "analysis/models/gold_mlp_seq2_ep80_20260224.pt" +} \ No newline at end of file diff --git a/analysis/gold_transformer_prediction_2026-02-25.json b/analysis/gold_transformer_prediction_2026-02-25.json new file mode 100644 index 0000000..6b7da7a --- /dev/null +++ b/analysis/gold_transformer_prediction_2026-02-25.json @@ -0,0 +1,40 @@ +{ + "setup": { + "seq_len": 3, + "train_frac": 0.7, + "val_frac": 0.1, + "d_model": 60, + "nhead": 5, + "layers": 1, + "dim_ff": 256, + "dropout": 0.2, + "input_dropout": 0.1, + "batch_size": 32, + "epochs": 100, + "patience": 100, + "lr": 0.001, + "weight_decay": 0.0001, + "include_indicators": true, + "seed": 42, + "train_end_date": "2026-02-13", + "prediction_base_date": "2026-02-24" + }, + "test_metrics": { + "rmse_price": 0.5291360702329879, + "mae_price": 0.4818355441093445, + "mse_price": 0.2799849808216095, + "r2": -5.383824825286865, + "direction_accuracy": 0.4874141876430206 + }, + "baseline_metrics": { + "rmse_price": 6.5820665192088015, + "mae_price": 6.578734009235542, + "mse_price": 43.323599663289464 + }, + "prediction": { + "predicted_log_close": 6.082147121429443, + "predicted_close": 437.96855756983575, + "prediction_for_date": "2026-02-25", + "based_on_date": "2026-02-24" + } +} \ No newline at end of file diff --git a/analysis/ui_mlp_service.py b/analysis/ui_mlp_service.py index 866fa23..5ac569d 100644 --- a/analysis/ui_mlp_service.py +++ b/analysis/ui_mlp_service.py @@ -200,7 +200,12 @@ def predict_with_bundle(bundle: TrainedBundle, datasets: List[Dict[str, Any]]) - with tempfile.TemporaryDirectory() as tmp_dir: raw_dir = Path(tmp_dir) _prepare_raw_files(datasets, raw_dir) - panel = run_prep(raw_dir=raw_dir, out_path=None, include_indicators=bundle.include_indicators) + panel = run_prep( + raw_dir=raw_dir, + out_path=None, + include_indicators=bundle.include_indicators, + drop_last_unlabeled=False, + ) feature_cols = bundle.feature_cols if "label" not in panel.columns: diff --git a/prep_script.py b/prep_script.py index 64bef79..8d628ef 100644 --- a/prep_script.py +++ b/prep_script.py @@ -33,7 +33,7 @@ def _load_price(path: Path, tz: str, close_hour: int) -> pd.Series: return pd.Series(grouped.values, index=grouped.index, name="close") -def _build_with_indicators(comex_path: Path, sh_path: Path) -> pd.DataFrame: +def _build_with_indicators(comex_path: Path, sh_path: Path, drop_last_unlabeled: bool = True) -> pd.DataFrame: if compute_technical_indicators is None: raise RuntimeError("compute_technical_indicators unavailable; set include_indicators=False") @@ -56,11 +56,12 @@ def _build_with_indicators(comex_path: Path, sh_path: Path) -> pd.DataFrame: df = base.join(comex_ind, how="inner").join(sh_ind, how="inner") df = df.dropna().sort_index() df["label"] = df["log_close_sh"].shift(-1) - df = df.dropna() + if drop_last_unlabeled: + df = df.dropna() return df -def _build_minimal(comex_path: Path, sh_path: Path) -> pd.DataFrame: +def _build_minimal(comex_path: Path, sh_path: Path, drop_last_unlabeled: bool = True) -> pd.DataFrame: comex = _load_price(comex_path, tz="America/New_York", close_hour=17) sh = _load_price(sh_path, tz="Asia/Shanghai", close_hour=15) @@ -69,11 +70,17 @@ def _build_minimal(comex_path: Path, sh_path: Path) -> pd.DataFrame: df = pd.concat([comex_log, sh_log], axis=1, join="inner") df = df.dropna().sort_index() df["label"] = df["log_close_sh"].shift(-1) - df = df.dropna() + if drop_last_unlabeled: + df = df.dropna() return df -def run_prep(raw_dir: Path, out_path: Optional[Path] = None, include_indicators: bool = False) -> pd.DataFrame: +def run_prep( + raw_dir: Path, + out_path: Optional[Path] = None, + include_indicators: bool = False, + drop_last_unlabeled: bool = True, +) -> pd.DataFrame: """Prepare feature table with optional technical indicators and label = next-day log_close_sh. Default keeps indicators off to remain robust for very short synthetic datasets (tests). @@ -84,12 +91,12 @@ def run_prep(raw_dir: Path, out_path: Optional[Path] = None, include_indicators: sh_path = raw_dir / "shanghai_gold_9999.csv" if include_indicators and compute_technical_indicators is not None: - df = _build_with_indicators(comex_path, sh_path) + df = _build_with_indicators(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled) if len(df) < 3: # Fallback when rolling-window indicators wipe out short samples - df = _build_minimal(comex_path, sh_path) + df = _build_minimal(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled) else: - df = _build_minimal(comex_path, sh_path) + df = _build_minimal(comex_path, sh_path, drop_last_unlabeled=drop_last_unlabeled) if out_path is not None: out_path = Path(out_path) diff --git a/tests/test_ui_mlp_service.py b/tests/test_ui_mlp_service.py index 2ba64f6..7cca8b7 100644 --- a/tests/test_ui_mlp_service.py +++ b/tests/test_ui_mlp_service.py @@ -1,5 +1,6 @@ from pathlib import Path +import pandas as pd import pytest from analysis.ui_mlp_service import predict_with_bundle, train_from_uploaded_data @@ -59,3 +60,30 @@ def test_train_fails_on_missing_required_columns(tmp_path: Path) -> None: datasets = [{"name": "broken.csv", "data": [{"date": "2024-01-01", "open": 1.0, "close": 1.0}]}] with pytest.raises(ValueError, match="missing required columns"): train_from_uploaded_data(datasets=datasets, params={"epochs": 1}, save_path=tmp_path / "m.pth") + + +def test_predict_uses_latest_available_date(tmp_path: Path) -> None: + rows = _make_rows(1.1, n=24) + rows[-1]["date"] = "2024-02-24" + datasets = [{"name": "shanghai.csv", "data": rows}] + params = { + "seq_len": 2, + "train_frac": 0.7, + "val_frac": 0.1, + "epochs": 3, + "patience": 2, + "batch_size": 8, + "lr": 1e-3, + "weight_decay": 1e-4, + "dropout": 0.1, + "hidden": 32, + "seed": 42, + "include_indicators": False, + } + + bundle = train_from_uploaded_data(datasets=datasets, params=params, save_path=tmp_path / "model.pth") + pred = predict_with_bundle(bundle, datasets) + + assert pred["based_on_date"] == "2024-02-24" + expected_next_business_day = (pd.Timestamp("2024-02-24") + pd.tseries.offsets.BDay(1)).strftime("%Y-%m-%d") + assert pred["prediction_for_date"] == expected_next_business_day