From 935414ee7465bf8917e0c9485bb2d423e8df440d Mon Sep 17 00:00:00 2001
From: Casper Ng <casperngeen@gmail.com>
Date: Mon, 13 Apr 2026 21:50:47 +0800
Subject: [PATCH 1/4] Add submodule

---
 .gitmodules | 3 +++
 stg         | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 stg

diff --git a/.gitmodules b/.gitmodules
index 1f80847..17f0267 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,6 @@
 	path = freqtrade
 	url = https://github.com/mlsys-io/freqtrade.git
 	branch = develop
+[submodule "stg"]
+	path = stg
+	url = https://github.com/casperngeen/event-pred
diff --git a/stg b/stg
new file mode 160000
index 0000000..e07faf7
--- /dev/null
+++ b/stg
@@ -0,0 +1 @@
+Subproject commit e07faf7b6a0c7764230f4f359ebd1c48301be8d4

From 110004939d220b9633af6a94cc7b268dbe15e1cc Mon Sep 17 00:00:00 2001
From: Casper Ng <casperngeen@gmail.com>
Date: Wed, 15 Apr 2026 14:41:06 +0800
Subject: [PATCH 2/4] Add submodule branch

---
 .gitmodules | 1 +
 stg         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 17f0267..922498a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,4 @@
 [submodule "stg"]
 	path = stg
 	url = https://github.com/casperngeen/event-pred
+	branch = main
\ No newline at end of file
diff --git a/stg b/stg
index e07faf7..6b78226 160000
--- a/stg
+++ b/stg
@@ -1 +1 @@
-Subproject commit e07faf7b6a0c7764230f4f359ebd1c48301be8d4
+Subproject commit 6b782264d41f82b45c91947f81e21f0c725c4ba6

From 7145f9a0adf2ef15669becbadc1b9bbe700e8636 Mon Sep 17 00:00:00 2001
From: Casper Ng <casperngeen@gmail.com>
Date: Wed, 15 Apr 2026 22:32:32 +0800
Subject: [PATCH 3/4] Add kalshi data

---
 .gitignore             | 2 ++
 kalshi/.env.example    | 9 +++++++++
 stg                    | 2 +-
 utils/download_data.py | 6 ++++++
 4 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 kalshi/.env.example

diff --git a/.gitignore b/.gitignore
index 3858e46..41d8411 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ user_data/*
 user_data/data/usstock/
 user_data/data/polymarket/
 user_data/data/portfoliobench/
+user_data/data/kalshi/
 # Generated ML artifacts (regenerable via scripts/prepare_event_model.py)
 user_data/data/polymarket_ml/
 # Local development data, model checkpoints, and notebooks
@@ -30,6 +31,7 @@ freqtrade-plot.html
 freqtrade-profit-plot.html
 freqtrade/rpc/api_server/ui/*
 build_helpers/ta-lib/*
+kalshi/*.parquet
 
 # Macos related
 .DS_Store
diff --git a/kalshi/.env.example b/kalshi/.env.example
new file mode 100644
index 0000000..e94b38a
--- /dev/null
+++ b/kalshi/.env.example
@@ -0,0 +1,9 @@
+# Copy this file to kalshi/.env and adjust values for your local setup.
+# .env is gitignored — never commit it.
+
+# Path to the Kalshi data root (must contain markets/ and trades/ subdirs)
+KALSHI_DATA_DIR=user_data/data/kalshi/kalshi
+
+# Inclusive date range for implied mean series (YYYY-MM-DD)
+KALSHI_DATE_START=2022-05-01
+KALSHI_DATE_END=2025-08-01
diff --git a/stg b/stg
index 6b78226..bad38d1 160000
--- a/stg
+++ b/stg
@@ -1 +1 @@
-Subproject commit 6b782264d41f82b45c91947f81e21f0c725c4ba6
+Subproject commit bad38d1b71001039923166b8036afd0297a5b748
diff --git a/utils/download_data.py b/utils/download_data.py
index 8593e6b..568121b 100644
--- a/utils/download_data.py
+++ b/utils/download_data.py
@@ -45,6 +45,12 @@
         "description": "Polymarket prediction-market contracts",
         "output_dirs": ["user_data/data/polymarket"],
     },
+    "kalshi": {
+        "folder_id": "1DDo6uumqlsHeO4Ikvbo8LleEvWrXBKnP",
+        "folder_url": "https://drive.google.com/drive/folders/1DDo6uumqlsHeO4Ikvbo8LleEvWrXBKnP",
+        "description": "Kalshi prediction-market contracts",
+        "output_dirs": ["user_data/data/kalshi"],
+    },
 }
 
 PROJECT_ROOT = Path(__file__).resolve().parent.parent

From 6546912200ec27ec8348b9a5797b40cb6a3b4100 Mon Sep 17 00:00:00 2001
From: Casper Ng <casperngeen@gmail.com>
Date: Wed, 15 Apr 2026 22:33:12 +0800
Subject: [PATCH 4/4] Add baseline model for Kalshi event prediction

---
 kalshi/__init__.py |   0
 kalshi/baseline.py | 218 +++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml     |   1 +
 pyrightconfig.json |   8 ++
 4 files changed, 227 insertions(+)
 create mode 100644 kalshi/__init__.py
 create mode 100644 kalshi/baseline.py
 create mode 100644 pyrightconfig.json

diff --git a/kalshi/__init__.py b/kalshi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/kalshi/baseline.py b/kalshi/baseline.py
new file mode 100644
index 0000000..e969f1b
--- /dev/null
+++ b/kalshi/baseline.py
@@ -0,0 +1,218 @@
+"""Kalshi Economic Events — Implied Mean Surprise Direction Baseline.
+
+Recovers implied means from Kalshi threshold-based event markets
+(CPI, Core CPI, FED level/decision, Payrolls, Unemployment, GDP)
+via the stg_infra events submodule and trains a logistic regression
+baseline to predict surprise direction:
+
+  label=1  → actual print > implied mean  (upside surprise)
+  label=0  → actual print < implied mean  (downside surprise)
+
+Features come directly from the PDF-derived statistics returned by the
+stg_infra event series functions: implied_mean, implied_std, implied_skew,
+implied_kurtosis, implied_entropy, implied_median, n_submarkets.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+load_dotenv(Path(__file__).parent / ".env")
+
+# stg_infra uses two import roots:
+#   - PortfolioBench/stg          → makes `stg_infra.*` findable (namespace pkg)
+#   - PortfolioBench/stg/stg_infra → makes `stg.*` findable (installed pkg root)
+_STG_ROOT = Path(__file__).parent.parent / "stg"
+sys.path.insert(0, str(_STG_ROOT))
+sys.path.insert(0, str(_STG_ROOT / "stg_infra"))
+
+import numpy as np
+import polars as pl
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, accuracy_score
+from sklearn.preprocessing import StandardScaler
+
+from stg.events.cpi import compute_cpi_mom_series, compute_cpi_yoy_series
+from stg.events.core_cpi import compute_core_cpi_mom_series, compute_core_cpi_yoy_series
+from stg.events.fed import compute_fed_level_series, compute_fed_decision_series
+from stg.events.payrolls import compute_payrolls_series
+from stg.events.unemployment import compute_unemployment_series
+from stg.events.gdp import compute_gdp_series
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")
+log = logging.getLogger(__name__)
+
+DATA_DIR    = Path("user_data/data/kalshi/kalshi")
+TRAIN_RATIO = 0.7
+
+SERIES_ORDER = [
+    "cpi_mom", "cpi_yoy",
+    "core_cpi_mom", "core_cpi_yoy",
+    "fed_level", "decision_bps",
+    "payrolls",
+    "unemployment",
+    "gdp",
+]
+
+# ---------------------------------------------------------------------------
+# 1. Load data
+# ---------------------------------------------------------------------------
+log.info("Loading markets...")
+markets = pl.read_parquet(str(DATA_DIR / "markets/*.parquet"))
+
+log.info("Loading trades...")
+trades = pl.read_parquet(str(DATA_DIR / "trades/*.parquet"))
+
+log.info("Markets: %d rows | Trades: %d rows", len(markets), len(trades))
+
+# ---------------------------------------------------------------------------
+# 2. Compute implied mean series for every supported event type
+# ---------------------------------------------------------------------------
+log.info("Computing implied mean series...")
+
+_series_fns = [
+    compute_cpi_mom_series,
+    compute_cpi_yoy_series,
+    compute_core_cpi_mom_series,
+    compute_core_cpi_yoy_series,
+    compute_fed_level_series,
+    compute_fed_decision_series,
+    compute_payrolls_series,
+    compute_unemployment_series,
+    compute_gdp_series,
+]
+
+frames = []
+for fn in _series_fns:
+    df = fn(markets, trades)
+    if not df.is_empty():
+        frames.append(df)
+        log.info("  %-25s  %d rows  %d events",
+                 fn.__name__, len(df), df["event_ticker"].n_unique())
+
+if not frames:
+    sys.exit("No implied mean data found.")
+
+implied = pl.concat(frames, how="diagonal").sort(["event_ticker", "date"])
+log.info("Total implied-mean rows: %d across %d events",
+         len(implied), implied["event_ticker"].n_unique())
+
+# ---------------------------------------------------------------------------
+# 3. Label each observation
+# ---------------------------------------------------------------------------
+# Keep only rows where resolved_value is known and implied_mean is not null
+# and there is a clear surprise (skip exact ties)
+labeled = (
+    implied
+    .filter(
+        pl.col("resolved_value").is_not_null()
+        & pl.col("implied_mean").is_not_null()
+        & (pl.col("resolved_value") != pl.col("implied_mean"))
+    )
+    .with_columns(
+        pl.when(pl.col("resolved_value") > pl.col("implied_mean"))
+        .then(pl.lit(1))
+        .otherwise(pl.lit(0))
+        .alias("label")
+    )
+)
+
+log.info("Labeled observations: %d (up=%d, down=%d)",
+         len(labeled),
+         (labeled["label"] == 1).sum(),
+         (labeled["label"] == 0).sum())
+
+if labeled.is_empty():
+    sys.exit("No labeled observations after filtering.")
+
+# ---------------------------------------------------------------------------
+# 4. Chronological train / test split  (split on date, not shuffle)
+# ---------------------------------------------------------------------------
+all_dates = labeled["date"].sort()
+split_idx  = int(len(all_dates) * TRAIN_RATIO)
+split_date = all_dates[split_idx]
+
+train_df = labeled.filter(pl.col("date") < split_date)
+test_df  = labeled.filter(pl.col("date") >= split_date)
+
+log.info("Train: %d obs (%s → %s)", len(train_df),
+         train_df["date"].min(), train_df["date"].max())
+log.info("Test:  %d obs (%s → %s)", len(test_df),
+         test_df["date"].min(), test_df["date"].max())
+
+# ---------------------------------------------------------------------------
+# 5. Assemble feature matrix
+# ---------------------------------------------------------------------------
+FEATURE_COLS = [
+    "implied_mean",
+    "implied_std",
+    "implied_skew",
+    "implied_kurtosis",
+    "implied_entropy",
+    "implied_median",
+    "n_submarkets",
+]
+
+
+def to_xy(df: pl.DataFrame) -> tuple[np.ndarray, np.ndarray]:
+    X = df.select(FEATURE_COLS).to_numpy().astype(np.float64)
+    y = df["label"].to_numpy().astype(np.int64)
+    # Drop rows with any NaN feature (lag windows produce NaNs at start of series)
+    mask = ~np.isnan(X).any(axis=1)
+    return X[mask], y[mask]
+
+
+X_train, y_train = to_xy(train_df)
+X_test,  y_test  = to_xy(test_df)
+
+log.info("Train samples after NaN drop: %d (up=%d, down=%d)",
+         len(y_train), (y_train == 1).sum(), (y_train == 0).sum())
+log.info("Test  samples after NaN drop: %d (up=%d, down=%d)",
+         len(y_test), (y_test == 1).sum(), (y_test == 0).sum())
+
+if len(np.unique(y_train)) < 2:
+    sys.exit("Training set has only one class — cannot train classifier.")
+
+# ---------------------------------------------------------------------------
+# 6. Train logistic regression baseline
+# ---------------------------------------------------------------------------
+scaler = StandardScaler()
+X_train_s = scaler.fit_transform(X_train)
+X_test_s  = scaler.transform(X_test)
+
+model = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
+model.fit(X_train_s, y_train)
+
+y_pred = model.predict(X_test_s)
+
+# ---------------------------------------------------------------------------
+# 7. Report
+# ---------------------------------------------------------------------------
+print("\n" + "=" * 60)
+print("Implied Mean Surprise Direction Baseline (Logistic Regression)")
+print("=" * 60)
+print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
+print()
+print(classification_report(y_test, y_pred, target_names=["downside", "upside"]))
+
+coefs = model.coef_[0]
+print("Feature coefficients (sorted by magnitude):")
+for name, coef in sorted(zip(FEATURE_COLS, coefs), key=lambda x: abs(x[1]), reverse=True):
+    print(f"  {name:20s}  {coef:+.4f}")
+
+# Per-series breakdown
+print("\nPer-series accuracy:")
+for stype in SERIES_ORDER:
+    sub = test_df.filter(pl.col("series_type") == stype)
+    if sub.is_empty():
+        continue
+    Xs, ys = to_xy(sub)
+    if len(Xs) == 0:
+        continue
+    Xs_s = scaler.transform(Xs)
+    preds = model.predict(Xs_s)
+    acc = accuracy_score(ys, preds)
+    print(f"  {stype:20s}  n={len(ys):4d}  acc={acc:.4f}")
diff --git a/pyproject.toml b/pyproject.toml
index 08bb692..7bfb5bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,7 @@ dependencies = [
   "aiohttp",
   "cryptography",
   "sdnotify",
+  "python-dotenv",
   "python-dateutil",
   "pytz",
   "packaging",
diff --git a/pyrightconfig.json b/pyrightconfig.json
new file mode 100644
index 0000000..1d6060e
--- /dev/null
+++ b/pyrightconfig.json
@@ -0,0 +1,8 @@
+{
+  "venvPath": ".",
+  "venv": ".venv",
+  "extraPaths": [
+    "stg/stg_infra",
+    "stg"
+  ]
+}