diff --git a/PROGRESS.md b/PROGRESS.md index ed2a86c..8475bbf 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -38,3 +38,4 @@ Phase docs live in `docs/progress/` (untracked, local only). | 8 | Universe expansion Session 2 | COMPLETE — graph + filings re-ingestion on 140 tickers; supply_edges 247→**1,659**, ownership_edges 7,446→**34,843** (140/140 covered), board_edges 104→**326** (39/140), centrality_history 1,740→**8,120**; ISSUER_NAME_MAP 37→150 fragments + AMAT CIK flip leftover from Session 1; HGT embeddings deferred to Session 3 retrain | `docs/progress/phase_8.md` | | 8 | Factor backtest Session 3 | COMPLETE — multi-horizon IC backtest on 140-ticker universe (21d/63d/126d); Phase 5 t-stats shown to be N=30 artifacts (momentum_12_1 126d: 4.149→1.506); graph_delta_eigenvector sign-flipped (t=−3.194 at 63d) → wired as centrality penalty overlay in portfolio.py; graph_customer_momentum definitively null (CLOSED); rolling_registry rebuilt (406 rows); **Phase 8 paper trader CAGR +5.97%, Sharpe 0.374, Max DD −36.68%** — degradation vs Phase 7A reflects accurate N=140 factor gates + extended cash periods | `docs/progress/phase_8.md` | | 9 | EDGAR XBRL fundamentals Session 1 | COMPLETE — migration 0011 adds form_type/accession_number/UNIQUE; `form_xbrl.py` parser (filed≥end integrity invariant, 5 metrics, 4 revenue aliases); 74,662 rows / 137 of 140 tickers covered; 4 factors; **`fundamental_margin_compression` t=+4.834 at 126d — first NEXUS factor to PASS HLZ M=400 Bonferroni** (composer sign-flipped vs literature: compression = buy; CALM-regime t=+6.35; sub-window late-third t=+5.13 — strengthening, not decaying); registered status='approved' in signal_registry; rolling_registry refreshed (464 rows); ROA decayed (late-third t=-0.06) → NOT registered; rd_intensity / asset_growth null; **paper trader CAGR +8.72%, Sharpe 0.488, Max DD -32.68%** (vs Phase 8 baseline +5.97% / 0.374 / -36.68%) | `docs/progress/phase_9.md` | +| 9 | HGT retraining Session 2 | COMPLETE — retrained HGT on 140-ticker graph (no code changes; metadata extracted dynamically); val AUC **0.9807** at epoch 280 (vs 0.9803 / e=240 on the prior 30-ticker run); 7m 03s wall-clock; `MODEL_VERSION` bumped `hgt_link_pred_v1` → **`hgt_link_pred_v2`**; node_embeddings re-backfilled at 58 monthly snapshots → **8,120 rows** (140 × 58, dim=64); embedding validation passed (cos(NVDA,AMD)=0.98 > cos(NVDA,ARW)=0.63; per-component std median 0.05); **`graph_gnn_embedding_drift` IC backtest NULL at all horizons** (t=+0.382 @ 21d / +0.524 @ 63d / +0.368 @ 126d on N=52..57; HLZ fail by 10×); registered `status='rejected'` in signal_registry with full evidence record; paper trader unchanged from Phase 9 Session 1 | `docs/progress/phase_9.md` | diff --git a/nexus/graph/backfill.py b/nexus/graph/backfill.py index 59d4d61..81a61ca 100644 --- a/nexus/graph/backfill.py +++ b/nexus/graph/backfill.py @@ -34,7 +34,7 @@ ) from nexus.graph.gnn.model import HGTLinkPredictor -MODEL_VERSION = "hgt_link_pred_v1" +MODEL_VERSION = "hgt_link_pred_v2" CHECKPOINT_PATH = Path(__file__).resolve().parents[2] / "models" / "hgt_link_pred.pt" diff --git a/research/_validate_embeddings.py b/research/_validate_embeddings.py new file mode 100644 index 0000000..cb13005 --- /dev/null +++ b/research/_validate_embeddings.py @@ -0,0 +1,128 @@ +"""Phase 9 Session 2 — embedding validation spot-checks. + +Ephemeral helper; not part of the permanent code path. Confirms the +retrained HGT embeddings are not degenerate and that semiconductor +peer/distributor cosine ordering matches structural intuition before +running the IC backtest. +""" +from __future__ import annotations + +import sys +from datetime import date + +import numpy as np +import pandas as pd +from sqlalchemy import create_engine, text + +from nexus.config import settings + + +def _load_latest_panel(engine) -> tuple[pd.DataFrame, date]: + with engine.connect() as c: + latest = c.execute(text("SELECT MAX(snapshot_date) FROM node_embeddings")).scalar() + df = pd.read_sql( + text( + """ + SELECT c.ticker, ne.embedding + FROM node_embeddings ne + JOIN companies c ON c.id = ne.company_id + WHERE ne.snapshot_date = :snap + """ + ), + c, + params={"snap": latest}, + ) + return df, latest + + +def _cos(a: np.ndarray, b: np.ndarray) -> float: + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))) + + +def main() -> None: + engine = create_engine(settings.database_url_sync) + + # Global stats + with engine.connect() as c: + stats = c.execute( + text( + """ + SELECT COUNT(*) n_rows, + COUNT(DISTINCT company_id) n_co, + COUNT(DISTINCT snapshot_date) n_snap, + MIN(snapshot_date) min_snap, + MAX(snapshot_date) max_snap, + MIN(embedding_dim) min_dim, + MAX(embedding_dim) max_dim, + MIN(model_version) ver + FROM node_embeddings + """ + ) + ).one() + print("[GLOBAL]") + print(f" rows = {stats.n_rows}") + print(f" companies = {stats.n_co}") + print(f" snapshot dates = {stats.n_snap}") + print(f" date range = {stats.min_snap} → {stats.max_snap}") + print(f" dim range = [{stats.min_dim}, {stats.max_dim}]") + print(f" model_version = {stats.ver}") + + # Latest snapshot panel + df, latest = _load_latest_panel(engine) + print(f"\n[LATEST SNAPSHOT] {latest} n_tickers={len(df)}") + + emb = np.vstack([np.asarray(e, dtype=np.float32) for e in df["embedding"]]) + print(f" emb shape = {emb.shape}") + print(f" per-component std (median across dims) = {float(np.median(emb.std(axis=0))):.6f}") + print(f" per-component std (min / max) = {float(emb.std(axis=0).min()):.6f} / {float(emb.std(axis=0).max()):.6f}") + print(f" per-row L2 norm (min / median / max) = {float(np.linalg.norm(emb, axis=1).min()):.4f} / {float(np.median(np.linalg.norm(emb, axis=1))):.4f} / {float(np.linalg.norm(emb, axis=1).max()):.4f}") + + tickers = df["ticker"].tolist() + by_t = {t: emb[i] for i, t in enumerate(tickers)} + + def pair(a: str, b: str) -> float | None: + if a not in by_t or b not in by_t: + return None + return _cos(by_t[a], by_t[b]) + + print("\n[PEER vs DISTRIBUTOR — gate: peer cos > distributor cos]") + pairs = [ + ("NVDA", "AMD", "fab-less GPU/CPU peers"), + ("NVDA", "ARW", "GPU designer vs electronics distributor"), + ("AMD", "INTC", "x86 CPU peers"), + ("AMD", "AVT", "CPU designer vs distributor"), + ("LRCX", "AMAT", "wafer-fab equipment peers"), + ("LRCX", "ARW", "equipment maker vs distributor"), + ] + for a, b, note in pairs: + v = pair(a, b) + print(f" cos({a:>6}, {b:>6}) = {v:.4f} [{note}]" if v is not None else f" cos({a:>6}, {b:>6}) = N/A") + + print("\n[SUPPLY-CHAIN — gate: documented supply-chain partner higher than unrelated]") + chain_pairs = [ + ("NVDA", "TSM", "fab customer ↔ foundry"), # TSMC ADR ticker in companies? + ("NVDA", "KO", "unrelated baseline"), + ("AMD", "TSM", "fab customer ↔ foundry"), + ("ASML", "TSM", "EUV supplier ↔ foundry customer"), + ("LRCX", "TSM", "equipment supplier ↔ foundry customer"), + ("LRCX", "KO", "unrelated baseline"), + ] + for a, b, note in chain_pairs: + v = pair(a, b) + print(f" cos({a:>6}, {b:>6}) = {v:.4f} [{note}]" if v is not None else f" cos({a:>6}, {b:>6}) = N/A [{note}]") + + # Sanity: full cosine matrix - find NVDA's top-5 nearest + print("\n[NVDA top-5 nearest neighbours]") + nvda = by_t.get("NVDA") + if nvda is not None: + sims = {t: _cos(nvda, e) for t, e in by_t.items() if t != "NVDA"} + top = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[:5] + for t, s in top: + print(f" {t:>6} cos={s:.4f}") + + engine.dispose() + print("\n[OK] validation finished") + + +if __name__ == "__main__": + main() diff --git a/research/phase9_gnn_drift_backtest.py b/research/phase9_gnn_drift_backtest.py new file mode 100644 index 0000000..1e3ab64 --- /dev/null +++ b/research/phase9_gnn_drift_backtest.py @@ -0,0 +1,183 @@ +"""Phase 9 Session 2 — Multi-horizon IC backtest for graph_gnn_embedding_drift. + +Reads node_embeddings (140 companies x 58 monthly snapshots, retrained HGT +hgt_link_pred_v1, best val AUC 0.9807). Computes IC at 21d / 63d / 126d +horizons on the 140-ticker UNIVERSE. Reports per-horizon IC table, +HLZ Bonferroni correction, CALM/NON-CALM regime split for |t| > 1.5, +and sub-window stability (early/mid/late thirds). + +Read-only — no DB writes. signal_registry untouched. +""" +from __future__ import annotations + +import math +from datetime import date + +import numpy as np +from sqlalchemy import create_engine + +from nexus.config import settings +from nexus.signals.backtest import ( + CALM_FSI_THRESHOLD, + FactorBacktest, + _aggregate, + _compute_forward_returns, + _load_fsi_by_date, + _load_fundamentals_panel, + compute_factor_ics, + load_all_panels, +) +from nexus.signals.hlz import format_table, hlz_correct + +HORIZONS: list[int] = [21, 63, 126] +FACTOR = "graph_gnn_embedding_drift" + + +def _t_stat(arr: np.ndarray) -> float: + if len(arr) < 2: + return float("nan") + s = arr.std(ddof=1) + if not (s > 0): + return float("nan") + return float(math.sqrt(len(arr)) * arr.mean() / s) + + +def _regime_split(periods: list[date], ics: list[float], + fsi_by_date: dict[date, float | None]) -> None: + calm, stressed = [], [] + for p, ic in zip(periods, ics): + fsi = fsi_by_date.get(p) + if fsi is None: + continue + (calm if fsi < CALM_FSI_THRESHOLD else stressed).append(ic) + for label, vals in [("CALM (FSI<0)", calm), ("NON-CALM(FSI>=0)", stressed)]: + if len(vals) < 2: + print(f" {label:18s} N={len(vals):>3d} (insufficient)") + continue + arr = np.asarray(vals) + print( + f" {label:18s} N={len(arr):>3d} " + f"mean_IC={arr.mean():>+7.4f} t={_t_stat(arr):>+6.3f}" + ) + + +def _subwindow_thirds(periods: list[date], ics: list[float]) -> None: + n = len(ics) + if n < 9: + print(" (insufficient for thirds)") + return + third = n // 3 + chunks = [ + ("Early", 0, third), + ("Mid", third, 2 * third), + ("Late", 2 * third, n), + ] + for label, lo, hi in chunks: + arr = np.asarray(ics[lo:hi]) + if len(arr) < 2: + continue + win = f"{periods[lo]} → {periods[hi - 1]}" + print( + f" {label:5s} {win} N={len(arr):>3d} " + f"mean_IC={arr.mean():>+7.4f} t={_t_stat(arr):>+6.3f}" + ) + + +def main() -> None: + engine = create_engine(settings.database_url_sync) + print("[*] Loading panels...", flush=True) + price_panel, centrality_panel, embedding_panel = load_all_panels(engine) + fundamentals_panel = _load_fundamentals_panel(engine) + snap_dates = centrality_panel.sorted_dates + fsi_by_date = _load_fsi_by_date(snap_dates, engine) + engine.dispose() + + n_emb_tickers = len({k[0] for k in embedding_panel.by_pair.keys()}) \ + if hasattr(embedding_panel, "by_pair") else "?" + print( + f"[*] Snapshots: {len(snap_dates)} " + f"({snap_dates[0]} → {snap_dates[-1]})", flush=True, + ) + print(f"[*] Factor: {FACTOR}", flush=True) + + t_by_horizon: dict[int, float] = {} + per_horizon_records: dict[int, tuple[list[float], list[date], FactorBacktest]] = {} + + for horizon_days in HORIZONS: + fwd = _compute_forward_returns( + snap_dates, price_panel, forward_days=horizon_days + ) + n_pairs = sum(len(v) for v in fwd.values()) + print( + f"\n[*] horizon={horizon_days}d — forward return pairs: {n_pairs}", + flush=True, + ) + + ics, sizes, periods = compute_factor_ics( + FACTOR, + snap_dates, + fwd, + price_panel, + centrality_panel, + embedding_panel, + fundamentals_panel=fundamentals_panel, + ) + r = _aggregate(FACTOR, ics, sizes, periods) + per_horizon_records[horizon_days] = (ics, periods, r) + + print( + f" {'factor':38s} {'N':>4s} {'mean_IC':>8s} " + f"{'std_IC':>8s} {'t_stat':>7s} {'avg_xs':>7s} period" + ) + print(" " + "-" * 96) + period_s = ( + f"{r.first_period} → {r.last_period}" + if r.first_period and r.last_period else "(insufficient)" + ) + print( + f" {r.name:38s} {r.n_periods:>4d} {r.mean_ic:>+8.4f} " + f"{r.std_ic:>8.4f} {r.t_stat:>+7.3f} {r.avg_xs_size:>7.1f} {period_s}" + ) + if math.isfinite(r.t_stat): + t_by_horizon[horizon_days] = r.t_stat + df_for_t = max(r.n_periods - 1, 1) + print(f"\n HLZ correction (df={df_for_t})") + hlz_out = hlz_correct({FACTOR: r.t_stat}, df=df_for_t) + print("\n" + format_table(hlz_out)) + + if math.isfinite(r.t_stat) and abs(r.t_stat) > 1.5: + print("\n Regime split (|t| > 1.5):") + _regime_split(periods, ics, fsi_by_date) + print("\n Sub-window thirds:") + _subwindow_thirds(periods, ics) + + print(f"\n{'=' * 96}") + print(" Phase 9 Session 2 — Decision matrix verdict") + print(f"{'=' * 96}") + print( + f" {'factor':38s} {'t@21d':>7s} {'t@63d':>7s} " + f"{'t@126d':>7s} verdict" + ) + print(" " + "-" * 92) + cells = [] + for h in HORIZONS: + t = t_by_horizon.get(h, float("nan")) + cells.append(f"{t:>+7.3f}" if math.isfinite(t) else " nan") + finite = [t for t in t_by_horizon.values() if math.isfinite(t)] + if not finite: + verdict = "INSUFFICIENT_DATA" + else: + abs_max = max(abs(t) for t in finite) + if abs_max > 3.0: + verdict = "RECOMMEND_REGISTER (|t|>3 at one horizon)" + elif abs_max >= 2.0: + verdict = "RECOMMEND_MONITOR (2<=|t|<=3)" + else: + verdict = "NULL (|t|<2 all horizons)" + print(f" {FACTOR:38s} {cells[0]} {cells[1]} {cells[2]} {verdict}") + + print("\n[*] Done. No DB writes performed. signal_registry untouched.") + + +if __name__ == "__main__": + main() diff --git a/scripts/register_gnn_embedding_drift_rejected.py b/scripts/register_gnn_embedding_drift_rejected.py new file mode 100644 index 0000000..ec1a190 --- /dev/null +++ b/scripts/register_gnn_embedding_drift_rejected.py @@ -0,0 +1,93 @@ +"""Phase 9 Session 2 — register graph_gnn_embedding_drift as rejected. + +The factor was excluded from every Phase 8 backtest because the 30-ticker +HGT embeddings were out-of-distribution on the 140-ticker universe. +Phase 9 Session 2 retrained HGT on the expanded graph (val AUC 0.9807, +model_version hgt_link_pred_v2, 8,120 embedding rows) and re-ran the +multi-horizon IC backtest. + +Result: NULL at full power. |t| < 1 at every horizon (+0.382 @ 21d, ++0.524 @ 63d, +0.368 @ 126d) on N = 52..57 monthly rebalances with +average cross-section ~138.6. HLZ Bonferroni fails by an order of +magnitude at every horizon. + +Writing status='rejected' so future sessions see the verdict in +signal_registry without re-running the test. The factor remains +computable on demand via compute_gnn_embedding_drift(as_of_date); +this commit only records the registry verdict. + +Idempotent INSERT ... ON CONFLICT (name) DO UPDATE. +""" +import json + +from sqlalchemy import create_engine, text + +from nexus.config import settings + +_FACTOR_NAME = "graph_gnn_embedding_drift" + +_REGIME_PROFILE = { + "role": "graph_gnn_drift", + "verdict": "null_at_full_power", + "phase": "9.2", + "model_version": "hgt_link_pred_v2", + "universe_size": 140, + "horizons": { + "21d": {"n": 57, "mean_ic": +0.0075, "t_stat": +0.382, "hlz_passes": False}, + "63d": {"n": 55, "mean_ic": +0.0101, "t_stat": +0.524, "hlz_passes": False}, + "126d": {"n": 52, "mean_ic": +0.0077, "t_stat": +0.368, "hlz_passes": False}, + }, + "evidence": ( + "Retrained HGT on 140-ticker graph (val AUC 0.9807). Embedding " + "validation passed structural ordering checks (cos(NVDA,AMD)=0.98 " + "> cos(NVDA,ARW)=0.63 etc.) — embeddings are not degenerate. " + "Month-over-month cosine drift in the structurally meaningful " + "embeddings does not predict forward returns on this universe." + ), + "phase8_30tk_finding": ( + "At N=30 the factor was DECLINING (mean_IC -0.006). The N=140 " + "result confirms the factor is genuinely null, not a power issue." + ), +} + + +def main() -> None: + engine = create_engine(settings.database_url_sync) + with engine.begin() as conn: + result = conn.execute( + text( + """ + INSERT INTO signal_registry + (name, type, tier, t_stat, hlz_passes, status, regime_profile) + VALUES + (:name, :type, :tier, :t_stat, :hlz_passes, :status, + CAST(:regime_profile AS JSONB)) + ON CONFLICT (name) DO UPDATE + SET t_stat = EXCLUDED.t_stat, + hlz_passes = EXCLUDED.hlz_passes, + status = EXCLUDED.status, + regime_profile = EXCLUDED.regime_profile + RETURNING signal_id, name, status, t_stat, hlz_passes + """ + ), + { + "name": _FACTOR_NAME, + "type": "graph", + "tier": "A", + "t_stat": 0.524, # highest |t| across the three tested horizons (63d) + "hlz_passes": False, + "status": "rejected", + "regime_profile": json.dumps(_REGIME_PROFILE), + }, + ) + row = result.fetchone() + + engine.dispose() + print( + f"[+] signal_registry: signal_id={row.signal_id} name={row.name} " + f"status={row.status} t_stat={row.t_stat} hlz_passes={row.hlz_passes}" + ) + + +if __name__ == "__main__": + main()