Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions PROGRESS.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ Phase docs live in `docs/progress/` (untracked, local only).
| 8 | Universe expansion Session 2 | COMPLETE — graph + filings re-ingestion on 140 tickers; supply_edges 247→**1,659**, ownership_edges 7,446→**34,843** (140/140 covered), board_edges 104→**326** (39/140), centrality_history 1,740→**8,120**; ISSUER_NAME_MAP 37→150 fragments + AMAT CIK flip leftover from Session 1; HGT embeddings deferred to Session 3 retrain | `docs/progress/phase_8.md` |
| 8 | Factor backtest Session 3 | COMPLETE — multi-horizon IC backtest on 140-ticker universe (21d/63d/126d); Phase 5 t-stats shown to be N=30 artifacts (momentum_12_1 126d: 4.149→1.506); graph_delta_eigenvector sign-flipped (t=−3.194 at 63d) → wired as centrality penalty overlay in portfolio.py; graph_customer_momentum definitively null (CLOSED); rolling_registry rebuilt (406 rows); **Phase 8 paper trader CAGR +5.97%, Sharpe 0.374, Max DD −36.68%** — degradation vs Phase 7A reflects accurate N=140 factor gates + extended cash periods | `docs/progress/phase_8.md` |
| 9 | EDGAR XBRL fundamentals Session 1 | COMPLETE — migration 0011 adds form_type/accession_number/UNIQUE; `form_xbrl.py` parser (filed≥end integrity invariant, 5 metrics, 4 revenue aliases); 74,662 rows / 137 of 140 tickers covered; 4 factors; **`fundamental_margin_compression` t=+4.834 at 126d — first NEXUS factor to PASS HLZ M=400 Bonferroni** (composer sign-flipped vs literature: compression = buy; CALM-regime t=+6.35; sub-window late-third t=+5.13 — strengthening, not decaying); registered status='approved' in signal_registry; rolling_registry refreshed (464 rows); ROA decayed (late-third t=-0.06) → NOT registered; rd_intensity / asset_growth null; **paper trader CAGR +8.72%, Sharpe 0.488, Max DD -32.68%** (vs Phase 8 baseline +5.97% / 0.374 / -36.68%) | `docs/progress/phase_9.md` |
| 9 | HGT retraining Session 2 | COMPLETE — retrained HGT on 140-ticker graph (no code changes; metadata extracted dynamically); val AUC **0.9807** at epoch 280 (vs 0.9803 / e=240 on the prior 30-ticker run); 7m 03s wall-clock; `MODEL_VERSION` bumped `hgt_link_pred_v1` → **`hgt_link_pred_v2`**; node_embeddings re-backfilled at 58 monthly snapshots → **8,120 rows** (140 × 58, dim=64); embedding validation passed (cos(NVDA,AMD)=0.98 > cos(NVDA,ARW)=0.63; per-component std median 0.05); **`graph_gnn_embedding_drift` IC backtest NULL at all horizons** (t=+0.382 @ 21d / +0.524 @ 63d / +0.368 @ 126d on N=52..57; HLZ fail by 10×); registered `status='rejected'` in signal_registry with full evidence record; paper trader unchanged from Phase 9 Session 1 | `docs/progress/phase_9.md` |
2 changes: 1 addition & 1 deletion nexus/graph/backfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)
from nexus.graph.gnn.model import HGTLinkPredictor

MODEL_VERSION = "hgt_link_pred_v1"
MODEL_VERSION = "hgt_link_pred_v2"
CHECKPOINT_PATH = Path(__file__).resolve().parents[2] / "models" / "hgt_link_pred.pt"


Expand Down
128 changes: 128 additions & 0 deletions research/_validate_embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Phase 9 Session 2 — embedding validation spot-checks.

Ephemeral helper; not part of the permanent code path. Confirms the
retrained HGT embeddings are not degenerate and that semiconductor
peer/distributor cosine ordering matches structural intuition before
running the IC backtest.
"""
from __future__ import annotations

import sys
from datetime import date

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

from nexus.config import settings


def _load_latest_panel(engine) -> tuple[pd.DataFrame, date]:
with engine.connect() as c:
latest = c.execute(text("SELECT MAX(snapshot_date) FROM node_embeddings")).scalar()
df = pd.read_sql(
text(
"""
SELECT c.ticker, ne.embedding
FROM node_embeddings ne
JOIN companies c ON c.id = ne.company_id
WHERE ne.snapshot_date = :snap
"""
),
c,
params={"snap": latest},
)
return df, latest


def _cos(a: np.ndarray, b: np.ndarray) -> float:
return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


def main() -> None:
engine = create_engine(settings.database_url_sync)

# Global stats
with engine.connect() as c:
stats = c.execute(
text(
"""
SELECT COUNT(*) n_rows,
COUNT(DISTINCT company_id) n_co,
COUNT(DISTINCT snapshot_date) n_snap,
MIN(snapshot_date) min_snap,
MAX(snapshot_date) max_snap,
MIN(embedding_dim) min_dim,
MAX(embedding_dim) max_dim,
MIN(model_version) ver
FROM node_embeddings
"""
)
).one()
print("[GLOBAL]")
print(f" rows = {stats.n_rows}")
print(f" companies = {stats.n_co}")
print(f" snapshot dates = {stats.n_snap}")
print(f" date range = {stats.min_snap} → {stats.max_snap}")
print(f" dim range = [{stats.min_dim}, {stats.max_dim}]")
print(f" model_version = {stats.ver}")

# Latest snapshot panel
df, latest = _load_latest_panel(engine)
print(f"\n[LATEST SNAPSHOT] {latest} n_tickers={len(df)}")

emb = np.vstack([np.asarray(e, dtype=np.float32) for e in df["embedding"]])
print(f" emb shape = {emb.shape}")
print(f" per-component std (median across dims) = {float(np.median(emb.std(axis=0))):.6f}")
print(f" per-component std (min / max) = {float(emb.std(axis=0).min()):.6f} / {float(emb.std(axis=0).max()):.6f}")
print(f" per-row L2 norm (min / median / max) = {float(np.linalg.norm(emb, axis=1).min()):.4f} / {float(np.median(np.linalg.norm(emb, axis=1))):.4f} / {float(np.linalg.norm(emb, axis=1).max()):.4f}")

tickers = df["ticker"].tolist()
by_t = {t: emb[i] for i, t in enumerate(tickers)}

def pair(a: str, b: str) -> float | None:
if a not in by_t or b not in by_t:
return None
return _cos(by_t[a], by_t[b])

print("\n[PEER vs DISTRIBUTOR — gate: peer cos > distributor cos]")
pairs = [
("NVDA", "AMD", "fab-less GPU/CPU peers"),
("NVDA", "ARW", "GPU designer vs electronics distributor"),
("AMD", "INTC", "x86 CPU peers"),
("AMD", "AVT", "CPU designer vs distributor"),
("LRCX", "AMAT", "wafer-fab equipment peers"),
("LRCX", "ARW", "equipment maker vs distributor"),
]
for a, b, note in pairs:
v = pair(a, b)
print(f" cos({a:>6}, {b:>6}) = {v:.4f} [{note}]" if v is not None else f" cos({a:>6}, {b:>6}) = N/A")

print("\n[SUPPLY-CHAIN — gate: documented supply-chain partner higher than unrelated]")
chain_pairs = [
("NVDA", "TSM", "fab customer ↔ foundry"), # TSMC ADR ticker in companies?
("NVDA", "KO", "unrelated baseline"),
("AMD", "TSM", "fab customer ↔ foundry"),
("ASML", "TSM", "EUV supplier ↔ foundry customer"),
("LRCX", "TSM", "equipment supplier ↔ foundry customer"),
("LRCX", "KO", "unrelated baseline"),
]
for a, b, note in chain_pairs:
v = pair(a, b)
print(f" cos({a:>6}, {b:>6}) = {v:.4f} [{note}]" if v is not None else f" cos({a:>6}, {b:>6}) = N/A [{note}]")

# Sanity: full cosine matrix - find NVDA's top-5 nearest
print("\n[NVDA top-5 nearest neighbours]")
nvda = by_t.get("NVDA")
if nvda is not None:
sims = {t: _cos(nvda, e) for t, e in by_t.items() if t != "NVDA"}
top = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[:5]
for t, s in top:
print(f" {t:>6} cos={s:.4f}")

engine.dispose()
print("\n[OK] validation finished")


if __name__ == "__main__":
main()
183 changes: 183 additions & 0 deletions research/phase9_gnn_drift_backtest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Phase 9 Session 2 — Multi-horizon IC backtest for graph_gnn_embedding_drift.

Reads node_embeddings (140 companies x 58 monthly snapshots, retrained HGT
hgt_link_pred_v1, best val AUC 0.9807). Computes IC at 21d / 63d / 126d
horizons on the 140-ticker UNIVERSE. Reports per-horizon IC table,
HLZ Bonferroni correction, CALM/NON-CALM regime split for |t| > 1.5,
and sub-window stability (early/mid/late thirds).

Read-only — no DB writes. signal_registry untouched.
"""
from __future__ import annotations

import math
from datetime import date

import numpy as np
from sqlalchemy import create_engine

from nexus.config import settings
from nexus.signals.backtest import (
CALM_FSI_THRESHOLD,
FactorBacktest,
_aggregate,
_compute_forward_returns,
_load_fsi_by_date,
_load_fundamentals_panel,
compute_factor_ics,
load_all_panels,
)
from nexus.signals.hlz import format_table, hlz_correct

HORIZONS: list[int] = [21, 63, 126]
FACTOR = "graph_gnn_embedding_drift"


def _t_stat(arr: np.ndarray) -> float:
if len(arr) < 2:
return float("nan")
s = arr.std(ddof=1)
if not (s > 0):
return float("nan")
return float(math.sqrt(len(arr)) * arr.mean() / s)


def _regime_split(periods: list[date], ics: list[float],
fsi_by_date: dict[date, float | None]) -> None:
calm, stressed = [], []
for p, ic in zip(periods, ics):
fsi = fsi_by_date.get(p)
if fsi is None:
continue
(calm if fsi < CALM_FSI_THRESHOLD else stressed).append(ic)
for label, vals in [("CALM (FSI<0)", calm), ("NON-CALM(FSI>=0)", stressed)]:
if len(vals) < 2:
print(f" {label:18s} N={len(vals):>3d} (insufficient)")
continue
arr = np.asarray(vals)
print(
f" {label:18s} N={len(arr):>3d} "
f"mean_IC={arr.mean():>+7.4f} t={_t_stat(arr):>+6.3f}"
)


def _subwindow_thirds(periods: list[date], ics: list[float]) -> None:
n = len(ics)
if n < 9:
print(" (insufficient for thirds)")
return
third = n // 3
chunks = [
("Early", 0, third),
("Mid", third, 2 * third),
("Late", 2 * third, n),
]
for label, lo, hi in chunks:
arr = np.asarray(ics[lo:hi])
if len(arr) < 2:
continue
win = f"{periods[lo]} → {periods[hi - 1]}"
print(
f" {label:5s} {win} N={len(arr):>3d} "
f"mean_IC={arr.mean():>+7.4f} t={_t_stat(arr):>+6.3f}"
)


def main() -> None:
engine = create_engine(settings.database_url_sync)
print("[*] Loading panels...", flush=True)
price_panel, centrality_panel, embedding_panel = load_all_panels(engine)
fundamentals_panel = _load_fundamentals_panel(engine)
snap_dates = centrality_panel.sorted_dates
fsi_by_date = _load_fsi_by_date(snap_dates, engine)
engine.dispose()

n_emb_tickers = len({k[0] for k in embedding_panel.by_pair.keys()}) \
if hasattr(embedding_panel, "by_pair") else "?"
print(
f"[*] Snapshots: {len(snap_dates)} "
f"({snap_dates[0]} → {snap_dates[-1]})", flush=True,
)
print(f"[*] Factor: {FACTOR}", flush=True)

t_by_horizon: dict[int, float] = {}
per_horizon_records: dict[int, tuple[list[float], list[date], FactorBacktest]] = {}

for horizon_days in HORIZONS:
fwd = _compute_forward_returns(
snap_dates, price_panel, forward_days=horizon_days
)
n_pairs = sum(len(v) for v in fwd.values())
print(
f"\n[*] horizon={horizon_days}d — forward return pairs: {n_pairs}",
flush=True,
)

ics, sizes, periods = compute_factor_ics(
FACTOR,
snap_dates,
fwd,
price_panel,
centrality_panel,
embedding_panel,
fundamentals_panel=fundamentals_panel,
)
r = _aggregate(FACTOR, ics, sizes, periods)
per_horizon_records[horizon_days] = (ics, periods, r)

print(
f" {'factor':38s} {'N':>4s} {'mean_IC':>8s} "
f"{'std_IC':>8s} {'t_stat':>7s} {'avg_xs':>7s} period"
)
print(" " + "-" * 96)
period_s = (
f"{r.first_period} → {r.last_period}"
if r.first_period and r.last_period else "(insufficient)"
)
print(
f" {r.name:38s} {r.n_periods:>4d} {r.mean_ic:>+8.4f} "
f"{r.std_ic:>8.4f} {r.t_stat:>+7.3f} {r.avg_xs_size:>7.1f} {period_s}"
)
if math.isfinite(r.t_stat):
t_by_horizon[horizon_days] = r.t_stat
df_for_t = max(r.n_periods - 1, 1)
print(f"\n HLZ correction (df={df_for_t})")
hlz_out = hlz_correct({FACTOR: r.t_stat}, df=df_for_t)
print("\n" + format_table(hlz_out))

if math.isfinite(r.t_stat) and abs(r.t_stat) > 1.5:
print("\n Regime split (|t| > 1.5):")
_regime_split(periods, ics, fsi_by_date)
print("\n Sub-window thirds:")
_subwindow_thirds(periods, ics)

print(f"\n{'=' * 96}")
print(" Phase 9 Session 2 — Decision matrix verdict")
print(f"{'=' * 96}")
print(
f" {'factor':38s} {'t@21d':>7s} {'t@63d':>7s} "
f"{'t@126d':>7s} verdict"
)
print(" " + "-" * 92)
cells = []
for h in HORIZONS:
t = t_by_horizon.get(h, float("nan"))
cells.append(f"{t:>+7.3f}" if math.isfinite(t) else " nan")
finite = [t for t in t_by_horizon.values() if math.isfinite(t)]
if not finite:
verdict = "INSUFFICIENT_DATA"
else:
abs_max = max(abs(t) for t in finite)
if abs_max > 3.0:
verdict = "RECOMMEND_REGISTER (|t|>3 at one horizon)"
elif abs_max >= 2.0:
verdict = "RECOMMEND_MONITOR (2<=|t|<=3)"
else:
verdict = "NULL (|t|<2 all horizons)"
print(f" {FACTOR:38s} {cells[0]} {cells[1]} {cells[2]} {verdict}")

print("\n[*] Done. No DB writes performed. signal_registry untouched.")


if __name__ == "__main__":
main()
Loading
Loading