Skip to content
Merged
1 change: 1 addition & 0 deletions PROGRESS.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ Phase docs live in `docs/progress/` (untracked, local only).
| 8 | Universe expansion Session 1 | COMPLETE — UNIVERSE 30 → **140** tickers; AMAT CIK bug fixed (was Adobe's `796343`, now `6951`); price_history backfilled to 2018-01-01; ANSS/JNPR/INFN excluded (acquired during window; yfinance dropped historical series); 60,567 → **283,150** rows | `docs/progress/phase_8.md` |
| 8 | Universe expansion Session 2 | COMPLETE — graph + filings re-ingestion on 140 tickers; supply_edges 247→**1,659**, ownership_edges 7,446→**34,843** (140/140 covered), board_edges 104→**326** (39/140), centrality_history 1,740→**8,120**; ISSUER_NAME_MAP 37→150 fragments + AMAT CIK flip leftover from Session 1; HGT embeddings deferred to Session 3 retrain | `docs/progress/phase_8.md` |
| 8 | Factor backtest Session 3 | COMPLETE — multi-horizon IC backtest on 140-ticker universe (21d/63d/126d); Phase 5 t-stats shown to be N=30 artifacts (momentum_12_1 126d: 4.149→1.506); graph_delta_eigenvector sign-flipped (t=−3.194 at 63d) → wired as centrality penalty overlay in portfolio.py; graph_customer_momentum definitively null (CLOSED); rolling_registry rebuilt (406 rows); **Phase 8 paper trader CAGR +5.97%, Sharpe 0.374, Max DD −36.68%** — degradation vs Phase 7A reflects accurate N=140 factor gates + extended cash periods | `docs/progress/phase_8.md` |
| 9 | EDGAR XBRL fundamentals Session 1 | COMPLETE — migration 0011 adds form_type/accession_number/UNIQUE; `form_xbrl.py` parser (filed≥end integrity invariant, 5 metrics, 4 revenue aliases); 74,662 rows / 137 of 140 tickers covered; 4 factors; **`fundamental_margin_compression` t=+4.834 at 126d — first NEXUS factor to PASS HLZ M=400 Bonferroni** (composer sign-flipped vs literature: compression = buy; CALM-regime t=+6.35; sub-window late-third t=+5.13 — strengthening, not decaying); registered status='approved' in signal_registry; rolling_registry refreshed (464 rows); ROA decayed (late-third t=-0.06) → NOT registered; rd_intensity / asset_growth null; **paper trader CAGR +8.72%, Sharpe 0.488, Max DD -32.68%** (vs Phase 8 baseline +5.97% / 0.374 / -36.68%) | `docs/progress/phase_9.md` |
50 changes: 50 additions & 0 deletions migrations/versions/0011_fundamentals_xbrl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Add form_type, accession_number, and UNIQUE constraint to fundamentals.

Revision ID: 0011
Revises: 0010
Create Date: 2026-05-17

Phase 9 Session 1: EDGAR XBRL fundamental data layer. The fundamentals
table itself was created in 0001 but lacked the provenance columns
required to dedupe across restatements. accession_number is the SEC
filing identifier; form_type distinguishes 10-K (annual) from 10-Q
(quarterly) facts. The UNIQUE constraint keys on accession_number so
restated values (10-K/A) are kept as separate rows — point-in-time
scoring selects the latest filed value at decision date.
"""
from collections.abc import Sequence
from typing import Union

import sqlalchemy as sa
from alembic import op

# Alembic revision identifiers: this migration is "0011" and applies on top
# of "0010". It declares no branch labels and no cross-revision dependencies.
revision: str = "0011"
down_revision: Union[str, Sequence[str], None] = "0010"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the provenance columns to ``fundamentals``, then the UNIQUE constraint.

    The columns are added first because the constraint references
    ``accession_number``.
    """
    table = "fundamentals"
    # Both columns are nullable, so the ALTER does not demand values for rows
    # that already exist in the table.
    provenance_columns = (
        sa.Column("form_type", sa.String(16), nullable=True),
        sa.Column("accession_number", sa.String(25), nullable=True),
    )
    for column in provenance_columns:
        op.add_column(table, column)

    # NOTE(review): accession_number is nullable, and many databases treat
    # NULLs as distinct inside a UNIQUE constraint — rows with a NULL
    # accession would then not be deduplicated by this key. Confirm that is
    # intended for rows that pre-date this migration.
    op.create_unique_constraint(
        "uq_fundamentals_company_period_metric_accession",
        table,
        ["company_id", "period_end_date", "metric", "accession_number"],
    )


def downgrade() -> None:
    """Remove the UNIQUE constraint and then the two provenance columns.

    The constraint goes first because it references ``accession_number``.
    """
    table = "fundamentals"
    op.drop_constraint(
        "uq_fundamentals_company_period_metric_accession",
        table,
        type_="unique",
    )
    # Columns are dropped in the reverse of the order they were added.
    for column_name in ("accession_number", "form_type"):
        op.drop_column(table, column_name)
9 changes: 9 additions & 0 deletions nexus/data/edgar/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ def get_company_submissions(self, cik: str) -> dict:
padded = cik.zfill(10)
return self._get(f"{_BASE}/submissions/CIK{padded}.json").json()

def get_company_facts(self, cik: str) -> dict:
    """Fetch and decode the EDGAR XBRL companyfacts JSON for ``cik``.

    The CIK is left-padded with zeros to 10 characters before it is placed
    in the request URL.

    Raises requests.HTTPError on 404 — callers must handle missing-XBRL
    filers (foreign 20-F filers, recent IPOs without filings).
    """
    url = f"{_BASE}/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json"
    response = self._get(url)
    return response.json()

def get_recent_filings(self, cik: str, form_type: str, limit: int = 5) -> list[FilingRecord]:
"""
Return up to `limit` most recent filings of a given form type for a CIK.
Expand Down
129 changes: 129 additions & 0 deletions nexus/data/edgar/forms/form_xbrl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""EDGAR XBRL companyfacts fetcher + parser.

Two-layer module to keep IO and parsing separable for tests:

- ``fetch_company_facts`` does the network round-trip and caches raw JSON
to ``data/cache/xbrl/CIK{cik:010d}.json``. Returns ``None`` on 404 so
callers can mark a CIK as "no XBRL" (foreign 20-F filers, recent IPOs).

- ``parse_facts`` is pure: dict in, list[FundamentalFact] out.

Point-in-time integrity
-----------------------
``FundamentalFact.available_as_of_date`` is always the XBRL ``filed`` field
— the SEC filing date for that specific fact. It is NEVER the ``end`` field
(period end date). Using ``end`` would back-date the availability of values
that were only published months later, the canonical look-ahead-bias bug
for fundamental data.
"""
from __future__ import annotations

import json
from dataclasses import dataclass
from datetime import date
from pathlib import Path

import requests

from nexus.data.edgar.client import EDGARClient

# Directory for cached raw companyfacts JSON. The path is relative, so it is
# resolved against the process working directory.
CACHE_DIR = Path("data/cache/xbrl")

# Canonical metric name → XBRL US-GAAP concept identifiers tried in order.
# Revenue is reported under several different tags depending on the filer and
# the era, so it lists multiple aliases in priority order.
CONCEPT_MAP: dict[str, list[str]] = {
    "gross_profit": ["GrossProfit"],
    "revenue": [
        "Revenues",
        "RevenueFromContractWithCustomerExcludingAssessedTax",
        # Post-ASC 606 variant used by some filers (COHU, CRWD, VECO).
        "RevenueFromContractWithCustomerIncludingAssessedTax",
        # Legacy pre-2018 tag still used by some filings.
        "SalesRevenueNet",
    ],
    "rd_expense": ["ResearchAndDevelopmentExpense"],
    "total_assets": ["Assets"],
    "operating_income": ["OperatingIncomeLoss"],
}

# Annual + quarterly only. Amendments (10-K/A, 10-Q/A) are accepted alongside
# the originals for point-in-time scoring: each carries its own filed date and
# accession, and the parser keeps it as a distinct row from the original filing.
ALLOWED_FORMS = frozenset({"10-K", "10-K/A", "10-Q", "10-Q/A"})

# Only monetary USD facts are extracted. Per-share metrics and ratios are
# out of scope for Phase 9 Session 1.
_ALLOWED_UNITS = frozenset({"USD"})


@dataclass(frozen=True)
class FundamentalFact:
    """One reported fundamental value together with its filing provenance.

    Instances are immutable (``frozen=True``), which also makes them hashable.
    """

    # Canonical metric name (e.g. "revenue"), not the raw XBRL concept tag.
    metric: str
    # End date of the reporting period the value describes (XBRL ``end``).
    period_end_date: date
    # Date the value became public (XBRL ``filed``); never the period end.
    available_as_of_date: date
    # Reported amount (XBRL ``val``) converted to float.
    value: float
    # SEC form the fact was reported on (XBRL ``form``), e.g. "10-K".
    form_type: str
    # SEC accession number of the filing that carried the fact (XBRL ``accn``).
    accession_number: str


def fetch_company_facts(
    client: EDGARClient, cik: str, use_cache: bool = True
) -> dict | None:
    """Fetch companyfacts JSON, caching to ``CACHE_DIR``. Returns ``None`` on 404.

    Args:
        client: EDGAR client used for the network round-trip on a cache miss.
        cik: Company CIK; zero-padded to 10 characters for the cache filename.
        use_cache: When true, a readable cached file is returned without
            touching the network. When false the document is always
            refetched, and the cache file is overwritten with the fresh copy.

    Returns:
        The decoded companyfacts dict, or ``None`` when EDGAR answers 404.
        A 404 is not cached.

    Raises:
        requests.HTTPError: for any HTTP error other than 404.
    """
    padded = cik.zfill(10)
    cache_path = CACHE_DIR / f"CIK{padded}.json"
    if use_cache and cache_path.exists():
        try:
            return json.loads(cache_path.read_text(encoding="utf-8"))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            # A truncated or corrupt cache file (e.g. from an interrupted
            # write) is treated as a cache miss and refetched below, rather
            # than failing every future call for this CIK.
            pass
    try:
        data = client.get_company_facts(cik)
    except requests.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            return None
        raise
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place so a crash
    # mid-write can never leave a half-written JSON file at cache_path.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_text(json.dumps(data), encoding="utf-8")
    tmp_path.replace(cache_path)
    return data


def parse_facts(data: dict) -> list[FundamentalFact]:
    """Extract FundamentalFacts from a companyfacts dict. Pure function.

    For each canonical metric, every alias listed in ``CONCEPT_MAP`` is
    scanned in priority order and the results are merged. Filers change tags
    over time, so stopping at the first alias present would drop every fact
    reported under a later alias. When two aliases report the same
    (period end, accession) pair, the higher-priority alias wins and the
    lower-priority duplicate is skipped.

    Entries are dropped when they are not in an allowed unit or form, when a
    required field is missing or malformed, or when ``filed`` precedes
    ``end``.

    Args:
        data: Decoded companyfacts JSON. A missing or null ``facts`` /
            ``us-gaap`` section yields an empty list.

    Returns:
        Facts grouped by metric in ``CONCEPT_MAP`` order; within a metric,
        facts from higher-priority aliases come first.
    """
    out: list[FundamentalFact] = []
    # `or {}` also covers an explicit null section, not just a missing key.
    facts_root = (data.get("facts") or {}).get("us-gaap") or {}
    for metric, concepts in CONCEPT_MAP.items():
        # (period_end, accession) pairs already supplied by a higher-priority
        # alias for this metric.
        claimed: set[tuple[date, str]] = set()
        for concept in concepts:
            concept_obj = facts_root.get(concept)
            if not concept_obj:
                continue
            # Collected separately and merged after the alias is finished, so
            # de-duplication only applies ACROSS aliases; repeated entries
            # within one alias are emitted as-is.
            # NOTE(review): one concept can presumably carry several entries
            # for the same (end, accn) with different `start` values (e.g.
            # quarter vs year-to-date) — confirm how callers disambiguate.
            claimed_here: set[tuple[date, str]] = set()
            for unit, entries in concept_obj.get("units", {}).items():
                if unit not in _ALLOWED_UNITS:
                    continue
                for entry in entries:
                    form = entry.get("form")
                    if form not in ALLOWED_FORMS:
                        continue
                    try:
                        end = date.fromisoformat(entry["end"])
                        filed = date.fromisoformat(entry["filed"])
                        value = float(entry["val"])
                        accn = str(entry["accn"])
                    except (KeyError, ValueError, TypeError):
                        continue
                    # Integrity invariant: a fact about period ending at `end`
                    # cannot have been filed before that period closed. Observed
                    # in the wild on a 2009 ROP 10-Q where an Assets fact was
                    # tagged end=2009-12-31 in a filing dated 2009-11-02 — an
                    # SEC-side data error that would corrupt point-in-time scoring.
                    if filed < end:
                        continue
                    key = (end, accn)
                    if key in claimed:
                        continue
                    claimed_here.add(key)
                    out.append(FundamentalFact(
                        metric=metric,
                        period_end_date=end,
                        available_as_of_date=filed,
                        value=value,
                        form_type=form,
                        accession_number=accn,
                    ))
            claimed |= claimed_here
    return out
8 changes: 6 additions & 2 deletions nexus/execution/paper_trader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,11 @@
_factor_xs_dispatch,
_load_centrality_panel,
_load_embedding_panel,
_load_fundamentals_panel,
_load_price_panel,
_CentralityPanel,
_EmbeddingPanel,
_FundamentalsPanel,
_PricePanel,
)

Expand Down Expand Up @@ -142,13 +144,14 @@ def _factor_panel(
price: _PricePanel,
cent: _CentralityPanel,
emb: _EmbeddingPanel,
fund: _FundamentalsPanel | None = None,
) -> pl.DataFrame:
"""Wide z-score panel: rows = tickers, cols = factor names + 'ticker'.

Cross-sectional z-score within each factor; factors with zero or near-zero
std at this snapshot contribute zeros (they cannot rank anything).
"""
dispatch = _factor_xs_dispatch()
dispatch = _factor_xs_dispatch(fundamentals_panel=fund)
dispatch.pop("graph_gnn_embedding_drift", None) # stale N=30 embeddings; 110 new tickers would get z=0.0
cols: dict[str, dict[str, float]] = {}
all_tickers: set[str] = set()
Expand Down Expand Up @@ -399,6 +402,7 @@ def run_backtest(
price = _load_price_panel(engine)
cent = _load_centrality_panel(engine)
emb = _load_embedding_panel(engine)
fund = _load_fundamentals_panel(engine)
crowding_by_q = _load_crowding(engine)

rebalance_grid = _rebalance_dates(cent, start, end)
Expand All @@ -424,7 +428,7 @@ def run_backtest(

for d in trading_dates:
if d in rebalance_set:
z_panel = _factor_panel(d, price, cent, emb)
z_panel = _factor_panel(d, price, cent, emb, fund=fund)
if z_panel.height > 0:
records = load_factor_records(as_of=d, engine=engine)
weights_by_factor = compute_factor_weights(records)
Expand Down
Loading
Loading