diff --git a/.gitignore b/.gitignore index 3858e46..2e4027e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,13 +12,15 @@ user_data/* !user_data/models !user_data/freqaimodels !user_data/data/ -!user_data/run_custom.py # Downloaded data (from Google Drive) and synthetic test data — not tracked in git user_data/data/usstock/ user_data/data/polymarket/ user_data/data/portfoliobench/ # Generated ML artifacts (regenerable via scripts/prepare_event_model.py) user_data/data/polymarket_ml/ +user_data/data/polymarket_ml_real/ +# Report / thesis drafts +report/ # Local development data, model checkpoints, and notebooks mycode/ # Virtual environment @@ -130,6 +132,3 @@ target/ !config_examples/config_freqai.example.json docker-compose-*.yml - -# Workflow results (regenerable via make e2e-*) -results/ diff --git a/docs/polymarket/README.md b/docs/polymarket/README.md index 0f16858..3565f93 100644 --- a/docs/polymarket/README.md +++ b/docs/polymarket/README.md @@ -127,10 +127,11 @@ Run the preparation script to rebuild everything from the raw BTC price series: python scripts/prepare_event_model.py ``` -This executes three steps automatically: +This executes four steps automatically: | Step | What it does | Output | |---|---|---| +| 0 — Build feather files | Generates OHLCV feather files per contract (synthetic by default; see [Using Real Data](#using-real-polymarket-data)) | `*.feather` files in `--output-dir` | | 1 — Build training data | Constructs synthetic weekly BTC binary events from `data_1h.csv` | `event_model_training.parquet` | | 2 — Train model | Fits and calibrates a logistic regression on the training data | `event_model.pkl` | | 3 — Generate predictions | Runs the model at every hourly bar before each contract's expiry | Per-contract `*-event_probs.csv` files | @@ -151,13 +152,63 @@ python scripts/prepare_event_model.py \ | Flag | Default | Description | |---|---|---| | `--btc-csv` | `mycode/data/data_1h.csv` | Path to hourly BTC OHLCV CSV | -| `--contracts` | `user_data/data/polymarket_contracts/jan20.jsonl` | Contract metadata JSONL | +| `--contracts` | `user_data/data/polymarket_contracts/jan20.jsonl` | Contract metadata JSONL (ignored when `--use-real-data` is set) | | `--output-dir` | `user_data/data/polymarket_ml` | Where all artefacts are written | | `--start-date` | `2018-01-01` | Earliest settlement date for training samples | | `--end-date` | `2025-06-01` | Latest settlement date (exclusive) | | `--val-cutoff` | `2024-01-01` | Events before this date → training; after → validation | | `--model-type` | `logistic` | `logistic` (default) or `xgboost` | +| `--skip-feathers` | off | Skip step 0 if feather files already exist in `--output-dir` | | `--skip-training-data` | off | Skip step 1 if the Parquet already exists | +| `--use-real-data` | off | Step 0: build feathers from real Polymarket trade data instead of synthetic prices | +| `--parquet-path` | `mycode/data/combined_filtered_data.paquet` | Path to the Polymarket trade-history parquet (only used with `--use-real-data`) | + +### Using Real Polymarket Data + +By default, step 0 generates **synthetic** OHLCV price series for each contract. If you +have a Polymarket trade-history parquet file, you can use real market prices instead: + +```bash +python scripts/prepare_event_model.py \ + --use-real-data \ + --parquet-path mycode/data/combined_filtered_data.paquet \ + --output-dir user_data/data/polymarket_ml_real +``` + +This runs `polymarket/real_data_builder.py` which: + +1. 
Loads the parquet and filters to BTC/Bitcoin YES-side rows. +2. Parses each market's strike (`$88K`, `$90,000`, etc.) and expiry date from the + `question` text. +3. Checks that the last 7-day window has at least 60% hourly coverage (≥ 101 of 168 + candles). +4. Forward-fills gaps of up to 6 consecutive hours; excludes contracts with longer gaps. +5. Writes a freqtrade-compatible `*.feather` file for each qualifying contract. +6. Serialises all parsed contracts to `real_contracts.jsonl` in the output directory. + +The event model and downstream prediction steps (steps 1–3) are unchanged — they work +with the same feather files regardless of whether they were generated synthetically or +from real data. + +**Parquet format requirements:** + +| Column | Type | Description | +|---|---|---| +| `timestamp` | datetime (UTC) | Candle open time | +| `condition_id` | string | Market identifier | +| `side` | string | `"Yes"` or `"No"` — only YES-side rows are used | +| `open`, `high`, `low`, `close` | string-encoded float | OHLC prices (0–1 range) | +| `volume` | int64 | Traded volume | +| `question` | string | Human-readable market question | + +**Coverage thresholds:** + +| Gap length | Treatment | +|---|---| +| 1–6 hours | Forward-filled (price unchanged) | +| 7–23 hours | Forward-filled, but a `WARNING` is logged | +| ≥ 24 hours consecutively | Contract excluded (price too stale) | +| < 60% of window rows present | Contract excluded | For the full training reference — feature engineering, label construction, temporal splits, calibration, and inference — see [training-guide.md](training-guide.md). @@ -194,6 +245,37 @@ At each hourly candle, the strategy: To override parameters without modifying the strategy file, subclass `DualModelPolymarketPortfolio` and set the class attributes. +### Configurable data paths (via config JSON) + +Two file paths used by the strategy can be overridden through the freqtrade config JSON +without touching the strategy source: + +| Config key | Default | Description | +|---|---|---| +| `contracts_jsonl` | `user_data/data/polymarket_contracts/jan20.jsonl` | Path to the contract metadata JSONL file; relative paths are resolved against `user_data/` | +| `predictions_dir` | `user_data/data/polymarket_ml` | Directory containing per-contract `*-event_probs.csv` files; relative paths are resolved against `user_data/` | + +Example config for a real-data backtest: + +```json +{ + "contracts_jsonl": "data/polymarket_ml_real/real_contracts.jsonl", + "predictions_dir": "data/polymarket_ml_real", + "exchange": { + "name": "polymarket", + "pair_whitelist": [ + "BTCABOVE108K-SEP5-YES/USDT", + "BTCABOVE110K-SEP5-YES/USDT" + ] + } +} +``` + +Contracts loaded from `real_contracts.jsonl` (produced by `--use-real-data`) may +contain question patterns that the minimal regex in the synthetic pipeline did not +support (e.g. "reach $108K"). The loader automatically skips unparseable lines with a +warning, so a mixed JSONL file will not crash the strategy. + ### Position sizing (Kelly formula) For a contract priced at `p_market` and model probability `p_model`: @@ -358,8 +440,21 @@ are: ``` The `load_contracts()` function extracts the strike and direction from the `question` -field using a regex match for `"above $X,XXX"` or `"below $X,XXX"`. Settlement is -determined from `outcomePrices`. +field. Supported question patterns include: + +| Pattern | Direction | Example | +|---|---|---| +| `above $X,XXX` / `above $XXK` | above | "Bitcoin above $90,000 on January 20?" 
| +| `reach/hit/exceed/surpass $X,XXX` | above | "Will Bitcoin reach $120,000 by December?" | +| `below $X,XXX` / `less than $X,XXX` | below | "Bitcoin below $80,000 on March 1?" | +| `dips to $X,XXX` | below | "Will Bitcoin dip to $70K in October?" | + +Strikes can be written with or without a `K`/`k` suffix (e.g. `$88K` = `$88,000`). +Settlement is determined from `outcomePrices`. + +Lines whose question cannot be parsed are skipped with a warning (see `skip_unparseable` +in `load_contracts`) rather than raising an error, so a JSONL file with mixed contract +types will not crash the strategy. Place the file at `user_data/data/polymarket_contracts/.jsonl` and update `--contracts` in the prepare script. @@ -384,13 +479,17 @@ Timestamp,Open,High,Low,Close,Volume - Optional on-chain columns: `mvrv`, `hash-rate`, `difficulty` (omit if unavailable; the corresponding features will be NaN and effectively ignored by the model). -### 3 — Feather files (synthetic OHLCV) +### 3 — Feather files (synthetic or real OHLCV) + +The backtester requires an OHLCV feather file for each pair. Two sources are supported: -The backtester requires a synthetic OHLCV feather file for each pair. These are -generated by the existing `build_all_feathers()` function. They are **not** real -Polymarket quotes — they are synthetic price series used solely to satisfy freqtrade's -data loading requirements. The strategy ignores OHLCV values other than `close` for -market price. +- **Synthetic (default):** generated by `build_all_feathers()` from `data_builder.py`. + These are modelled price series used solely to satisfy freqtrade's data loading + requirements. The strategy ignores OHLCV values other than `close` for market price. +- **Real Polymarket data:** generated by `build_all_feathers_from_parquet()` from + `real_data_builder.py` (requires a trade-history parquet). These contain actual + historical Polymarket prices from the YES-side order book. Use `--use-real-data` in + `prepare_event_model.py` to build these instead of synthetic files. ### 4 — Per-contract predictions diff --git a/docs/polymarket/training-guide.md b/docs/polymarket/training-guide.md index 80edb5f..8362704 100644 --- a/docs/polymarket/training-guide.md +++ b/docs/polymarket/training-guide.md @@ -25,6 +25,21 @@ start. ## 1. Architecture Overview ``` + ┌─────────────────────────────────────────────┐ + │ Step 0: OHLCV feather files per contract │ + │ │ + │ Option A (default — synthetic): │ + │ build_all_feathers() (data_builder.py) │ + │ • Log-normal price simulation │ + │ │ + │ Option B (--use-real-data): │ + │ build_all_feathers_from_parquet() │ + │ (real_data_builder.py) │ + │ • Real Polymarket YES-side prices │ + │ • Forward-fill gaps ≤ 6h │ + │ • Writes real_contracts.jsonl │ + └─────────────────────────────────────────────┘ + data_1h.csv (BTC hourly OHLCV, 2018–present) │ ▼ @@ -119,11 +134,24 @@ One JSON object per line. Required fields per contract: ``` **Field notes:** -- `question`: must contain `"above $X,XXX"` or `"below $X,XXX"` for strike extraction. +- `question`: used for strike and direction extraction. Supported patterns: + - `"above $X,XXX"` / `"above $XXK"` — YES if BTC > K at expiry + - `"reach/hit/exceed/surpass $X,XXX"` — treated as "above" + - `"below $X,XXX"` / `"less than $X,XXX"` / `"dips to $X,XXX"` — YES if BTC < K + - K-suffix supported: `$88K` is parsed as `$88,000` + - Lines whose `question` cannot be parsed are skipped with a warning when + `skip_unparseable=True`; they raise `ValueError` by default. 
- `outcomePrices`: JSON-encoded array `["YES_price", "NO_price"]`. `"1"` = that outcome won. Used to determine `settlement` (1.0 or 0.0). - `endDate`: ISO-8601 UTC, determines the settlement timestamp `T`. +**Alternative: auto-generated JSONL from real data** + +When using `--use-real-data` in `prepare_event_model.py`, the JSONL file is generated +automatically by `real_data_builder.write_contracts_jsonl()` from the parquet. The +output `real_contracts.jsonl` uses the same schema and can be loaded by the strategy +via the `contracts_jsonl` config key. + --- ## 3. Feature Engineering @@ -482,12 +510,21 @@ used during fitting. It does not limit when the model can be deployed. After running `prepare_event_model.py`, the following files are written to `user_data/data/polymarket_ml/`: +**Synthetic mode (default):** + | File | Format | Description | |---|---|---| | `event_model_training.parquet` | Parquet | 363K+ training samples; all 15 features + label, K, T | | `event_model.pkl` | joblib pickle | Model package dict (see below) | | `{pair}-event_probs.csv` | CSV | Per-contract fair-value predictions; columns: `dt_utc`, `fair_value` | -| `{pair}.feather` | Feather | Synthetic OHLCV for backtester (generated by `build_all_feathers`) | +| `{pair}-1h.feather` | Feather | Synthetic OHLCV for backtester (generated by `build_all_feathers`) | + +**Real-data mode (`--use-real-data`):** Same artefacts, plus: + +| File | Format | Description | +|---|---|---| +| `{pair}-1h.feather` | Feather | Real Polymarket YES-side OHLCV, 7-day window before expiry | +| `real_contracts.jsonl` | JSONL | Auto-generated contract metadata; compatible with `load_contracts()` | ### Model package structure diff --git a/freqtrade b/freqtrade index ab093ff..5fb0011 160000 --- a/freqtrade +++ b/freqtrade @@ -1 +1 @@ -Subproject commit ab093ff0e1af445f0b8491ea1168c46e1a51b2c0 +Subproject commit 5fb00116889f8d1d67c817f8aa1eaeecfbbfbfbe diff --git a/polymarket/contracts.py b/polymarket/contracts.py index 6ad6bec..5a6a940 100644 --- a/polymarket/contracts.py +++ b/polymarket/contracts.py @@ -19,10 +19,14 @@ from __future__ import annotations import json +import logging import re from dataclasses import dataclass, field from pathlib import Path +logger = logging.getLogger(__name__) + + # --------------------------------------------------------------------------- # Pair naming helpers # --------------------------------------------------------------------------- @@ -100,26 +104,44 @@ class ContractMetadata: # Strike / direction extraction # --------------------------------------------------------------------------- -# Matches "above $90,000", "above $90k", "less than $84,000", etc. -_ABOVE_RE = re.compile(r"above\s+\$([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE) -_BELOW_RE = re.compile(r"(?:less than|below)\s+\$([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE) +# Matches "above $90,000", "above $90K", "reach $120,000", "less than $84K", etc. +# Group 1: numeric part; Group 2: optional K/k suffix. +_ABOVE_RE = re.compile( + r"\b(?:above|reach(?:es)?|hit(?:s)?|exceed(?:s)?|surpass(?:es)?)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)", + re.IGNORECASE, +) +_BELOW_RE = re.compile( + r"\b(?:less\s+than|below|dip(?:s)?\s+to)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)", + re.IGNORECASE, +) def _parse_strike_direction(question: str) -> tuple[float, str]: """Extract (strike, direction) from a Polymarket question string. 
+ Handles: + - "above $90,000" / "above $90K" + - "reach/hit/exceed/surpass $120,000" (treated as 'above') + - "less than/below/dips to $84,000" (treated as 'below') + Raises: ValueError: If neither 'above' nor 'below/less than' is found. """ m = _ABOVE_RE.search(question) if m: - strike = float(m.group(1).replace(",", "")) - return strike, "above" + raw = m.group(1).replace(",", "") + has_k = m.group(2) is not None + strike = float(raw) * (1_000 if has_k else 1) + if strike >= 1_000: + return strike, "above" m = _BELOW_RE.search(question) if m: - strike = float(m.group(1).replace(",", "")) - return strike, "below" + raw = m.group(1).replace(",", "") + has_k = m.group(2) is not None + strike = float(raw) * (1_000 if has_k else 1) + if strike >= 1_000: + return strike, "below" raise ValueError(f"Cannot parse strike/direction from question: {question!r}") @@ -149,24 +171,34 @@ def _settlement_from_outcome_prices(outcome_prices_json: str) -> float: # Public loader # --------------------------------------------------------------------------- -def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]: +def load_contracts( + jsonl_path: str | Path, + *, + skip_unparseable: bool = False, +) -> list[ContractMetadata]: """Parse a Polymarket JSONL snapshot into a list of :class:`ContractMetadata`. Args: - jsonl_path: Path to the ``.jsonl`` file (one JSON object per line). + jsonl_path: Path to the ``.jsonl`` file (one JSON object per line). + skip_unparseable: When ``True``, log a warning and skip lines whose + question text cannot be parsed rather than raising. + Useful when loading ``real_contracts.jsonl`` which may + contain exotic question patterns. Returns: List of :class:`ContractMetadata`, sorted by strike ascending. Raises: FileNotFoundError: If ``jsonl_path`` does not exist. - ValueError: If a line cannot be parsed. + ValueError: If a line cannot be parsed and ``skip_unparseable`` is + ``False``. """ path = Path(jsonl_path) if not path.exists(): raise FileNotFoundError(f"Contract metadata file not found: {path}") contracts: list[ContractMetadata] = [] + skipped = 0 with path.open() as fh: for lineno, line in enumerate(fh, start=1): @@ -182,6 +214,10 @@ def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]: try: strike, direction = _parse_strike_direction(d["question"]) except ValueError as exc: + if skip_unparseable: + logger.debug("Skipping line %d (%s): %s", lineno, d.get("slug", "?"), exc) + skipped += 1 + continue raise ValueError(f"Line {lineno} ({d.get('slug', '?')}): {exc}") from exc settlement = _settlement_from_outcome_prices(d["outcomePrices"]) @@ -208,5 +244,8 @@ def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]: ) ) + if skipped: + logger.info("load_contracts: skipped %d unparseable lines in %s", skipped, path.name) + contracts.sort(key=lambda c: c.strike) return contracts diff --git a/polymarket/real_data_builder.py b/polymarket/real_data_builder.py new file mode 100644 index 0000000..8b66cfc --- /dev/null +++ b/polymarket/real_data_builder.py @@ -0,0 +1,467 @@ +"""Build backtesting feather files from real Polymarket trade data. + +This module replaces the synthetic log-normal price generator +(:mod:`polymarket.synthetic_prices`) with real hourly OHLCV data extracted +from a Polymarket trade-history parquet file. + +The parquet file is expected to have the following columns: + + timestamp – datetime (UTC-aware) + condition_id – market identifier string + side – outcome label, e.g. 
``"Yes"`` / ``"No"`` + open – string-encoded float price + high – string-encoded float price + low – string-encoded float price + close – string-encoded float price + volume – int64 traded volume + question – human-readable market question + +Main entry points +----------------- +parse_btc_contracts(df) + Scan all rows for BTC binary markets and return a list of parsed + :class:`~polymarket.contracts.ContractMetadata` objects. + +build_feather_from_real_data(df, contract, output_dir) + Extract the 7-day window before expiry for one contract, forward-fill + gaps, and write a freqtrade-compatible feather file. + +build_all_feathers_from_parquet(parquet_path, output_dir, ...) + End-to-end convenience wrapper: load the parquet, parse contracts, + build feathers for all qualifying markets. +""" + +from __future__ import annotations + +import logging +import re +from pathlib import Path +from typing import Sequence + +import pandas as pd +import pyarrow.feather as feather + +from polymarket.contracts import ContractMetadata, _make_pair + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Backtesting window: last N hours before expiry +WINDOW_HOURS: int = 168 # 7 days + +# Polymarket resolves BTC contracts at noon Eastern = 17:00 UTC +RESOLUTION_HOUR_UTC: int = 17 + +# Forward-fill limit: gaps longer than this are left as NaN and the contract +# is flagged as "sparse" (but still usable if overall coverage ≥ MIN_COVERAGE) +MAX_FFILL_HOURS: int = 6 + +# Minimum fraction of hourly candles required in the window +MIN_COVERAGE: float = 0.60 # 60% = at least 101 of 168 hours + +# --------------------------------------------------------------------------- +# Question parsing +# --------------------------------------------------------------------------- + +# "above $88,000" / "above $88K" / "above $88k" +# Group 1: digits, Group 2: optional K/k suffix +_ABOVE_RE = re.compile(r"\babove\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)", re.I) +# "below $84,000" / "less than $84K" / "dip to $84,000" +_BELOW_RE = re.compile( + r"\b(?:below|less than|dip(?:s)?\s+to)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)", re.I +) +# "reach $150,000" / "reach $100k" / "hit $100k" / "exceed $90,000" +_REACH_RE = re.compile( + r"\b(?:reach|hit|exceed|surpass)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)", re.I +) + +# Full month names and 3-letter abbreviations +_MONTH_MAP: dict[str, int] = { + "january": 1, "february": 2, "march": 3, "april": 4, + "may": 5, "june": 6, "july": 7, "august": 8, + "september": 9, "october": 10, "november": 11, "december": 12, + "jan": 1, "feb": 2, "mar": 3, "apr": 4, + "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12, +} + +_DATE_RE = re.compile( + r"\b(january|february|march|april|may|june|july|august|" + r"september|october|november|december|" + r"jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)" + r"\s+(\d{1,2})(?:st|nd|rd|th)?(?:[,\s]+(\d{4}))?", + re.I, +) + +_YEAR_RE = re.compile(r"\b(20\d{2})\b") + + +def _parse_strike_direction(question: str) -> tuple[float, str] | None: + """Return (strike_usd, direction) or None if unparseable.""" + for pattern, direction in ( + (_ABOVE_RE, "above"), + (_REACH_RE, "above"), # "reach $X" treated as above (YES if BTC > X) + (_BELOW_RE, "below"), + ): + m = pattern.search(question) + if m: + raw = m.group(1).replace(",", "") + has_k = m.group(2) is not None # True when K/k 
suffix was captured + strike = float(raw) * (1_000 if has_k else 1) + # Skip implausible BTC prices (< $1,000 without K suffix) + if strike < 1_000: + return None + return strike, direction + return None + + +def _parse_expiry(question: str, last_timestamp: pd.Timestamp) -> str | None: + """Return ISO-8601 UTC expiry string or None if unparseable. + + Resolution time is always 17:00 UTC (Polymarket noon ET rule). + If the year is absent from the question we infer it from last_timestamp. + """ + m = _DATE_RE.search(question) + if not m: + return None + + month_str, day_str, year_str = m.group(1), m.group(2), m.group(3) + month = _MONTH_MAP.get(month_str.lower()) + if month is None: + return None + day = int(day_str) + + if year_str: + year = int(year_str) + try: + expiry = pd.Timestamp(year=year, month=month, day=day, + hour=RESOLUTION_HOUR_UTC, tz="UTC") + except ValueError: + return None + return expiry.strftime("%Y-%m-%dT%H:%M:%SZ") + + # No year in question — try years from last_timestamp.year down to -2. + # Pick the latest year whose parsed date is ≤ last_timestamp + 3 days + # (small slack for contracts whose last candle is just after expiry). + slack = pd.Timedelta(days=3) + expiry = None + for y in range(last_timestamp.year, last_timestamp.year - 3, -1): + try: + candidate = pd.Timestamp(year=y, month=month, day=day, + hour=RESOLUTION_HOUR_UTC, tz="UTC") + except ValueError: + continue + if candidate <= last_timestamp + slack: + expiry = candidate + break + + if expiry is None: + return None + + return expiry.strftime("%Y-%m-%dT%H:%M:%SZ") + + +# --------------------------------------------------------------------------- +# Parquet scanning +# --------------------------------------------------------------------------- + +def parse_btc_contracts( + df: pd.DataFrame, + min_rows_last7d: int = int(WINDOW_HOURS * MIN_COVERAGE), +) -> list[ContractMetadata]: + """Scan the parquet DataFrame and return parseable BTC binary contracts. + + Args: + df: Full parquet DataFrame (all columns). + min_rows_last7d: Minimum rows in the last 7-day window to include. + + Returns: + List of :class:`~polymarket.contracts.ContractMetadata` objects, one + per qualifying (condition_id, YES-side) market. 
+ """ + # Keep only BTC/Bitcoin YES-side rows + btc_mask = ( + df["question"].str.contains("Bitcoin|BTC", case=False, na=False) + & (df["side"].str.strip().str.lower() == "yes") + ) + btc = df[btc_mask].copy() + btc["timestamp"] = pd.to_datetime(btc["timestamp"], utc=True) + + contracts: list[ContractMetadata] = [] + skipped = {"no_strike": 0, "no_date": 0, "sparse": 0} + + for cid, grp in btc.groupby("condition_id"): + question = grp["question"].iloc[0] + last_ts = grp["timestamp"].max() + + # Parse strike and direction + result = _parse_strike_direction(question) + if result is None: + skipped["no_strike"] += 1 + continue + strike, direction = result + + # Parse expiry + end_date_utc = _parse_expiry(question, last_ts) + if end_date_utc is None: + skipped["no_date"] += 1 + continue + + # Check coverage in the last 7-day window + window_start = last_ts - pd.Timedelta(hours=WINDOW_HOURS) + rows_in_window = (grp["timestamp"] >= window_start).sum() + if rows_in_window < min_rows_last7d: + skipped["sparse"] += 1 + continue + + # Determine settlement: final close ≥ 0.5 → YES won + final_close = pd.to_numeric(grp["close"], errors="coerce").dropna() + if final_close.empty: + skipped["sparse"] += 1 + continue + # Use the last candle's close as settlement proxy + settlement = 1.0 if float(final_close.iloc[-1]) >= 0.5 else 0.0 + + pair_yes = _make_pair(strike, direction, end_date_utc, "YES") + pair_no = _make_pair(strike, direction, end_date_utc, "NO") + + contracts.append( + ContractMetadata( + id=str(cid), + question=question, + slug=str(cid), + strike=strike, + direction=direction, + end_date_utc=end_date_utc, + start_date_utc=( + grp["timestamp"].min().strftime("%Y-%m-%dT%H:%M:%SZ") + ), + settlement=settlement, + volume_usd=float(grp["volume"].sum()), + pair_yes=pair_yes, + pair_no=pair_no, + raw={"condition_id": cid, "rows_in_window": int(rows_in_window)}, + ) + ) + + logger.info( + "parse_btc_contracts: %d contracts parsed, skipped %d (no_strike=%d, " + "no_date=%d, sparse=%d)", + len(contracts), sum(skipped.values()), + skipped["no_strike"], skipped["no_date"], skipped["sparse"], + ) + return contracts + + +# --------------------------------------------------------------------------- +# Feather builder +# --------------------------------------------------------------------------- + +def build_feather_from_real_data( + df: pd.DataFrame, + contract: ContractMetadata, + output_dir: str | Path, + *, + max_ffill_hours: int = MAX_FFILL_HOURS, +) -> Path: + """Extract real OHLCV for one contract and write a freqtrade feather file. + + The function: + + 1. Filters the parquet to the contract's ``condition_id`` and ``side="Yes"``. + 2. Selects the last :data:`WINDOW_HOURS` hours before expiry. + 3. Converts string prices to float and timestamps to millisecond integers. + 4. Reindexes to a full hourly grid and forward-fills gaps up to + ``max_ffill_hours`` consecutive missing candles. + 5. Writes ``{pair_yes}-1h.feather`` to ``output_dir``. + + Args: + df: Full parquet DataFrame. + contract: Parsed contract metadata. + output_dir: Directory to write the feather file. + max_ffill_hours: Maximum consecutive hours to forward-fill. + + Returns: + Path to the written feather file. 
+ """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + cid = contract.raw.get("condition_id", contract.id) + + # Filter to this contract's YES side + mask = ( + (df["condition_id"] == cid) + & (df["side"].str.strip().str.lower() == "yes") + ) + grp = df[mask].copy() + if grp.empty: + raise ValueError(f"No rows found for condition_id={cid!r}") + + grp["timestamp"] = pd.to_datetime(grp["timestamp"], utc=True) + + # 7-day window before expiry + expiry_ts = pd.Timestamp(contract.end_date_utc.replace("Z", "+00:00")) + window_start = expiry_ts - pd.Timedelta(hours=WINDOW_HOURS) + grp = grp[(grp["timestamp"] >= window_start) & (grp["timestamp"] < expiry_ts)] + + if grp.empty: + raise ValueError( + f"No rows in 7-day window for {contract.pair_yes} " + f"({window_start} – {expiry_ts})" + ) + + # Convert price strings to float + for col in ("open", "high", "low", "close"): + grp[col] = pd.to_numeric(grp[col], errors="coerce") + + grp = grp.set_index("timestamp").sort_index() + + # Reindex to full hourly grid + full_index = pd.date_range(start=window_start, end=expiry_ts - pd.Timedelta(hours=1), + freq="h", tz="UTC") + ohlcv = grp[["open", "high", "low", "close", "volume"]].reindex(full_index) + + # Forward-fill gaps up to max_ffill_hours + ohlcv = ohlcv.ffill(limit=max_ffill_hours) + + # Log coverage + n_filled = ohlcv["close"].notna().sum() + coverage = n_filled / len(ohlcv) + if coverage < MIN_COVERAGE: + logger.warning( + "%s: coverage %.0f%% is below %.0f%% threshold — feather written but " + "results may be unreliable", + contract.pair_yes, coverage * 100, MIN_COVERAGE * 100, + ) + else: + logger.info("%s: %.0f%% hourly coverage (%d/%d candles)", + contract.pair_yes, coverage * 100, n_filled, len(ohlcv)) + + # Fill any remaining NaN with 0 for volume, and carry-forward for prices + ohlcv["volume"] = ohlcv["volume"].fillna(0.0) + ohlcv[["open", "high", "low", "close"]] = ( + ohlcv[["open", "high", "low", "close"]].ffill().bfill() + ) + + # Convert to freqtrade format: date as ms int64 + ohlcv = ohlcv.reset_index().rename(columns={"index": "date"}) + ohlcv["date"] = ohlcv["date"].astype("int64") // 1_000_000 + + # Write feather (pair name → safe filename) + pair_safe = contract.pair_yes.replace("/", "_") + out_path = output_dir / f"{pair_safe}-1h.feather" + feather.write_feather(ohlcv[["date", "open", "high", "low", "close", "volume"]], str(out_path)) + logger.info("Wrote %s (%d rows)", out_path, len(ohlcv)) + return out_path + + +# --------------------------------------------------------------------------- +# End-to-end convenience wrapper +# --------------------------------------------------------------------------- + +def build_all_feathers_from_parquet( + parquet_path: str | Path, + output_dir: str | Path, + *, + min_rows_last7d: int = int(WINDOW_HOURS * MIN_COVERAGE), + max_ffill_hours: int = MAX_FFILL_HOURS, + filter_condition_ids: Sequence[str] | None = None, + write_jsonl: bool = True, +) -> list[ContractMetadata]: + """Load the parquet, parse BTC contracts, and build feather files. + + Args: + parquet_path: Path to the Polymarket parquet file. + output_dir: Directory to write feather files. + min_rows_last7d: Minimum rows in last 7d window to include. + max_ffill_hours: Maximum gap fill length (hours). + filter_condition_ids: If given, only process these condition IDs. + + Returns: + List of :class:`~polymarket.contracts.ContractMetadata` for all + contracts successfully written. 
+ """ + logger.info("Loading parquet from %s …", parquet_path) + df = pd.read_parquet( + str(parquet_path), + columns=["timestamp", "condition_id", "side", + "open", "high", "low", "close", "volume", "question"], + ) + + if filter_condition_ids is not None: + df = df[df["condition_id"].isin(filter_condition_ids)] + + contracts = parse_btc_contracts(df, min_rows_last7d=min_rows_last7d) + logger.info("Building feathers for %d contracts …", len(contracts)) + + written: list[ContractMetadata] = [] + for contract in contracts: + try: + build_feather_from_real_data( + df, contract, output_dir, max_ffill_hours=max_ffill_hours + ) + written.append(contract) + except Exception as exc: + logger.warning("Skipping %s: %s", contract.pair_yes, exc) + + if write_jsonl and written: + jsonl_path = Path(output_dir) / "real_contracts.jsonl" + write_contracts_jsonl(written, jsonl_path) + + logger.info( + "Done. Wrote feathers for %d/%d contracts into %s", + len(written), len(contracts), output_dir, + ) + return written + + +# --------------------------------------------------------------------------- +# JSONL serialisation (for strategy contract registry) +# --------------------------------------------------------------------------- + +def write_contracts_jsonl( + contracts: list[ContractMetadata], + output_path: str | Path, +) -> Path: + """Serialise a list of :class:`~polymarket.contracts.ContractMetadata` to JSONL. + + The output format mirrors the Polymarket REST API schema expected by + :func:`polymarket.contracts.load_contracts`, so the strategy can load + real-data contracts through the same code path. + + Args: + contracts: Parsed contracts (e.g. from :func:`parse_btc_contracts`). + output_path: Path to write the ``.jsonl`` file. + + Returns: + Path to the written file. + """ + import json as _json + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with output_path.open("w") as fh: + for c in contracts: + # Encode settlement as Polymarket outcomePrices JSON string + if c.settlement >= 0.5: + outcome_prices = '["1.0", "0.0"]' + else: + outcome_prices = '["0.0", "1.0"]' + + record = { + "id": c.id, + "question": c.question, + "slug": c.slug, + "endDate": c.end_date_utc, + "startDate": c.start_date_utc, + "outcomePrices": outcome_prices, + "volume": str(c.volume_usd), + } + fh.write(_json.dumps(record) + "\n") + + logger.info("Wrote %d contracts to %s", len(contracts), output_path) + return output_path diff --git a/scripts/prepare_event_model.py b/scripts/prepare_event_model.py index 264188d..de02d78 100644 --- a/scripts/prepare_event_model.py +++ b/scripts/prepare_event_model.py @@ -3,8 +3,8 @@ Run this script once before backtesting with DualModelPolymarketPortfolio. It performs four steps: - 0. Build feather files — synthetic OHLCV price series per contract (required by - the freqtrade backtester). + 0. Build feather files — OHLCV price series per contract (synthetic by default; + use ``--use-real-data`` for real Polymarket prices). 1. Build training data — synthetic weekly BTC events from data_1h.csv. 2. Train event model — calibrated logistic regression, saved to pkl. 3. Generate predictions — per-contract fair_value CSVs for the backtester. 
@@ -13,8 +13,15 @@ ----- From the repo root:: + # Default: synthetic OHLCV from a JSONL contracts file python scripts/prepare_event_model.py + # Real Polymarket prices from a trade-history parquet + python scripts/prepare_event_model.py \\ + --use-real-data \\ + --parquet-path mycode/data/combined_filtered_data.paquet \\ + --output-dir user_data/data/polymarket_ml_real + All paths default to the standard repo layout. Override via CLI flags:: python scripts/prepare_event_model.py \\ @@ -61,6 +68,13 @@ def parse_args() -> argparse.Namespace: help="Skip step 0 if feather files already exist in output-dir.") p.add_argument("--skip-training-data", action="store_true", help="Skip step 1 if training_data.parquet already exists.") + # Real-data mode + p.add_argument("--use-real-data", action="store_true", + help="Step 0: build feathers from real Polymarket trade data " + "instead of synthetic prices. Requires --parquet-path.") + p.add_argument("--parquet-path", default="mycode/data/combined_filtered_data.paquet", + help="Path to the Polymarket trade-history parquet file " + "(used only with --use-real-data).") return p.parse_args() @@ -77,10 +91,23 @@ def main() -> None: output_dir.mkdir(parents=True, exist_ok=True) # ------------------------------------------------------------------ - # Step 0: Build synthetic feather files (required by backtester) + # Step 0: Build feather files (synthetic or real) # ------------------------------------------------------------------ if args.skip_feathers: logger.info("Step 0 skipped — assuming feather files already exist in %s", output_dir) + elif args.use_real_data: + logger.info("Step 0/3 — Building real-data OHLCV feather files from parquet") + parquet_path = REPO_ROOT / args.parquet_path + from polymarket.real_data_builder import build_all_feathers_from_parquet + written = build_all_feathers_from_parquet( + parquet_path=parquet_path, + output_dir=output_dir, + ) + logger.info("Step 0 complete: %d feather files written", len(written)) + # In real-data mode, the contracts come from the parquet itself; skip + # the JSONL-based contracts file for step 3 (we'll derive them below). 
+ contracts_path = None # signal to step 3 to use the parsed list + _real_contracts = written else: logger.info("Step 0/3 — Building synthetic OHLCV feather files") from polymarket.data_builder import build_all_feathers @@ -122,11 +149,22 @@ def main() -> None: # Step 3: Generate per-contract predictions # ------------------------------------------------------------------ logger.info("Step 3/3 — Generating per-contract predictions") - from polymarket.contracts import load_contracts from polymarket.data_builder import build_event_predictions - contracts = load_contracts(contracts_path) - logger.info(" Loaded %d contracts from %s", len(contracts), contracts_path) + if args.use_real_data and not args.skip_feathers: + # Contracts were parsed from the parquet in step 0; JSONL also written + contracts = _real_contracts + logger.info(" Using %d contracts parsed from parquet", len(contracts)) + elif args.use_real_data and args.skip_feathers: + # Feathers already built — reload contracts from the written JSONL + from polymarket.contracts import load_contracts as _load + real_jsonl = output_dir / "real_contracts.jsonl" + contracts = _load(real_jsonl) + logger.info(" Loaded %d contracts from %s", len(contracts), real_jsonl) + else: + from polymarket.contracts import load_contracts + contracts = load_contracts(contracts_path) + logger.info(" Loaded %d contracts from %s", len(contracts), contracts_path) build_event_predictions( btc_csv_path=btc_csv, diff --git a/user_data/strategies/DualModelPolymarketPortfolio.py b/user_data/strategies/DualModelPolymarketPortfolio.py index 8b323f4..2e508de 100644 --- a/user_data/strategies/DualModelPolymarketPortfolio.py +++ b/user_data/strategies/DualModelPolymarketPortfolio.py @@ -39,11 +39,12 @@ import json import logging -from datetime import UTC, datetime +from datetime import datetime, timezone from pathlib import Path from typing import Optional import pandas as pd + from freqtrade.persistence import Trade from freqtrade.strategy import IStrategy @@ -56,7 +57,7 @@ SETTLE_YES = 0.999 SETTLE_NO = 0.001 -UTC = UTC +UTC = timezone.utc class DualModelPolymarketPortfolio(IStrategy): @@ -101,10 +102,19 @@ def _resolve_data_root(self) -> Path: return Path(__file__).resolve().parents[1] def _load_contracts_registry(self) -> dict[str, ContractMetadata]: - """Load contract metadata and return a dict keyed by pair_yes.""" + """Load contract metadata and return a dict keyed by pair_yes. + + The JSONL path can be overridden via ``config["contracts_jsonl"]``. + Defaults to ``user_data/data/polymarket_contracts/jan20.jsonl``. + """ data_root = self._resolve_data_root() - jsonl_path = data_root / "data" / "polymarket_contracts" / "jan20.jsonl" - contracts = load_contracts(jsonl_path) + if hasattr(self, "config") and "contracts_jsonl" in self.config: + jsonl_path = Path(self.config["contracts_jsonl"]) + if not jsonl_path.is_absolute(): + jsonl_path = data_root / jsonl_path + else: + jsonl_path = data_root / "data" / "polymarket_contracts" / "jan20.jsonl" + contracts = load_contracts(jsonl_path, skip_unparseable=True) return {c.pair_yes: c for c in contracts} def _get_registry(self) -> dict[str, ContractMetadata]: @@ -112,15 +122,27 @@ def _get_registry(self) -> dict[str, ContractMetadata]: self._contract_registry = self._load_contracts_registry() return self._contract_registry + def _get_predictions_dir(self) -> Path: + """Return the directory containing event_probs CSVs. + + Defaults to ``user_data/data/polymarket_ml``. 
+ Override via ``config["predictions_dir"]`` (relative paths are resolved + against ``user_data/``). + """ + data_root = self._resolve_data_root() + if hasattr(self, "config") and "predictions_dir" in self.config: + p = Path(self.config["predictions_dir"]) + return p if p.is_absolute() else data_root / p + return data_root / "data" / "polymarket_ml" + def _load_event_probs(self, pair: str) -> pd.DataFrame | None: """Load per-contract event probability CSV produced by build_event_predictions. Returns a DataFrame indexed by UTC timestamps with a ``fair_value`` column, or ``None`` if the file does not exist. """ - data_root = self._resolve_data_root() filename = pair.replace("/", "_") + "-event_probs.csv" - csv_path = data_root / "data" / "polymarket_ml" / filename + csv_path = self._get_predictions_dir() / filename if not csv_path.exists(): return None
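
---

For reference, the real-data path added in this change can also be driven directly from Python rather than through `scripts/prepare_event_model.py`. The following is a minimal sketch, not part of the patch: it assumes the repo root is on `PYTHONPATH` and uses the documented default paths (adjust both to the local layout).

```python
# Sketch of the real-data step-0 workflow introduced in this change, driven
# from Python instead of the CLI. Paths below are the documented defaults
# from the README/flags table; adjust as needed.
import logging

from polymarket.contracts import load_contracts
from polymarket.real_data_builder import build_all_feathers_from_parquet

logging.basicConfig(level=logging.INFO)

# Step 0 (real-data mode): parse BTC contracts from the trade-history parquet,
# write one {pair}-1h.feather per qualifying market plus real_contracts.jsonl.
written = build_all_feathers_from_parquet(
    parquet_path="mycode/data/combined_filtered_data.paquet",
    output_dir="user_data/data/polymarket_ml_real",
)

# The generated JSONL round-trips through the same loader the strategy uses;
# skip_unparseable=True mirrors the strategy's behaviour for exotic questions.
contracts = load_contracts(
    "user_data/data/polymarket_ml_real/real_contracts.jsonl",
    skip_unparseable=True,
)

print(f"{len(written)} feathers written; {len(contracts)} contracts loaded")
for c in contracts[:3]:
    print(c.pair_yes, c.strike, c.direction, c.end_date_utc, c.settlement)
```

Pointing the strategy at the resulting directory is then just the `contracts_jsonl` and `predictions_dir` config keys documented in the README changes above.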