7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -12,13 +12,15 @@ user_data/*
!user_data/models
!user_data/freqaimodels
!user_data/data/
!user_data/run_custom.py
# Downloaded data (from Google Drive) and synthetic test data — not tracked in git
user_data/data/usstock/
user_data/data/polymarket/
user_data/data/portfoliobench/
# Generated ML artifacts (regenerable via scripts/prepare_event_model.py)
user_data/data/polymarket_ml/
user_data/data/polymarket_ml_real/
# Report / thesis drafts
report/
# Local development data, model checkpoints, and notebooks
mycode/
# Virtual environment
@@ -130,6 +132,3 @@ target/
!config_examples/config_freqai.example.json

docker-compose-*.yml

# Workflow results (regenerable via make e2e-*)
results/
119 changes: 109 additions & 10 deletions docs/polymarket/README.md
@@ -127,10 +127,11 @@ Run the preparation script to rebuild everything from the raw BTC price series:
python scripts/prepare_event_model.py
```

This executes three steps automatically:
This executes four steps automatically:

| Step | What it does | Output |
|---|---|---|
| 0 — Build feather files | Generates OHLCV feather files per contract (synthetic by default; see [Using Real Data](#using-real-polymarket-data)) | `*.feather` files in `--output-dir` |
| 1 — Build training data | Constructs synthetic weekly BTC binary events from `data_1h.csv` | `event_model_training.parquet` |
| 2 — Train model | Fits and calibrates a logistic regression on the training data | `event_model.pkl` |
| 3 — Generate predictions | Runs the model at every hourly bar before each contract's expiry | Per-contract `*-event_probs.csv` files |
@@ -151,13 +152,63 @@ python scripts/prepare_event_model.py \
| Flag | Default | Description |
|---|---|---|
| `--btc-csv` | `mycode/data/data_1h.csv` | Path to hourly BTC OHLCV CSV |
| `--contracts` | `user_data/data/polymarket_contracts/jan20.jsonl` | Contract metadata JSONL |
| `--contracts` | `user_data/data/polymarket_contracts/jan20.jsonl` | Contract metadata JSONL (ignored when `--use-real-data` is set) |
| `--output-dir` | `user_data/data/polymarket_ml` | Where all artefacts are written |
| `--start-date` | `2018-01-01` | Earliest settlement date for training samples |
| `--end-date` | `2025-06-01` | Latest settlement date (exclusive) |
| `--val-cutoff` | `2024-01-01` | Events before this date → training; after → validation |
| `--model-type` | `logistic` | `logistic` (default) or `xgboost` |
| `--skip-feathers` | off | Skip step 0 if feather files already exist in `--output-dir` |
| `--skip-training-data` | off | Skip step 1 if the Parquet already exists |
| `--use-real-data` | off | Step 0: build feathers from real Polymarket trade data instead of synthetic prices |
| `--parquet-path` | `mycode/data/combined_filtered_data.paquet` | Path to the Polymarket trade-history parquet (only used with `--use-real-data`) |

### Using Real Polymarket Data

By default, step 0 generates **synthetic** OHLCV price series for each contract. If you
have a Polymarket trade-history parquet file, you can use real market prices instead:

```bash
python scripts/prepare_event_model.py \
--use-real-data \
--parquet-path mycode/data/combined_filtered_data.paquet \
--output-dir user_data/data/polymarket_ml_real
```

This runs `polymarket/real_data_builder.py` which:

1. Loads the parquet and filters to BTC/Bitcoin YES-side rows.
2. Parses each market's strike (`$88K`, `$90,000`, etc.) and expiry date from the
`question` text.
3. Checks that the last 7-day window has at least 60% hourly coverage (≥ 101 of 168
candles).
4. Forward-fills gaps of missing candles (a warning is logged for gaps of 7–23 hours);
   excludes contracts with 24 or more consecutive missing hours (see the coverage
   thresholds table below).
5. Writes a freqtrade-compatible `*.feather` file for each qualifying contract.
6. Serialises all parsed contracts to `real_contracts.jsonl` in the output directory.

The event model and downstream prediction steps (steps 1–3) are unchanged — they work
with the same feather files regardless of whether they were generated synthetically or
from real data.

**Parquet format requirements:**

| Column | Type | Description |
|---|---|---|
| `timestamp` | datetime (UTC) | Candle open time |
| `condition_id` | string | Market identifier |
| `side` | string | `"Yes"` or `"No"` — only YES-side rows are used |
| `open`, `high`, `low`, `close` | string-encoded float | OHLC prices (0–1 range) |
| `volume` | int64 | Traded volume |
| `question` | string | Human-readable market question |
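
Because the OHLC columns arrive string-encoded, the initial filter-and-coerce step can be sketched as follows (`filter_yes_side_btc` is a hypothetical helper shown for illustration, not the repo's actual code):

```python
import pandas as pd

def filter_yes_side_btc(df: pd.DataFrame) -> pd.DataFrame:
    """Keep BTC/Bitcoin YES-side rows and coerce string-encoded OHLC to floats."""
    is_btc = df["question"].str.contains(r"\b(?:BTC|Bitcoin)\b", case=False, regex=True)
    out = df[is_btc & df["side"].eq("Yes")].copy()
    # OHLC prices are string-encoded floats in the 0-1 range.
    for col in ("open", "high", "low", "close"):
        out[col] = out[col].astype(float)
    out["timestamp"] = pd.to_datetime(out["timestamp"], utc=True)
    return out.sort_values(["condition_id", "timestamp"])

def load_yes_side_btc(parquet_path: str) -> pd.DataFrame:
    """Load the trade-history parquet and apply the filter above."""
    return filter_yes_side_btc(pd.read_parquet(parquet_path))
```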

**Coverage thresholds:**

| Gap length | Treatment |
|---|---|
| 1–6 hours | Forward-filled (price unchanged) |
| 7–23 hours | Forward-filled, but a `WARNING` is logged |
| ≥ 24 hours consecutively | Contract excluded (price too stale) |
| < 60% of window rows present | Contract excluded |
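
These rules can be sketched as a single pass over a reindexed hourly frame (`apply_coverage_rules` is a hypothetical helper; the actual logic lives in `real_data_builder.py`):

```python
import logging
from typing import Optional

import pandas as pd

logger = logging.getLogger(__name__)

def apply_coverage_rules(hourly: pd.DataFrame, window_hours: int = 168,
                         min_coverage: float = 0.60) -> Optional[pd.DataFrame]:
    """`hourly` is indexed by a complete hourly DatetimeIndex, with NaN rows
    where candles are missing. Returns the forward-filled frame, or None if
    the contract is excluded under the thresholds above."""
    present = hourly["close"].notna()
    if present.sum() < min_coverage * window_hours:
        return None  # < 60% of window rows present -> exclude
    # Each run of consecutive missing hours shares one cumsum value.
    gap_id = present.cumsum()
    gap_lengths = (~present).groupby(gap_id).sum()
    longest_gap = int(gap_lengths.max()) if len(gap_lengths) else 0
    if longest_gap >= 24:
        return None  # price too stale -> exclude
    if longest_gap >= 7:
        logger.warning("gap of %d consecutive hours forward-filled", longest_gap)
    return hourly.ffill()
```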

For the full training reference — feature engineering, label construction, temporal
splits, calibration, and inference — see [training-guide.md](training-guide.md).
@@ -194,6 +245,37 @@ At each hourly candle, the strategy:
To override parameters without modifying the strategy file, subclass
`DualModelPolymarketPortfolio` and set the class attributes.

### Configurable data paths (via config JSON)

Two file paths used by the strategy can be overridden through the freqtrade config JSON
without touching the strategy source:

| Config key | Default | Description |
|---|---|---|
| `contracts_jsonl` | `user_data/data/polymarket_contracts/jan20.jsonl` | Path to the contract metadata JSONL file; relative paths are resolved against `user_data/` |
| `predictions_dir` | `user_data/data/polymarket_ml` | Directory containing per-contract `*-event_probs.csv` files; relative paths are resolved against `user_data/` |

Example config for a real-data backtest:

```json
{
"contracts_jsonl": "data/polymarket_ml_real/real_contracts.jsonl",
"predictions_dir": "data/polymarket_ml_real",
"exchange": {
"name": "polymarket",
"pair_whitelist": [
"BTCABOVE108K-SEP5-YES/USDT",
"BTCABOVE110K-SEP5-YES/USDT"
]
}
}
```

Contracts loaded from `real_contracts.jsonl` (produced by `--use-real-data`) may
contain question patterns that the minimal regex in the synthetic pipeline did not
support (e.g. "reach $108K"). The loader automatically skips unparseable lines with a
warning, so a mixed JSONL file will not crash the strategy.
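
The resolution rule ("relative paths are resolved against `user_data/`") can be sketched as below. Both `resolve_data_path` and the reliance on a `user_data_dir` config key are assumptions for illustration; the strategy's actual lookup may differ:

```python
from pathlib import Path

def resolve_data_path(config: dict, key: str, default: str) -> Path:
    """Absolute paths are used verbatim; relative paths are joined onto
    the user_data directory."""
    raw = Path(config.get(key, default))
    if raw.is_absolute():
        return raw
    user_data = Path(config.get("user_data_dir", "user_data"))
    return user_data / raw
```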

### Position sizing (Kelly formula)

For a contract priced at `p_market` and model probability `p_model`:
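
As orientation, a sketch of the standard full-Kelly stake fraction for buying a binary YES contract at `p_market` when the model assigns win probability `p_model`. This is the textbook formula; the strategy's exact variant (fractional scaling, caps) may differ:

```python
def kelly_fraction(p_model: float, p_market: float) -> float:
    """A unit stake pays (1 - p_market) / p_market on a win, so with
    b = (1 - p_market) / p_market the Kelly fraction
    f* = (p_model * b - (1 - p_model)) / b
    simplifies to (p_model - p_market) / (1 - p_market).
    Clamped at 0: no position when the model sees no edge."""
    if not 0.0 < p_market < 1.0:
        raise ValueError("p_market must be strictly between 0 and 1")
    return max(0.0, (p_model - p_market) / (1.0 - p_market))
```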
@@ -358,8 +440,21 @@ are:
```

The `load_contracts()` function extracts the strike and direction from the `question`
field using a regex match for `"above $X,XXX"` or `"below $X,XXX"`. Settlement is
determined from `outcomePrices`.
field. Supported question patterns include:

| Pattern | Direction | Example |
|---|---|---|
| `above $X,XXX` / `above $XXK` | above | "Bitcoin above $90,000 on January 20?" |
| `reach/hit/exceed/surpass $X,XXX` | above | "Will Bitcoin reach $120,000 by December?" |
| `below $X,XXX` / `less than $X,XXX` | below | "Bitcoin below $80,000 on March 1?" |
| `dips to $X,XXX` | below | "Will Bitcoin dip to $70K in October?" |

Strikes can be written with or without a `K`/`k` suffix (e.g. `$88K` = `$88,000`).
Settlement is determined from `outcomePrices`.

Lines whose question cannot be parsed are skipped with a warning (see `skip_unparseable`
in `load_contracts`) rather than raising an error, so a JSONL file with mixed contract
types will not crash the strategy.
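
The pattern table can be exercised with a condensed, standalone mirror of the `_ABOVE_RE`/`_BELOW_RE` regexes from `polymarket/contracts.py` (a sketch for illustration, not an import of the module itself):

```python
import re

_ABOVE = re.compile(
    r"\b(?:above|reach(?:es)?|hit(?:s)?|exceed(?:s)?|surpass(?:es)?)"
    r"\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?",
    re.IGNORECASE,
)
_BELOW = re.compile(
    r"\b(?:less\s+than|below|dip(?:s)?\s+to)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?",
    re.IGNORECASE,
)

def parse_strike_direction(question: str) -> tuple[float, str]:
    for pattern, direction in ((_ABOVE, "above"), (_BELOW, "below")):
        m = pattern.search(question)
        if m:
            strike = float(m.group(1).replace(",", "")) * (1_000 if m.group(2) else 1)
            if strike >= 1_000:  # sub-$1,000 matches are not plausible BTC strikes
                return strike, direction
    raise ValueError(f"Cannot parse strike/direction from question: {question!r}")
```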

Place the file at `user_data/data/polymarket_contracts/<name>.jsonl` and update
`--contracts` in the prepare script.
@@ -384,13 +479,17 @@ Timestamp,Open,High,Low,Close,Volume
- Optional on-chain columns: `mvrv`, `hash-rate`, `difficulty` (omit if unavailable;
the corresponding features will be NaN and effectively ignored by the model).

### 3 — Feather files (synthetic OHLCV)
### 3 — Feather files (synthetic or real OHLCV)

The backtester requires an OHLCV feather file for each pair. Two sources are supported:

The backtester requires a synthetic OHLCV feather file for each pair. These are
generated by the existing `build_all_feathers()` function. They are **not** real
Polymarket quotes — they are synthetic price series used solely to satisfy freqtrade's
data loading requirements. The strategy ignores OHLCV values other than `close` for
market price.
- **Synthetic (default):** generated by `build_all_feathers()` from `data_builder.py`.
These are modelled price series used solely to satisfy freqtrade's data loading
requirements. The strategy ignores OHLCV values other than `close` for market price.
- **Real Polymarket data:** generated by `build_all_feathers_from_parquet()` from
`real_data_builder.py` (requires a trade-history parquet). These contain actual
historical Polymarket prices from the YES-side order book. Use `--use-real-data` in
`prepare_event_model.py` to build these instead of synthetic files.
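
Either way, each file must load as a standard freqtrade OHLCV frame. A sketch of a sanity check (`check_ohlcv_frame` and `load_pair_feather` are hypothetical helpers; the column layout is assumed from freqtrade's feather data format):

```python
import pandas as pd

EXPECTED_COLUMNS = ["date", "open", "high", "low", "close", "volume"]

def check_ohlcv_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Verify the frame has the expected OHLCV columns and sort it by date."""
    missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"feather is missing columns: {missing}")
    return df.sort_values("date").reset_index(drop=True)

def load_pair_feather(path: str) -> pd.DataFrame:
    """Read one per-pair feather file and sanity-check its layout."""
    return check_ohlcv_frame(pd.read_feather(path))
```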

### 4 — Per-contract predictions

41 changes: 39 additions & 2 deletions docs/polymarket/training-guide.md
@@ -25,6 +25,21 @@ start.
## 1. Architecture Overview

```
┌─────────────────────────────────────────────┐
│ Step 0: OHLCV feather files per contract │
│ │
│ Option A (default — synthetic): │
│ build_all_feathers() (data_builder.py) │
│ • Log-normal price simulation │
│ │
│ Option B (--use-real-data): │
│ build_all_feathers_from_parquet() │
│ (real_data_builder.py) │
│ • Real Polymarket YES-side prices │
│ • Forward-fill gaps ≤ 6h │
│ • Writes real_contracts.jsonl │
└─────────────────────────────────────────────┘

data_1h.csv (BTC hourly OHLCV, 2018–present)
@@ -119,11 +134,24 @@ One JSON object per line. Required fields per contract:
```

**Field notes:**
- `question`: must contain `"above $X,XXX"` or `"below $X,XXX"` for strike extraction.
- `question`: used for strike and direction extraction. Supported patterns:
- `"above $X,XXX"` / `"above $XXK"` — YES if BTC > K at expiry
- `"reach/hit/exceed/surpass $X,XXX"` — treated as "above"
- `"below $X,XXX"` / `"less than $X,XXX"` / `"dips to $X,XXX"` — YES if BTC < K
- K-suffix supported: `$88K` is parsed as `$88,000`
- Lines whose `question` cannot be parsed are skipped with a warning when
`skip_unparseable=True`; they raise `ValueError` by default.
- `outcomePrices`: JSON-encoded array `["YES_price", "NO_price"]`. `"1"` = that outcome
won. Used to determine `settlement` (1.0 or 0.0).
- `endDate`: ISO-8601 UTC, determines the settlement timestamp `T`.
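
The `outcomePrices` rule can be sketched as follows (a standalone reconstruction of `_settlement_from_outcome_prices`, whose real implementation lives in `polymarket/contracts.py` and may handle more cases):

```python
import json

def settlement_from_outcome_prices(outcome_prices_json: str) -> float:
    """outcomePrices is a JSON-encoded ["YES_price", "NO_price"] pair in
    which "1" marks the winning outcome; map it to a 1.0 / 0.0 settlement."""
    yes_price, no_price = json.loads(outcome_prices_json)
    if float(yes_price) == 1.0:
        return 1.0
    if float(no_price) == 1.0:
        return 0.0
    raise ValueError(f"Unresolved outcomePrices: {outcome_prices_json!r}")
```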

**Alternative: auto-generated JSONL from real data**

When using `--use-real-data` in `prepare_event_model.py`, the JSONL file is generated
automatically by `real_data_builder.write_contracts_jsonl()` from the parquet. The
output `real_contracts.jsonl` uses the same schema and can be loaded by the strategy
via the `contracts_jsonl` config key.

---

## 3. Feature Engineering
@@ -482,12 +510,21 @@ used during fitting. It does not limit when the model can be deployed.
After running `prepare_event_model.py`, the following files are written to
`user_data/data/polymarket_ml/`:

**Synthetic mode (default):**

| File | Format | Description |
|---|---|---|
| `event_model_training.parquet` | Parquet | 363K+ training samples; all 15 features + label, K, T |
| `event_model.pkl` | joblib pickle | Model package dict (see below) |
| `{pair}-event_probs.csv` | CSV | Per-contract fair-value predictions; columns: `dt_utc`, `fair_value` |
| `{pair}.feather` | Feather | Synthetic OHLCV for backtester (generated by `build_all_feathers`) |
| `{pair}-1h.feather` | Feather | Synthetic OHLCV for backtester (generated by `build_all_feathers`) |

**Real-data mode (`--use-real-data`):** Same artefacts, plus:

| File | Format | Description |
|---|---|---|
| `{pair}-1h.feather` | Feather | Real Polymarket YES-side OHLCV, 7-day window before expiry |
| `real_contracts.jsonl` | JSONL | Auto-generated contract metadata; compatible with `load_contracts()` |

### Model package structure

2 changes: 1 addition & 1 deletion freqtrade
59 changes: 49 additions & 10 deletions polymarket/contracts.py
@@ -19,10 +19,14 @@
from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Pair naming helpers
# ---------------------------------------------------------------------------
@@ -100,26 +104,44 @@ class ContractMetadata:
# Strike / direction extraction
# ---------------------------------------------------------------------------

# Matches "above $90,000", "above $90k", "less than $84,000", etc.
_ABOVE_RE = re.compile(r"above\s+\$([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE)
_BELOW_RE = re.compile(r"(?:less than|below)\s+\$([0-9,]+(?:\.[0-9]+)?)", re.IGNORECASE)
# Matches "above $90,000", "above $90K", "reach $120,000", "less than $84K", etc.
# Group 1: numeric part; Group 2: optional K/k suffix.
_ABOVE_RE = re.compile(
r"\b(?:above|reach(?:es)?|hit(?:s)?|exceed(?:s)?|surpass(?:es)?)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)",
re.IGNORECASE,
)
_BELOW_RE = re.compile(
r"\b(?:less\s+than|below|dip(?:s)?\s+to)\s+\$([0-9,]+(?:\.[0-9]+)?)\s*([kK])?(?:\b|$)",
re.IGNORECASE,
)


def _parse_strike_direction(question: str) -> tuple[float, str]:
"""Extract (strike, direction) from a Polymarket question string.

Handles:
- "above $90,000" / "above $90K"
- "reach/hit/exceed/surpass $120,000" (treated as 'above')
- "less than/below/dips to $84,000" (treated as 'below')

Raises:
ValueError: If neither 'above' nor 'below/less than' is found.
"""
m = _ABOVE_RE.search(question)
if m:
strike = float(m.group(1).replace(",", ""))
return strike, "above"
raw = m.group(1).replace(",", "")
has_k = m.group(2) is not None
strike = float(raw) * (1_000 if has_k else 1)
if strike >= 1_000:  # sub-$1,000 matches are not plausible BTC strikes; fall through
return strike, "above"

m = _BELOW_RE.search(question)
if m:
strike = float(m.group(1).replace(",", ""))
return strike, "below"
raw = m.group(1).replace(",", "")
has_k = m.group(2) is not None
strike = float(raw) * (1_000 if has_k else 1)
if strike >= 1_000:  # same sanity guard as the 'above' branch
return strike, "below"

raise ValueError(f"Cannot parse strike/direction from question: {question!r}")

@@ -149,24 +171,34 @@ def _settlement_from_outcome_prices(outcome_prices_json: str) -> float:
# Public loader
# ---------------------------------------------------------------------------

def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]:
def load_contracts(
jsonl_path: str | Path,
*,
skip_unparseable: bool = False,
) -> list[ContractMetadata]:
"""Parse a Polymarket JSONL snapshot into a list of :class:`ContractMetadata`.

Args:
jsonl_path: Path to the ``.jsonl`` file (one JSON object per line).
jsonl_path: Path to the ``.jsonl`` file (one JSON object per line).
skip_unparseable: When ``True``, log a warning and skip lines whose
question text cannot be parsed rather than raising.
Useful when loading ``real_contracts.jsonl`` which may
contain exotic question patterns.

Returns:
List of :class:`ContractMetadata`, sorted by strike ascending.

Raises:
FileNotFoundError: If ``jsonl_path`` does not exist.
ValueError: If a line cannot be parsed.
ValueError: If a line cannot be parsed and ``skip_unparseable`` is
``False``.
"""
path = Path(jsonl_path)
if not path.exists():
raise FileNotFoundError(f"Contract metadata file not found: {path}")

contracts: list[ContractMetadata] = []
skipped = 0

with path.open() as fh:
for lineno, line in enumerate(fh, start=1):
@@ -182,6 +214,10 @@ def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]:
try:
strike, direction = _parse_strike_direction(d["question"])
except ValueError as exc:
if skip_unparseable:
logger.warning("Skipping line %d (%s): %s", lineno, d.get("slug", "?"), exc)
skipped += 1
continue
raise ValueError(f"Line {lineno} ({d.get('slug', '?')}): {exc}") from exc

settlement = _settlement_from_outcome_prices(d["outcomePrices"])
@@ -208,5 +244,8 @@ def load_contracts(jsonl_path: str | Path) -> list[ContractMetadata]:
)
)

if skipped:
logger.info("load_contracts: skipped %d unparseable lines in %s", skipped, path.name)

contracts.sort(key=lambda c: c.strike)
return contracts