From 5f5d057f4fd7087ab11a2c1d3efe774f7878dd5a Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Wed, 18 Mar 2026 14:02:27 +0100 Subject: [PATCH 1/5] update & rerun golf forecasting --- .../fine_tuning/01_golf_forecasting.ipynb | 1488 ++++++++++++----- 1 file changed, 1040 insertions(+), 448 deletions(-) diff --git a/notebooks/fine_tuning/01_golf_forecasting.ipynb b/notebooks/fine_tuning/01_golf_forecasting.ipynb index 63b47af..a6d7def 100644 --- a/notebooks/fine_tuning/01_golf_forecasting.ipynb +++ b/notebooks/fine_tuning/01_golf_forecasting.ipynb @@ -1,491 +1,1083 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "header", - "metadata": {}, - "source": [ - "# Golf Forecasting\n", - "\n", - "Generate a forecasting dataset about professional golf (tournaments, majors, rankings) using the LightningRod SDK. This example showcases dataset generation, preparation with SDK utils, and training results from our experiments." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "aaddd17b", - "metadata": {}, - "outputs": [ + "cells": [ { - "data": { - "text/plain": [ - "True" + "cell_type": "markdown", + "id": "header", + "metadata": {}, + "source": [ + "# Golf Forecasting\n", + "\n", + "Generate a forecasting dataset about professional golf (tournaments, majors, rankings) using the LightningRod SDK. This example showcases dataset generation, preparation with SDK utils, and training results from our experiments." 
] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%pip install lightningrod-ai python-dotenv pandas\n", - "\n", - "from IPython.display import clear_output\n", - "clear_output()\n", - "\n", - "from datetime import datetime\n", - "\n", - "import pandas as pd\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ] - }, - { - "cell_type": "markdown", - "id": "part1", - "metadata": {}, - "source": [ - "## Set up the client\n", - "\n", - "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "sdk-setup", - "metadata": {}, - "outputs": [], - "source": [ - "from lightningrod import LightningRod\n", - "from lightningrod.utils import config\n", - "\n", - "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n", - "lr = LightningRod(api_key=api_key)" - ] - }, - { - "cell_type": "markdown", - "id": "a04964c0", - "metadata": {}, - "source": [ - "## Build the pipeline\n", - "\n", - "Configure the pipeline with domain-specific instructions and examples for golf forecasting." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "config", - "metadata": {}, - "outputs": [], - "source": [ - "instructions = \"\"\"\n", - "Generate binary forecasting questions about professional golf across all major tours and events.\n", - "\n", - "Cover what golf fans bet on: tournament outcomes, cuts, matchups, majors, team events, season races, world rankings, and player milestones.\n", - "\n", - "Questions should be specific, verifiable, and span the full probability spectrum.\n", - "\"\"\"\n", - "\n", - "good_examples = [\n", - " \"Will Scottie Scheffler win the 2025 Masters?\",\n", - " \"Will the 2025 US Open winning score be under par?\",\n", - " \"Will Tiger Woods make the cut at the 2025 Masters?\",\n", - " \"Will Rory McIlroy finish top 5 at the 2025 US Open?\",\n", - " \"Will any LIV player win a major championship in 2025?\",\n", - " \"Will Europe win the 2025 Ryder Cup?\",\n", - " \"Will any player win 4+ PGA Tour events in 2025?\",\n", - " \"Will Scottie Scheffler remain world #1 through June 2025?\",\n", - " \"Will a first-time major winner emerge at the 2025 PGA Championship?\",\n", - " \"Will Nelly Korda win the 2025 US Women's Open?\",\n", - "]\n", - "\n", - "bad_examples = [\n", - " \"Will someone win the tournament? (obvious)\",\n", - " \"Will golf be exciting? (subjective)\",\n", - " \"Will there be birdies? 
(trivial)\",\n", - "]\n", - "\n", - "search_queries = [\n", - " \"PGA Tour\",\n", - " \"LIV Golf\",\n", - " \"LPGA\",\n", - " \"golf major championship\",\n", - " \"Ryder Cup Presidents Cup\",\n", - " \"golf world rankings\",\n", - " \"professional golf\",\n", - " \"women's golf\",\n", - " \"European Tour golf\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "pipeline", - "metadata": {}, - "outputs": [], - "source": [ - "from lightningrod import (\n", - " BinaryAnswerType,\n", - " NewsSeedGenerator,\n", - " ForwardLookingQuestionGenerator,\n", - " NewsContextGenerator,\n", - " WebSearchLabeler,\n", - " QuestionPipeline,\n", - ")\n", - "\n", - "answer_type = BinaryAnswerType()\n", - "\n", - "pipeline = QuestionPipeline(\n", - " seed_generator=NewsSeedGenerator(\n", - " start_date=datetime(2024, 6, 1),\n", - " end_date=datetime(2026, 1, 1),\n", - " interval_duration_days=14,\n", - " search_query=search_queries,\n", - " articles_per_search=10,\n", - " ),\n", - " question_generator=ForwardLookingQuestionGenerator(\n", - " instructions=instructions,\n", - " examples=good_examples,\n", - " bad_examples=bad_examples,\n", - " answer_type=answer_type,\n", - " questions_per_seed=5,\n", - " ),\n", - " context_generators=[\n", - " NewsContextGenerator(\n", - " articles_per_query=3,\n", - " num_search_queries=3,\n", - " num_articles=5,\n", - " )\n", - " ],\n", - " labeler=WebSearchLabeler(answer_type=answer_type),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "324a35cc", - "metadata": {}, - "source": [ - "## Run the pipeline\n", - "\n", - "This will collect news articles, generate questions, and find answers. Use `max_questions` to limit the run for testing." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7f62e9c4", - "metadata": {}, - "outputs": [ + }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f25abaaeb92e42f1bca02f0ea69c7f15", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" + "cell_type": "code", + "execution_count": 1, + "id": "aaddd17b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%pip install lightningrod-ai python-dotenv pandas openai\n", + "\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "\n", + "from datetime import datetime\n", + "\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
+      "cell_type": "markdown",
+      "id": "part1",
+      "metadata": {},
+      "source": [
+        "## Set up the client\n",
+        "\n",
+        "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**."
+      ]
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "100 samples (87.0% valid)\n"
-     ]
-    }
-   ],
-   "source": [
-    "dataset = lr.transforms.run(pipeline, max_questions=100, name=\"Golf forecasting\")\n",
-    "samples = dataset.download()\n",
-    "\n",
-    "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n",
-    "print(f\"{len(samples)} samples ({pct:.1f}% valid)\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fdbi5exhd6c",
-   "metadata": {},
-   "source": [
-    "## Prepare the dataset\n",
-    "\n",
-    "Use SDK utils to filter valid samples, deduplicate, and split into train/test sets. We filter by `date_close <= today` to only include questions that have already resolved."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "upload",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "sdk-setup",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from lightningrod import LightningRod\n",
+        "from lightningrod.utils import config\n",
+        "\n",
+        "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n",
+        "lr = LightningRod(api_key=api_key)"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train: 37 rows, 32.4% yes\n",
-      "Test: 18 rows, 44.4% yes\n"
-     ]
-    }
-   ],
-   "source": [
-    "from lightningrod import filter_and_split\n",
-    "\n",
-    "train_dataset, test_dataset = filter_and_split(\n",
-    "    dataset,\n",
-    "    test_size=0.2,\n",
-    "    split_strategy=\"temporal\",\n",
-    "    days_to_resolution_range=(1, None),  # at least 1 day to resolution\n",
-    ")\n",
-    "\n",
-    "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n",
-    "    data = ds.flattened()\n",
-    "    yes_count = sum(1 for s in data if s.get(\"label\") in (1, \"1\", 1.0))\n",
-    "    print(f\"{name}: {len(data)} rows, {yes_count/len(data)*100:.1f}% yes\")\n",
-    "    display(pd.DataFrame(data).head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b2e9efba",
-   "metadata": {},
-   "source": [
-    "## Uploading the dataset to HuggingFace\n",
-    "\n",
-    "Once we have a training-ready dataset, we can push it to Hugging Face for sharing or downstream use."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "ae7c826b",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "a04964c0",
+      "metadata": {},
+      "source": [
+        "## Build the pipeline\n",
+        "\n",
+        "Configure the pipeline with domain-specific instructions and examples for golf forecasting."
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
-      "Note: you may need to restart the kernel to use updated packages.\n",
-      "Train: 37 rows, Test: 18 rows\n",
-      "Columns: ['question_text', 'date_close', 'event_date', 'resolution_criteria', 'prediction_date', 'label', 'answer_type', 'label_confidence'] ...\n"
-     ]
+      "cell_type": "code",
+      "execution_count": 3,
+      "id": "config",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "instructions = \"\"\"\n",
+        "Generate binary forecasting questions about professional golf across all major tours and events.\n",
+        "\n",
+        "Cover what golf fans bet on: tournament outcomes, cuts, matchups, majors, team events, season races, world rankings, and player milestones.\n",
+        "\n",
+        "Questions should be specific, verifiable, and span the full probability spectrum.\n",
+        "\"\"\"\n",
+        "\n",
+        "good_examples = [\n",
+        "    \"Will Scottie Scheffler win the 2025 Masters?\",\n",
+        "    \"Will the 2025 US Open winning score be under par?\",\n",
+        "    \"Will Tiger Woods make the cut at the 2025 Masters?\",\n",
+        "    \"Will Rory McIlroy finish top 5 at the 2025 US Open?\",\n",
+        "    \"Will any LIV player win a major championship in 2025?\",\n",
+        "    \"Will Europe win the 2025 Ryder Cup?\",\n",
+        "    \"Will any player win 4+ PGA Tour events in 2025?\",\n",
+        "    \"Will Scottie Scheffler remain world #1 through June 2025?\",\n",
+        "    \"Will a first-time major winner emerge at the 2025 PGA Championship?\",\n",
+        "    \"Will Nelly Korda win the 2025 US Women's Open?\",\n",
+        "]\n",
+        "\n",
+        "bad_examples = [\n",
+        "    \"Will someone win the tournament? (obvious)\",\n",
+        "    \"Will golf be exciting? (subjective)\",\n",
+        "    \"Will there be birdies? (trivial)\",\n",
+        "]\n",
+        "\n",
+        "search_queries = [\n",
+        "    \"PGA Tour\",\n",
+        "    \"LIV Golf\",\n",
+        "    \"LPGA\",\n",
+        "    \"golf major championship\",\n",
+        "    \"Ryder Cup Presidents Cup\",\n",
+        "    \"golf world rankings\",\n",
+        "    \"professional golf\",\n",
+        "    \"women's golf\",\n",
+        "    \"European Tour golf\",\n",
+        "]"
+      ]
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "453c59f9daf345828e087cc2a47af33f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards:   0%|          | 0/1 [00:00╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+              "                                                                                                                 \n",
+              "  >> Pipeline Completed                                                                                          \n",
+              "                                                                                                                 \n",
+              "    Total cost: $47.67                                                                                           \n",
+              "                                                                                                                 \n",
+              "  ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓  \n",
+              " Step                Progress               In  Out  Rejected  Errors  Rejection Reasons   Duration \n",
+              "  ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩  \n",
+              " NewsSeedGenerator… Complete             │  20 │ 200 │        0     0-                  │      19s │  \n",
+              " ForwardLookingQue… Complete             │ 200 │ 965 │       33     0date_close not     │      15s │  \n",
+              "                    │                      │     │     │          │        │ after event_date   │          │  \n",
+              "                    │                      │     │     │          │        │ (33)               │          │  \n",
+              " WebSearchLabelerT… Complete             │ 965 │ 796 │      169     0Undetermined label │    1m 8s │  \n",
+              "                    │                      │     │     │          │        │ (164), Resolution  │          │  \n",
+              "                    │                      │     │     │          │        │ date is before     │          │  \n",
+              "                    │                      │     │     │          │        │ seed creation date │          │  \n",
+              "                    │                      │     │     │          │        │ (5)                │          │  \n",
+              " NewsContextGenera… Complete             │ 796 │ 795 │        1     0<failed_attempts>  │  11m 37s │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <generation        │          │  \n",
+              "                    │                      │     │     │          │        │ number=\"1\">        │          │  \n",
+              "                    │                      │     │     │          │        │ <exception>        │          │  \n",
+              "                    │                      │     │     │          │        │     Request timed  │          │  \n",
+              "                    │                      │     │     │          │        │ out.               │          │  \n",
+              "                    │                      │     │     │          │        │ </exception>       │          │  \n",
+              "                    │                      │     │     │          │        │ <completion>       │          │  \n",
+              "                    │                      │     │     │          │        │     None           │          │  \n",
+              "                    │                      │     │     │          │        │ </completion>      │          │  \n",
+              "                    │                      │     │     │          │        │ </generation>      │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <generation        │          │  \n",
+              "                    │                      │     │     │          │        │ number=\"2\">        │          │  \n",
+              "                    │                      │     │     │          │        │ <exception>        │          │  \n",
+              "                    │                      │     │     │          │        │     Connection     │          │  \n",
+              "                    │                      │     │     │          │        │ error.             │          │  \n",
+              "                    │                      │     │     │          │        │ </exception>       │          │  \n",
+              "                    │                      │     │     │          │        │ <completion>       │          │  \n",
+              "                    │                      │     │     │          │        │     None           │          │  \n",
+              "                    │                      │     │     │          │        │ </completion>      │          │  \n",
+              "                    │                      │     │     │          │        │ </generation>      │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ </failed_attempts> │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <last_exception>   │          │  \n",
+              "                    │                      │     │     │          │        │     Connection     │          │  \n",
+              "                    │                      │     │     │          │        │ error.             │          │  \n",
+              "                    │                      │     │     │          │        │ </last_exception>  │          │  \n",
+              "                    │                      │     │     │          │        │ (1)                │          │  \n",
+              "  └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘  \n",
+              "                                                                                                                 \n",
+              "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+              "\n"
+            ],
+            "text/plain": [
+              "\u001b[92m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  \u001b[1;92m>> Pipeline Completed\u001b[0m                                                                                          \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m    \u001b[1mTotal cost:\u001b[0m \u001b[92m$47.67\u001b[0m                                                                                           \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep              \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress            \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m In\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │  20 │ 200 │ \u001b[2m       0\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2m-                 \u001b[0m │      19s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 200 │ 965 │ \u001b[91m      33\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2mdate_close not    \u001b[0m │      15s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mafter event_date  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(33)              \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 965 │ 796 │ \u001b[91m     169\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2mUndetermined label\u001b[0m │    1m 8s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(164), Resolution \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mdate is before    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mseed creation date\u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(5)               \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 796 │ 795 │ \u001b[91m       1\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2m \u001b[0m │  11m 37s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Request timed \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mout.              \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    None          \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Connection    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2merror.            \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    None          \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m\u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Connection    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2merror.            \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(1)               \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "948 samples (78.6% valid)\n"
+          ]
+        }
+      ],
+      "source": [
+        "dataset = lr.transforms.run(pipeline, max_questions=1000, name=\"Golf forecasting\")\n",
+        "samples = dataset.download()\n",
+        "\n",
+        "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n",
+        "print(f\"{len(samples)} samples ({pct:.1f}% valid)\")"
       ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "586e0e7d869d495b9daacd16e3389bcf",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "New Data Upload: |          |  0.00B /  0.00B            "
+      "cell_type": "markdown",
+      "id": "fdbi5exhd6c",
+      "metadata": {},
+      "source": [
+        "## Prepare the dataset\n",
+        "\n",
+        "Use SDK utils to filter valid samples, deduplicate, and split into train/test sets. We filter by `date_close <= today` to include only questions that have already resolved."
       ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f7672f6275a44c16983dc4a5c2c1df27",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards:   0%|          | 0/1 [00:00\n",
+              "\n",
+              "\n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "
sample_idis_validquestion_textdate_closeevent_dateresolution_criteriaprediction_datelabelanswer_typelabel_confidence...reasoninganswer_sourcesseed_textseed_urlseed_creation_dateseed_search_querycontextmeta_sample_idmeta_parent_sample_idmeta_processing_time_ms
023a607a2-e9db-45a9-8e33-67cd16a32b56TrueWill the Eastern Michigan University women's g...2025-05-10T00:00:002024-07-15T00:00:00The question resolves to 'Yes' if Eastern Mich...2024-07-15T00:00:000binary1.00...The Eastern Michigan University (EMU) women's ...https://vertexaisearch.cloud.google.com/ground...Eastern Michigan Athletics\\nCaterina Don Named...https://emueagles.com/news/2024/7/9/womens-gol...2024-07-15T00:00:00women's golf[{'rendered_context': '', 'search_query': 'Eas...fa146afa-b53f-48c1-8d2d-a81ce2dec41b0107be94-88d9-4068-a355-ec38b8691376844641.292
12736b5ea-b6b0-4fde-a237-96f6a3d9ee86TrueWill an Arizona Wildcats player be named the B...2025-05-01T00:00:002024-07-15T00:00:00The question resolves to 'Yes' if the Big 12 C...2024-07-15T00:00:001binary1.00...The Arizona Wildcats officially joined the Big...https://vertexaisearch.cloud.google.com/ground...TUCSON, Ariz. – Arizona Women's Golf Head Coac...https://arizonawildcats.com/news/2024/7/15/bra...2024-07-15T00:00:00women's golf[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Arizon...ece95e4d-151b-4af8-936a-5c7e18276b97f7429e77-a524-46dd-ae47-821794e79938988488.435
22eae4276-f449-45e9-8973-e760b6d36d61TrueWill Caterina Don remain in her role as the As...2025-05-31T00:00:002024-07-15T00:00:00The question resolves to 'Yes' if Caterina Don...2024-07-15T00:00:001binary0.95...Caterina Don was hired as the first full-time ...https://vertexaisearch.cloud.google.com/ground...Eastern Michigan Athletics\\nCaterina Don Named...https://emueagles.com/news/2024/7/9/womens-gol...2024-07-15T00:00:00women's golf[{'rendered_context': '', 'search_query': 'Cat...4fbfad5d-b425-4cfe-b04d-1db1279c80a80107be94-88d9-4068-a355-ec38b8691376485377.689
335b2be70-0b7c-4b1b-b82f-2e3f0d3b61d8TrueWill the University of North Carolina women's ...2025-04-15T00:00:002024-07-15T00:00:00The question resolves to Yes if the UNC women'...2024-07-15T00:00:001binary1.00...The University of North Carolina women's golf ...https://vertexaisearch.cloud.google.com/ground...University of North Carolina Athletics\\nNeff's...https://goheels.com/news/2024/7/15/neffs-contr...2024-07-15T00:00:00women's golf[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] 2024-2...05a3aff2-dee2-4111-a0f4-c085ba138679c9839639-2fd2-4f0a-ad52-4566bbde89be1052828.897
4382c5c2e-db8f-4b8c-bbe8-6fc4e55edf53TrueWill the University of Arizona Women's Golf te...2025-04-15T00:00:002024-07-15T00:00:00The question resolves to 'Yes' if the Universi...2024-07-15T00:00:001binary1.00...The University of Arizona Women's Golf team wo...https://vertexaisearch.cloud.google.com/ground...TUCSON, Ariz. – Arizona Women's Golf Head Coac...https://arizonawildcats.com/news/2024/7/15/bra...2024-07-15T00:00:00women's golf[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Arizon...1fd71efb-9506-4c76-b1b6-d8eed1a30ac7f7429e77-a524-46dd-ae47-821794e799381089828.234
\n", + "

5 rows × 21 columns

\n", + "" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 23a607a2-e9db-45a9-8e33-67cd16a32b56 True \n", + "1 2736b5ea-b6b0-4fde-a237-96f6a3d9ee86 True \n", + "2 2eae4276-f449-45e9-8973-e760b6d36d61 True \n", + "3 35b2be70-0b7c-4b1b-b82f-2e3f0d3b61d8 True \n", + "4 382c5c2e-db8f-4b8c-bbe8-6fc4e55edf53 True \n", + "\n", + " question_text date_close \\\n", + "0 Will the Eastern Michigan University women's g... 2025-05-10T00:00:00 \n", + "1 Will an Arizona Wildcats player be named the B... 2025-05-01T00:00:00 \n", + "2 Will Caterina Don remain in her role as the As... 2025-05-31T00:00:00 \n", + "3 Will the University of North Carolina women's ... 2025-04-15T00:00:00 \n", + "4 Will the University of Arizona Women's Golf te... 2025-04-15T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2024-07-15T00:00:00 The question resolves to 'Yes' if Eastern Mich... \n", + "1 2024-07-15T00:00:00 The question resolves to 'Yes' if the Big 12 C... \n", + "2 2024-07-15T00:00:00 The question resolves to 'Yes' if Caterina Don... \n", + "3 2024-07-15T00:00:00 The question resolves to Yes if the UNC women'... \n", + "4 2024-07-15T00:00:00 The question resolves to 'Yes' if the Universi... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2024-07-15T00:00:00 0 binary 1.00 ... \n", + "1 2024-07-15T00:00:00 1 binary 1.00 ... \n", + "2 2024-07-15T00:00:00 1 binary 0.95 ... \n", + "3 2024-07-15T00:00:00 1 binary 1.00 ... \n", + "4 2024-07-15T00:00:00 1 binary 1.00 ... \n", + "\n", + " reasoning \\\n", + "0 The Eastern Michigan University (EMU) women's ... \n", + "1 The Arizona Wildcats officially joined the Big... \n", + "2 Caterina Don was hired as the first full-time ... \n", + "3 The University of North Carolina women's golf ... \n", + "4 The University of Arizona Women's Golf team wo... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... 
\n", + "1 https://vertexaisearch.cloud.google.com/ground... \n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Eastern Michigan Athletics\\nCaterina Don Named... \n", + "1 TUCSON, Ariz. – Arizona Women's Golf Head Coac... \n", + "2 Eastern Michigan Athletics\\nCaterina Don Named... \n", + "3 University of North Carolina Athletics\\nNeff's... \n", + "4 TUCSON, Ariz. – Arizona Women's Golf Head Coac... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://emueagles.com/news/2024/7/9/womens-gol... 2024-07-15T00:00:00 \n", + "1 https://arizonawildcats.com/news/2024/7/15/bra... 2024-07-15T00:00:00 \n", + "2 https://emueagles.com/news/2024/7/9/womens-gol... 2024-07-15T00:00:00 \n", + "3 https://goheels.com/news/2024/7/15/neffs-contr... 2024-07-15T00:00:00 \n", + "4 https://arizonawildcats.com/news/2024/7/15/bra... 2024-07-15T00:00:00 \n", + "\n", + " seed_search_query context \\\n", + "0 women's golf [{'rendered_context': '', 'search_query': 'Eas... \n", + "1 women's golf [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Arizon... \n", + "2 women's golf [{'rendered_context': '', 'search_query': 'Cat... \n", + "3 women's golf [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] 2024-2... \n", + "4 women's golf [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Arizon... 
\n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 fa146afa-b53f-48c1-8d2d-a81ce2dec41b 0107be94-88d9-4068-a355-ec38b8691376 \n", + "1 ece95e4d-151b-4af8-936a-5c7e18276b97 f7429e77-a524-46dd-ae47-821794e79938 \n", + "2 4fbfad5d-b425-4cfe-b04d-1db1279c80a8 0107be94-88d9-4068-a355-ec38b8691376 \n", + "3 05a3aff2-dee2-4111-a0f4-c085ba138679 c9839639-2fd2-4f0a-ad52-4566bbde89be \n", + "4 1fd71efb-9506-4c76-b1b6-d8eed1a30ac7 f7429e77-a524-46dd-ae47-821794e79938 \n", + "\n", + " meta_processing_time_ms \n", + "0 844641.292 \n", + "1 988488.435 \n", + "2 485377.689 \n", + "3 1052828.897 \n", + "4 1089828.234 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test: 143 rows, 32.2% yes\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idis_validquestion_textdate_closeevent_dateresolution_criteriaprediction_datelabelanswer_typelabel_confidence...reasoninganswer_sourcesseed_textseed_urlseed_creation_dateseed_search_querycontextmeta_sample_idmeta_parent_sample_idmeta_processing_time_ms
069d062b4-681c-431f-9e01-e13befba3ea0TrueWill Luke Clanton finish in the top 10 of the ...2025-07-07T00:00:002025-06-24T00:00:00The question resolves to 'Yes' if Luke Clanton...2025-06-24T00:00:000binary1.0...Luke Clanton participated in the 2025 John Dee...https://vertexaisearch.cloud.google.com/ground...Title: No. 15 Ben Griffin, rising star Luke Cl...https://www.wqad.com/article/sports/john-deere...2025-06-24T00:00:00golf world rankings[{'rendered_context': '', 'search_query': 'Luk...7cdbf935-029e-4ce5-bcb4-d6cbf016303ac96f580b-8dff-42a0-90dc-3e5c888680c6489996.714
1c3ef9b69-79e5-42c4-a26c-b0d02bf82abbTrueWill Ben Griffin be ranked in the top 10 of th...2025-07-07T00:00:002025-06-24T00:00:00The question resolves to 'Yes' if Ben Griffin'...2025-06-24T00:00:000binary1.0...Ben Griffin was ranked No. 17 in the Official ...https://vertexaisearch.cloud.google.com/ground...Title: No. 15 Ben Griffin, rising star Luke Cl...https://www.wqad.com/article/sports/john-deere...2025-06-24T00:00:00golf world rankings[{'rendered_context': '', 'search_query': 'Ben...c3a577ab-069d-4857-8c52-9fce6f44e876c96f580b-8dff-42a0-90dc-3e5c888680c6493772.150
2e1d9f94a-8fa0-4caf-a85b-2f2602e7e9aeTrueWill Luke Clanton outscore Ben Griffin in the ...2025-07-04T00:00:002025-06-24T00:00:00The question resolves to 'Yes' if Luke Clanton...2025-06-24T00:00:001binary1.0...The first round of the 2025 John Deere Classic...https://vertexaisearch.cloud.google.com/ground...Title: No. 15 Ben Griffin, rising star Luke Cl...https://www.wqad.com/article/sports/john-deere...2025-06-24T00:00:00golf world rankings[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Player...41c1a085-7b54-4a59-b7c8-145a2c9f225fc96f580b-8dff-42a0-90dc-3e5c888680c6693496.445
3093a04a1-03f4-429c-a8cd-c2f7c8ea098cTrueWill Jordan Smith win the 2025 Italian Open?2025-06-30T00:00:002025-06-25T00:00:00This question resolves to Yes if Jordan Smith ...2025-06-25T00:00:000binary1.0...The 2025 Italian Open (golf) took place from J...https://vertexaisearch.cloud.google.com/ground...Title: 2025 Italian Open betting tips: Our exp...https://www.todays-golfer.com/news-and-events/...2025-06-25T00:00:00European Tour golf[{'rendered_context': '', 'search_query': 'Jor...cebd9023-809d-441b-9ef5-251381880f5c20e82969-aad5-477a-8e2e-cd641f7d7eec493786.983
424bfbd2f-7a6c-4383-82aa-94d73f429685TrueWill Eddie Pepperell win at least one tourname...2025-11-30T00:00:002025-06-25T00:00:00The question resolves to 'Yes' if Eddie Pepper...2025-06-25T00:00:000binary0.9...The close date is 2025-11-30, and the question...https://vertexaisearch.cloud.google.com/ground...Title: Eddie Pepperell feeling refreshed after...https://www.europeantour.com/dpworld-tour/news...2025-06-25T00:00:00European Tour golf[{'rendered_context': '', 'search_query': 'Edd...a43d140a-a541-4537-8aab-6523ecbf79ce3fa59bd7-f4b8-4ea9-b5b2-d8cba7d5ce0d512717.986
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 69d062b4-681c-431f-9e01-e13befba3ea0 True \n", + "1 c3ef9b69-79e5-42c4-a26c-b0d02bf82abb True \n", + "2 e1d9f94a-8fa0-4caf-a85b-2f2602e7e9ae True \n", + "3 093a04a1-03f4-429c-a8cd-c2f7c8ea098c True \n", + "4 24bfbd2f-7a6c-4383-82aa-94d73f429685 True \n", + "\n", + " question_text date_close \\\n", + "0 Will Luke Clanton finish in the top 10 of the ... 2025-07-07T00:00:00 \n", + "1 Will Ben Griffin be ranked in the top 10 of th... 2025-07-07T00:00:00 \n", + "2 Will Luke Clanton outscore Ben Griffin in the ... 2025-07-04T00:00:00 \n", + "3 Will Jordan Smith win the 2025 Italian Open? 2025-06-30T00:00:00 \n", + "4 Will Eddie Pepperell win at least one tourname... 2025-11-30T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2025-06-24T00:00:00 The question resolves to 'Yes' if Luke Clanton... \n", + "1 2025-06-24T00:00:00 The question resolves to 'Yes' if Ben Griffin'... \n", + "2 2025-06-24T00:00:00 The question resolves to 'Yes' if Luke Clanton... \n", + "3 2025-06-25T00:00:00 This question resolves to Yes if Jordan Smith ... \n", + "4 2025-06-25T00:00:00 The question resolves to 'Yes' if Eddie Pepper... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2025-06-24T00:00:00 0 binary 1.0 ... \n", + "1 2025-06-24T00:00:00 0 binary 1.0 ... \n", + "2 2025-06-24T00:00:00 1 binary 1.0 ... \n", + "3 2025-06-25T00:00:00 0 binary 1.0 ... \n", + "4 2025-06-25T00:00:00 0 binary 0.9 ... \n", + "\n", + " reasoning \\\n", + "0 Luke Clanton participated in the 2025 John Dee... \n", + "1 Ben Griffin was ranked No. 17 in the Official ... \n", + "2 The first round of the 2025 John Deere Classic... \n", + "3 The 2025 Italian Open (golf) took place from J... \n", + "4 The close date is 2025-11-30, and the question... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... \n", + "1 https://vertexaisearch.cloud.google.com/ground... 
\n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Title: No. 15 Ben Griffin, rising star Luke Cl... \n", + "1 Title: No. 15 Ben Griffin, rising star Luke Cl... \n", + "2 Title: No. 15 Ben Griffin, rising star Luke Cl... \n", + "3 Title: 2025 Italian Open betting tips: Our exp... \n", + "4 Title: Eddie Pepperell feeling refreshed after... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://www.wqad.com/article/sports/john-deere... 2025-06-24T00:00:00 \n", + "1 https://www.wqad.com/article/sports/john-deere... 2025-06-24T00:00:00 \n", + "2 https://www.wqad.com/article/sports/john-deere... 2025-06-24T00:00:00 \n", + "3 https://www.todays-golfer.com/news-and-events/... 2025-06-25T00:00:00 \n", + "4 https://www.europeantour.com/dpworld-tour/news... 2025-06-25T00:00:00 \n", + "\n", + " seed_search_query context \\\n", + "0 golf world rankings [{'rendered_context': '', 'search_query': 'Luk... \n", + "1 golf world rankings [{'rendered_context': '', 'search_query': 'Ben... \n", + "2 golf world rankings [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Player... \n", + "3 European Tour golf [{'rendered_context': '', 'search_query': 'Jor... \n", + "4 European Tour golf [{'rendered_context': '', 'search_query': 'Edd... 
\n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 7cdbf935-029e-4ce5-bcb4-d6cbf016303a c96f580b-8dff-42a0-90dc-3e5c888680c6 \n", + "1 c3a577ab-069d-4857-8c52-9fce6f44e876 c96f580b-8dff-42a0-90dc-3e5c888680c6 \n", + "2 41c1a085-7b54-4a59-b7c8-145a2c9f225f c96f580b-8dff-42a0-90dc-3e5c888680c6 \n", + "3 cebd9023-809d-441b-9ef5-251381880f5c 20e82969-aad5-477a-8e2e-cd641f7d7eec \n", + "4 a43d140a-a541-4537-8aab-6523ecbf79ce 3fa59bd7-f4b8-4ea9-b5b2-d8cba7d5ce0d \n", + "\n", + " meta_processing_time_ms \n", + "0 489996.714 \n", + "1 493772.150 \n", + "2 693496.445 \n", + "3 493786.983 \n", + "4 512717.986 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from lightningrod import filter_and_split\n", + "\n", + "train_dataset, test_dataset = filter_and_split(\n", + " dataset,\n", + " test_size=0.2,\n", + " split_strategy=\"temporal\",\n", + " days_to_resolution_range=(1, None), # at least 1 day to resolution\n", + ")\n", + "\n", + "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n", + " data = ds.flattened()\n", + " yes_count = sum(1 for s in data if s.get(\"label\") in (1, \"1\", 1.0))\n", + " print(f\"{name}: {len(data)} rows, {yes_count/len(data)*100:.1f}% yes\")\n", + " display(pd.DataFrame(data).head())" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "41a845cd3c3a44f3bd221b60dc5316ac", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Creating parquet from Arrow format: 0%| | 0/1 [00:00╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + " \n", + " >> Training COMPLETED \n", + " \n", + " Job: Golf forecasting \n", + " \n", + " Reward: latest -0.9948 avg -0.8261 (11 steps) (higher is better) \n", + " \n", + " Cost: $0.19 \n", + " \n", + " \n", + 
"╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1;92m>> Training COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mJob:\u001b[0m Golf forecasting \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -0.9948 avg -0.8261 (11 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.19 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job 6c82c197-0627-4ee8-954c-d5ddb93e66f2 completed with status: COMPLETED\n", + "Trained model ID: checkpoint:6c82c197-0627-4ee8-954c-d5ddb93e66f2\n" + ] + } + ], + "source": [ + "job = lr.training.run(config, dataset=train_dataset, name=\"Golf forecasting\")\n", + "print(f\"Job {job.id} completed with status: {job.status}\")\n", + "print(f\"Trained model ID: {job.model_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c74226d", + "metadata": {}, + "source": [ + "## Inference with your trained model\n", + "\n", + "Use `lr.predict()` to run inference with your trained model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ba1dcfc5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.35\n" + ] + } + ], + "source": [ + "print(lr.predict(job.model_id, \"Will Scottie Scheffler win the 2026 Masters?\"))" + ] + }, + { + "cell_type": "markdown", + "id": "76781ccb", + "metadata": {}, + "source": [ + "## Run evals on trained model\n", + "\n", + "Run test evals on your trained model against the test dataset. The eval job runs the model on the dataset and reports metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e81dc80", + "metadata": {}, + "outputs": [], + "source": [ + "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "62718605", + "metadata": {}, + "source": [ + "> Note: the trained model checkpoint will only be available for 7 days. If you wish to host this model long-term, reach out to us at support@lightningrod.ai." 
] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" } - ], - "source": [ - "%pip install datasets -q\n", - "\n", - "from datasets import Dataset, DatasetDict\n", - "from lightningrod.utils import config\n", - "\n", - "dataset = DatasetDict({\n", - " \"train\": Dataset.from_list(train_dataset.flattened()),\n", - " \"test\": Dataset.from_list(test_dataset.flattened()),\n", - "})\n", - "print(f\"Train: {len(dataset['train'])} rows, Test: {len(dataset['test'])} rows\")\n", - "print(\"Columns:\", dataset[\"train\"].column_names[:8], \"...\")\n", - "\n", - "DATASET_PATH = f\"{config.get_config_value('HF_USERNAME')}/golf-forecasting-demo\"\n", - "dataset.push_to_hub(DATASET_PATH, token=config.get_config_value(\"HF_ACCESS_TOKEN\"))" - ] - }, - { - "cell_type": "markdown", - "id": "part2", - "metadata": {}, - "source": [ - "## Model Training\n", - "\n", - "We used the generated dataset above to fine-tune a forecasting model via RL on 3,178 forecasting questions, surpassing GPT-5 performance.\n", - "\n", - "**For more details on methods, results, and data:**\n", - "- **[Golf-Forecaster Model](https://huggingface.co/LightningRodLabs/Golf-Forecaster)**\n", - "- **[Golf-Forecaster Dataset](https://huggingface.co/datasets/LightningRodLabs/GolfForecasting)**\n", - "\n", - "![Brier Skill Score](https://huggingface.co/datasets/LightningRodLabs/GolfForecasting/resolve/main/brier_skill_score.png)\n", - "\n", - "**Coming Soon:** Seamlessly generate datasets, fine-tune, and evaluate your own forecasting models end-to-end on the Lightningrod platform.\n", - " \n", - "\ud83d\udc49 [Sign up to get early access and updates.](https://lightningrod.ai/)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (lightningrod-sdk)", - "language": "python", - "name": "lightningrod-sdk" + ], + "metadata": { + "kernelspec": { + "display_name": "Python (lightningrod-sdk)", + "language": "python", + "name": "lightningrod-sdk" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "nbformat": 4, + "nbformat_minor": 5 +} From daddd24af3f25a6f45d33d765d8d87cd79b7641b Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Wed, 18 Mar 2026 17:20:24 +0100 Subject: [PATCH 2/5] update & rerun trump example --- .../fine_tuning/02_trump_forecasting.ipynb | 1563 ++++++++++++----- 1 file changed, 1127 insertions(+), 436 deletions(-) diff --git a/notebooks/fine_tuning/02_trump_forecasting.ipynb b/notebooks/fine_tuning/02_trump_forecasting.ipynb index 3028466..b2abe08 100644 --- a/notebooks/fine_tuning/02_trump_forecasting.ipynb +++ b/notebooks/fine_tuning/02_trump_forecasting.ipynb @@ -1,479 +1,1170 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "7d4d26cb", - "metadata": {}, - "source": [ - "# WWTD-2025 (What Would Trump Do?)\n", - "\n", - "Generate a forecasting dataset about Trump's actions, decisions, and statements using the LightningRod SDK. This example showcases dataset generation, preparation with SDK utils, and training results from our experiments\u2014including evaluation with and without context." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6f2c4443", - "metadata": {}, - "outputs": [ + "cells": [ { - "data": { - "text/plain": [ - "True" + "cell_type": "markdown", + "id": "7d4d26cb", + "metadata": {}, + "source": [ + "# WWTD-2025 (What Would Trump Do?)\n", + "\n", + "Generate a forecasting dataset about Trump's actions, decisions, and statements using the LightningRod SDK. This example showcases dataset generation, preparation with SDK utils, and training results from our experiments—including evaluation with and without context." ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%pip install lightningrod-ai python-dotenv pandas\n", - "\n", - "from IPython.display import clear_output\n", - "clear_output()\n", - "\n", - "from datetime import datetime\n", - "\n", - "import pandas as pd\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ] - }, - { - "cell_type": "markdown", - "id": "f523d274", - "metadata": {}, - "source": [ - "## Set up the client\n", - "\n", - "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7ca71c31", - "metadata": {}, - "outputs": [], - "source": [ - "from lightningrod import LightningRod\n", - "from lightningrod.utils import config\n", - "\n", - "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n", - "lr = LightningRod(api_key=api_key)" - ] - }, - { - "cell_type": "markdown", - "id": "082d4f24", - "metadata": {}, - "source": [ - "## Build the pipeline\n", - "\n", - "Configure the pipeline with domain-specific instructions and examples for Trump-related forecasting." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "faccbe0a", - "metadata": {}, - "outputs": [], - "source": [ - "instructions = \"\"\"\n", - "Generate binary forecasting questions about Trump's actions, decisions, positions, and statements.\n", - "Questions should be diverse, related to the content, and should evenly cover the full range from very likely to very unlikely.\n", - "Horizon: outcomes should be known within 2 months of the question date, and may be known much sooner.\n", - "Criteria: binary outcome, exact dates, self-contained, verifiable via web search, newsworthy.\n", - "\"\"\"\n", - "\n", - "good_examples = [\n", - " \"Will Trump impose 25% tariffs on all goods from Canada by February 1, 2025?\",\n", - " \"Will Trump issue pardons to January 6 defendants within his first week in office?\",\n", - " \"Will Pete Hegseth be confirmed as Secretary of Defense by February 15, 2025?\",\n", - " \"Will Trump sign an executive order to keep TikTok operational in the US by January 31, 2025?\",\n", - " \"Will Kash Patel be confirmed as FBI Director by March 1, 2025?\",\n", - "]\n", - "\n", - "bad_examples = [\n", - " \"Will Trump do something controversial? (too vague)\",\n", - " \"Will Trump be in the news? (obvious)\",\n", - " \"Will tariffs be imposed? 
(needs specifics)\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "4ce2d710", - "metadata": {}, - "outputs": [], - "source": [ - "from lightningrod import (\n", - " BinaryAnswerType,\n", - " NewsSeedGenerator,\n", - " ForwardLookingQuestionGenerator,\n", - " NewsContextGenerator,\n", - " WebSearchLabeler,\n", - " QuestionPipeline,\n", - ")\n", - "\n", - "answer_type = BinaryAnswerType()\n", - "\n", - "pipeline = QuestionPipeline(\n", - " seed_generator=NewsSeedGenerator(\n", - " start_date=datetime(2025, 1, 1),\n", - " end_date=datetime(2026, 1, 1),\n", - " interval_duration_days=7,\n", - " search_query=[\n", - " \"Donald Trump domestic policy agenda\",\n", - " \"Donald Trump trade and tariff actions\",\n", - " \"Donald Trump foreign policy decisions\",\n", - " \"Donald Trump interviews and press appearances\",\n", - " \"Donald Trump lawsuits and court rulings\",\n", - " ],\n", - " articles_per_search=10,\n", - " ),\n", - " question_generator=ForwardLookingQuestionGenerator(\n", - " instructions=instructions,\n", - " examples=good_examples,\n", - " bad_examples=bad_examples,\n", - " answer_type=answer_type,\n", - " questions_per_seed=20,\n", - " ),\n", - " context_generators=[\n", - " NewsContextGenerator(\n", - " articles_per_query=3,\n", - " num_search_queries=1,\n", - " num_articles=5,\n", - " )\n", - " ],\n", - " labeler=WebSearchLabeler(answer_type=answer_type),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1603b3de", - "metadata": {}, - "source": [ - "## Run the pipeline\n", - "\n", - "This will collect news articles, generate questions, and find answers. Use `max_questions` to limit the run for testing." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4de4b87c", - "metadata": {}, - "outputs": [ + }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "80f2463c20044d2c9ab2ace831e2adff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" + "cell_type": "code", + "execution_count": 13, + "id": "6f2c4443", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%pip install lightningrod-ai python-dotenv pandas openai\n", + "\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "\n", + "from datetime import datetime\n", + "\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
+      "cell_type": "markdown",
+      "id": "f523d274",
+      "metadata": {},
+      "source": [
+        "## Set up the client\n",
+        "\n",
+        "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**."
+      ]
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "178 samples (46.1% valid)\n"
-     ]
-    }
-   ],
-   "source": [
-    "dataset = lr.transforms.run(pipeline, max_questions=500, name=\"WWTD-2025\")\n",
-    "\n",
-    "samples = dataset.download()\n",
-    "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n",
-    "print(f\"{len(samples)} samples ({pct:.1f}% valid)\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "91866bf7",
-   "metadata": {},
-   "source": [
-    "## Prepare the dataset\n",
-    "\n",
-    "Use SDK utils to filter valid samples, deduplicate, and split into train/test sets. We filter by `date_close <= today` to only include questions that have already resolved."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5e4d3f2a",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 14,
+      "id": "7ca71c31",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from lightningrod import LightningRod\n",
+        "from lightningrod.utils import config\n",
+        "\n",
+        "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n",
+        "lr = LightningRod(api_key=api_key)"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train: 39 rows, 12.8% yes\n",
-      "Test: 13 rows, 38.5% yes\n"
-     ]
-    }
-   ],
-   "source": [
-    "from lightningrod import filter_and_split\n",
-    "\n",
-    "train_dataset, test_dataset = filter_and_split(\n",
-    "    dataset,\n",
-    "    test_size=0.2,\n",
-    "    split_strategy=\"temporal\",\n",
-    "    days_to_resolution_range=(1, 60),  # horizon within 2 months\n",
-    ")\n",
-    "\n",
-    "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n",
-    "    data = ds.flattened()\n",
-    "    yes_count = sum(1 for s in data if s.get(\"label\") in (1, \"1\", 1.0))\n",
-    "    print(f\"{name}: {len(data)} rows, {yes_count/len(data)*100:.1f}% yes\")\n",
-    "    display(pd.DataFrame(data).head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0e799cfe",
-   "metadata": {},
-   "source": [
-    "## Uploading the dataset to HuggingFace\n",
-    "\n",
-    "Once we have a training-ready dataset, we can push it to Hugging Face for sharing or downstream use."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "91093200",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "082d4f24",
+      "metadata": {},
+      "source": [
+        "## Build the pipeline\n",
+        "\n",
+        "Configure the pipeline with domain-specific instructions and examples for Trump-related forecasting."
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
-      "Note: you may need to restart the kernel to use updated packages.\n",
-      "Train: 58 rows, Test: 17 rows\n",
-      "Columns: ['question_text', 'date_close', 'event_date', 'resolution_criteria', 'prediction_date', 'label', 'answer_type', 'label_confidence'] ...\n"
-     ]
+      "cell_type": "code",
+      "execution_count": 15,
+      "id": "faccbe0a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "instructions = \"\"\"\n",
+        "Generate binary forecasting questions about Trump's actions, decisions, positions, and statements.\n",
+        "Questions should be diverse, related to the content, and should evenly cover the full range from very likely to very unlikely.\n",
+        "Horizon: outcomes should be known within 2 months of the question date, and may be known much sooner.\n",
+        "Criteria: binary outcome, exact dates, self-contained, verifiable via web search, newsworthy.\n",
+        "\"\"\"\n",
+        "\n",
+        "good_examples = [\n",
+        "    \"Will Trump impose 25% tariffs on all goods from Canada by February 1, 2025?\",\n",
+        "    \"Will Trump issue pardons to January 6 defendants within his first week in office?\",\n",
+        "    \"Will Pete Hegseth be confirmed as Secretary of Defense by February 15, 2025?\",\n",
+        "    \"Will Trump sign an executive order to keep TikTok operational in the US by January 31, 2025?\",\n",
+        "    \"Will Kash Patel be confirmed as FBI Director by March 1, 2025?\",\n",
+        "]\n",
+        "\n",
+        "bad_examples = [\n",
+        "    \"Will Trump do something controversial? (too vague)\",\n",
+        "    \"Will Trump be in the news? (obvious)\",\n",
+        "    \"Will tariffs be imposed? (needs specifics)\",\n",
+        "]"
+      ]
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0b949581ce354458a957d2f6047eb184",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards:   0%|          | 0/1 [00:00╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+              "                                                                                                                 \n",
+              "  >> Pipeline Completed                                                                                          \n",
+              "                                                                                                                 \n",
+              "    Total cost: $47.21                                                                                           \n",
+              "                                                                                                                 \n",
+              "  ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓  \n",
+              " Step                Progress               In  Out  Rejected  Errors  Rejection Reasons   Duration \n",
+              "  ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩  \n",
+              " NewsSeedGenerator… Complete             │  20 │ 195 │        0     0-                  │       7s │  \n",
+              " ForwardLookingQue… Complete             │ 195 │ 912 │       57     0date_close not     │      12s │  \n",
+              "                    │                      │     │     │          │        │ after event_date   │          │  \n",
+              "                    │                      │     │     │          │        │ (57)               │          │  \n",
+              " WebSearchLabelerT… Complete             │ 912 │ 775 │      137     0Resolution date is │    1m 9s │  \n",
+              "                    │                      │     │     │          │        │ before seed        │          │  \n",
+              "                    │                      │     │     │          │        │ creation date      │          │  \n",
+              "                    │                      │     │     │          │        │ (96), Undetermined │          │  \n",
+              "                    │                      │     │     │          │        │ label (40), Low    │          │  \n",
+              "                    │                      │     │     │          │        │ confidence: 0.80 < │          │  \n",
+              "                    │                      │     │     │          │        │ 0.9 (1)            │          │  \n",
+              " NewsContextGenera… Complete             │ 775 │ 756 │       19     0<failed_attempts>  │  11m 32s │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <generation        │          │  \n",
+              "                    │                      │     │     │          │        │ number=\"1\">        │          │  \n",
+              "                    │                      │     │     │          │        │ <exception>        │          │  \n",
+              "                    │                      │     │     │          │        │     Connection     │          │  \n",
+              "                    │                      │     │     │          │        │ error.             │          │  \n",
+              "                    │                      │     │     │          │        │ </exception>       │          │  \n",
+              "                    │                      │     │     │          │        │ <completion>       │          │  \n",
+              "                    │                      │     │     │          │        │     None           │          │  \n",
+              "                    │                      │     │     │          │        │ </completion>      │          │  \n",
+              "                    │                      │     │     │          │        │ </generation>      │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <generation        │          │  \n",
+              "                    │                      │     │     │          │        │ number=\"2\">        │          │  \n",
+              "                    │                      │     │     │          │        │ <exception>        │          │  \n",
+              "                    │                      │     │     │          │        │     Connection     │          │  \n",
+              "                    │                      │     │     │          │        │ error.             │          │  \n",
+              "                    │                      │     │     │          │        │ </exception>       │          │  \n",
+              "                    │                      │     │     │          │        │ <completion>       │          │  \n",
+              "                    │                      │     │     │          │        │     None           │          │  \n",
+              "                    │                      │     │     │          │        │ </completion>      │          │  \n",
+              "                    │                      │     │     │          │        │ </generation>      │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ </failed_attempts> │          │  \n",
+              "                    │                      │     │     │          │        │                    │          │  \n",
+              "                    │                      │     │     │          │        │ <last_exception>   │          │  \n",
+              "                    │                      │     │     │          │        │     Connection     │          │  \n",
+              "                    │                      │     │     │          │        │ error.             │          │  \n",
+              "                    │                      │     │     │          │        │ </last_exception>  │          │  \n",
+              "                    │                      │     │     │          │        │ (19)               │          │  \n",
+              "  └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘  \n",
+              "                                                                                                                 \n",
+              "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+              "\n"
+            ],
+            "text/plain": [
+              "\u001b[92m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  \u001b[1;92m>> Pipeline Completed\u001b[0m                                                                                          \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m    \u001b[1mTotal cost:\u001b[0m \u001b[92m$47.21\u001b[0m                                                                                           \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep              \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress            \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m In\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │  20 │ 195 │ \u001b[2m       0\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2m-                 \u001b[0m │       7s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 195 │ 912 │ \u001b[91m      57\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2mdate_close not    \u001b[0m │      12s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mafter event_date  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(57)              \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 912 │ 775 │ \u001b[91m     137\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2mResolution date is\u001b[0m │    1m 9s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mbefore seed       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mcreation date     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(96), Undetermined\u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mlabel (40), Low   \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2mconfidence: 0.80 <\u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m0.9 (1)           \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete            \u001b[0m │ 775 │ 756 │ \u001b[91m      19\u001b[0m │ \u001b[2m     0\u001b[0m │ \u001b[2m \u001b[0m │  11m 32s │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Connection    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2merror.            \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    None          \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m       \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Connection    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2merror.            \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m      \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    None          \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m     \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m\u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m                  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m  \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m    Connection    \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2merror.            \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  │\u001b[1m                    \u001b[0m│                      │     │     │          │        │ \u001b[2m(19)              \u001b[0m │          │  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m  └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘  \u001b[92m│\u001b[0m\n",
+              "\u001b[92m│\u001b[0m                                                                                                                 \u001b[92m│\u001b[0m\n",
+              "\u001b[92m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "969 samples (78.0% valid)\n"
+          ]
+        }
+      ],
+      "source": [
+        "dataset = lr.transforms.run(pipeline, max_questions=1000, name=\"WWTD-2025\")\n",
+        "\n",
+        "samples = dataset.download()\n",
+        "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n",
+        "print(f\"{len(samples)} samples ({pct:.1f}% valid)\")"
       ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "54bf4eaee9e34b4db6e2b55054f4fc8f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "New Data Upload: |          |  0.00B /  0.00B            "
+      "cell_type": "markdown",
+      "id": "91866bf7",
+      "metadata": {},
+      "source": [
+        "## Prepare the dataset\n",
+        "\n",
+        "Use SDK utils to filter valid samples, deduplicate, and split into train/test sets. We filter by `date_close <= today` to only include questions that have already resolved."
       ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6e52a8a016c54295a661a53facb6eb69",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards:   0%|          | 0/1 [00:00\n",
+              "\n",
+              "\n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "
sample_idis_validquestion_textdate_closeevent_dateresolution_criteriaprediction_datelabelanswer_typelabel_confidence...reasoninganswer_sourcesseed_textseed_urlseed_creation_dateseed_search_querycontextmeta_sample_idmeta_parent_sample_idmeta_processing_time_ms
02835d25a-82fa-40e3-a706-d4b1cb202897TrueWill the 11th Circuit Court of Appeals issue a...2025-02-15T00:00:002025-01-08T00:00:00This question resolves to 'Yes' if the U.S. Co...2025-01-08T00:00:001binary1.00...On January 9, 2025, the U.S. Court of Appeals ...https://vertexaisearch.cloud.google.com/ground...Title: The Situation: Ending the Trump Cases t...https://www.lawfaremedia.org/article/the-situa...2025-01-08T00:00:00Donald Trump lawsuits and court rulings[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ...8cae0711-1038-445c-9974-59615767f2094346dfbf-438e-4860-9ce6-57f80f34844c801080.135
18899746b-3c8f-4862-898e-1ad2cea7033eTrueWill Donald Trump grant a formal presidential ...2025-02-28T00:00:002025-01-08T00:00:00This question resolves to 'Yes' if the White H...2025-01-08T00:00:000binary0.95...The close date for this question is 2025-02-28...https://vertexaisearch.cloud.google.com/ground...Title: The Situation: Ending the Trump Cases t...https://www.lawfaremedia.org/article/the-situa...2025-01-08T00:00:00Donald Trump lawsuits and court rulings[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ...a66dad42-22f8-4032-b9c6-b74d97342aa04346dfbf-438e-4860-9ce6-57f80f34844c484199.213
2990ae76f-9bcd-4e5d-9b45-70171de020c5TrueWill Justice Juan Merchan sentence Donald Trum...2025-03-01T00:00:002025-01-08T00:00:00This question resolves to 'Yes' if Justice Jua...2025-01-08T00:00:000binary1.00...The close date for this question is 2025-03-01...https://vertexaisearch.cloud.google.com/ground...Title: The Situation: Ending the Trump Cases t...https://www.lawfaremedia.org/article/the-situa...2025-01-08T00:00:00Donald Trump lawsuits and court rulings[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ...6729b46e-fc9e-48fc-81b4-d72fd7bf0eff4346dfbf-438e-4860-9ce6-57f80f34844c844466.578
3b1e6954e-ebfb-4ca3-898f-b5f37834e7c3TrueWill the criminal charges against Carlos De Ol...2025-03-05T00:00:002025-01-08T00:00:00This question resolves to 'Yes' if a federal c...2025-01-08T00:00:001binary1.00...The criminal charges against Carlos De Oliveir...https://vertexaisearch.cloud.google.com/ground...Title: The Situation: Ending the Trump Cases t...https://www.lawfaremedia.org/article/the-situa...2025-01-08T00:00:00Donald Trump lawsuits and court rulings[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ...0969a617-210f-4e8f-a337-12bec1836ab94346dfbf-438e-4860-9ce6-57f80f34844c156528.692
405cdc339-b9aa-4d88-8063-d841988ca680TrueWill Donald Trump announce a freeze on all new...2025-03-01T00:00:002025-01-10T00:00:00The question resolves to 'Yes' if Trump or the...2025-01-10T00:00:000binary0.95...Donald Trump was inaugurated on January 20, 20...https://vertexaisearch.cloud.google.com/ground...<html lang=\"en-US\"><head><title>Just a moment....https://www.politico.com/news/2025/01/10/spend...2025-01-10T00:00:00Donald Trump domestic policy agenda[{'rendered_context': '', 'search_query': 'Tru...b450791b-2a54-47b9-9258-eebe12232a838fab8604-34d6-46a1-9596-5de6247aa96e517310.074
\n", + "

5 rows × 21 columns

\n", + "" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 2835d25a-82fa-40e3-a706-d4b1cb202897 True \n", + "1 8899746b-3c8f-4862-898e-1ad2cea7033e True \n", + "2 990ae76f-9bcd-4e5d-9b45-70171de020c5 True \n", + "3 b1e6954e-ebfb-4ca3-898f-b5f37834e7c3 True \n", + "4 05cdc339-b9aa-4d88-8063-d841988ca680 True \n", + "\n", + " question_text date_close \\\n", + "0 Will the 11th Circuit Court of Appeals issue a... 2025-02-15T00:00:00 \n", + "1 Will Donald Trump grant a formal presidential ... 2025-02-28T00:00:00 \n", + "2 Will Justice Juan Merchan sentence Donald Trum... 2025-03-01T00:00:00 \n", + "3 Will the criminal charges against Carlos De Ol... 2025-03-05T00:00:00 \n", + "4 Will Donald Trump announce a freeze on all new... 2025-03-01T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2025-01-08T00:00:00 This question resolves to 'Yes' if the U.S. Co... \n", + "1 2025-01-08T00:00:00 This question resolves to 'Yes' if the White H... \n", + "2 2025-01-08T00:00:00 This question resolves to 'Yes' if Justice Jua... \n", + "3 2025-01-08T00:00:00 This question resolves to 'Yes' if a federal c... \n", + "4 2025-01-10T00:00:00 The question resolves to 'Yes' if Trump or the... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2025-01-08T00:00:00 1 binary 1.00 ... \n", + "1 2025-01-08T00:00:00 0 binary 0.95 ... \n", + "2 2025-01-08T00:00:00 0 binary 1.00 ... \n", + "3 2025-01-08T00:00:00 1 binary 1.00 ... \n", + "4 2025-01-10T00:00:00 0 binary 0.95 ... \n", + "\n", + " reasoning \\\n", + "0 On January 9, 2025, the U.S. Court of Appeals ... \n", + "1 The close date for this question is 2025-02-28... \n", + "2 The close date for this question is 2025-03-01... \n", + "3 The criminal charges against Carlos De Oliveir... \n", + "4 Donald Trump was inaugurated on January 20, 20... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... 
\n", + "1 https://vertexaisearch.cloud.google.com/ground... \n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Title: The Situation: Ending the Trump Cases t... \n", + "1 Title: The Situation: Ending the Trump Cases t... \n", + "2 Title: The Situation: Ending the Trump Cases t... \n", + "3 Title: The Situation: Ending the Trump Cases t... \n", + "4 Just a moment.... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "1 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "2 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "3 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "4 https://www.politico.com/news/2025/01/10/spend... 2025-01-10T00:00:00 \n", + "\n", + " seed_search_query \\\n", + "0 Donald Trump lawsuits and court rulings \n", + "1 Donald Trump lawsuits and court rulings \n", + "2 Donald Trump lawsuits and court rulings \n", + "3 Donald Trump lawsuits and court rulings \n", + "4 Donald Trump domestic policy agenda \n", + "\n", + " context \\\n", + "0 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ... \n", + "1 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ... \n", + "2 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ... \n", + "3 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ... \n", + "4 [{'rendered_context': '', 'search_query': 'Tru... 
\n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 8cae0711-1038-445c-9974-59615767f209 4346dfbf-438e-4860-9ce6-57f80f34844c \n", + "1 a66dad42-22f8-4032-b9c6-b74d97342aa0 4346dfbf-438e-4860-9ce6-57f80f34844c \n", + "2 6729b46e-fc9e-48fc-81b4-d72fd7bf0eff 4346dfbf-438e-4860-9ce6-57f80f34844c \n", + "3 0969a617-210f-4e8f-a337-12bec1836ab9 4346dfbf-438e-4860-9ce6-57f80f34844c \n", + "4 b450791b-2a54-47b9-9258-eebe12232a83 8fab8604-34d6-46a1-9596-5de6247aa96e \n", + "\n", + " meta_processing_time_ms \n", + "0 801080.135 \n", + "1 484199.213 \n", + "2 844466.578 \n", + "3 156528.692 \n", + "4 517310.074 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "113\n", + "Test: 113 rows, 30.1% yes\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample_id</th>\n", + " <th>is_valid</th>\n", + " <th>question_text</th>\n", + " <th>date_close</th>\n", + " <th>event_date</th>\n", + " <th>resolution_criteria</th>\n", + " <th>prediction_date</th>\n", + " <th>label</th>\n", + " <th>answer_type</th>\n", + " <th>label_confidence</th>\n", + " <th>...</th>\n", + " <th>reasoning</th>\n", + " <th>answer_sources</th>\n", + " <th>seed_text</th>\n", + " <th>seed_url</th>\n", + " <th>seed_creation_date</th>\n", + " <th>seed_search_query</th>\n", + " <th>context</th>\n", + " <th>meta_sample_id</th>\n", + " <th>meta_parent_sample_id</th>\n", + " <th>meta_processing_time_ms</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " 
<th>0</th>\n", + " <td>9983c1d7-3e95-4355-92b0-50bfd5ecea42</td>\n", + " <td>True</td>\n", + " <td>Will Donald Trump appear as a guest on The Pat...</td>\n", + " <td>2026-01-01T00:00:00</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>The question resolves to 'Yes' if Donald Trump...</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>Donald Trump made his first appearance on The ...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: Pat McAfee's Interview With Trump On ES...</td>\n", + " <td>https://www.outkick.com/analysis/pat-mcafees-i...</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>Donald Trump interviews and press appearances</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Donald...</td>\n", + " <td>c4eab320-fa8f-4ed0-afa4-c1d529213a0e</td>\n", + " <td>7865fd43-363a-467b-ab41-231e9dbe82d0</td>\n", + " <td>1040341.956</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>b95dc1c3-4166-4a15-a998-a50c3aa749e3</td>\n", + " <td>True</td>\n", + " <td>Will Donald Trump attend an NFL regular-season...</td>\n", + " <td>2026-01-06T00:00:00</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>The question resolves to 'Yes' if Donald Trump...</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>Donald Trump attended one NFL regular-season g...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: Pat McAfee's Interview With Trump On ES...</td>\n", + " <td>https://www.outkick.com/analysis/pat-mcafees-i...</td>\n", + " <td>2025-11-11T00:00:00</td>\n", + " <td>Donald Trump interviews and press appearances</td>\n", + " <td>[{'rendered_context': '', 'search_query': 'Don...</td>\n", + " <td>23daab19-f704-498d-b2ba-51882ec525b4</td>\n", + " 
<td>7865fd43-363a-467b-ab41-231e9dbe82d0</td>\n", + " <td>508128.710</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1d81af4a-4563-4e66-a8ac-db80996d2853</td>\n", + " <td>True</td>\n", + " <td>Will the 'National Center for Warrior Independ...</td>\n", + " <td>2025-12-15T00:00:00</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>The question resolves as 'Yes' if there is a v...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>The 'National Center for Warrior Independence'...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", + " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>Donald Trump interviews and press appearances</td>\n", + " <td>[{'rendered_context': '', 'search_query': 'Nat...</td>\n", + " <td>a533895c-cdae-4737-865f-a9cd51f43c92</td>\n", + " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", + " <td>506992.193</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>294259ba-7364-471c-85b5-ef69a1b93257</td>\n", + " <td>True</td>\n", + " <td>Will the United States federal government offi...</td>\n", + " <td>2025-12-31T00:00:00</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>A 'Yes' resolution requires an signed executiv...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>The United States federal government did not o...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", + " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>Donald Trump interviews and press appearances</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] 
Congre...</td>\n", + " <td>2e7d1409-2e8e-4dd4-ad50-8daaba3c1814</td>\n", + " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", + " <td>1238772.573</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>408de8f8-a4fe-4a5e-8185-74b21f577ae9</td>\n", + " <td>True</td>\n", + " <td>Will Doug Collins be the confirmed and serving...</td>\n", + " <td>2025-12-01T00:00:00</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>This question resolves as 'Yes' if Doug Collin...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>1</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>Doug Collins was confirmed by the United State...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", + " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", + " <td>2025-11-12T00:00:00</td>\n", + " <td>Donald Trump interviews and press appearances</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Congre...</td>\n", + " <td>49e6bfe8-dc16-4691-8258-45a190e788f1</td>\n", + " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", + " <td>1236460.043</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 21 columns</p>\n", + "</div>" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 9983c1d7-3e95-4355-92b0-50bfd5ecea42 True \n", + "1 b95dc1c3-4166-4a15-a998-a50c3aa749e3 True \n", + "2 1d81af4a-4563-4e66-a8ac-db80996d2853 True \n", + "3 294259ba-7364-471c-85b5-ef69a1b93257 True \n", + "4 408de8f8-a4fe-4a5e-8185-74b21f577ae9 True \n", + "\n", + " question_text date_close \\\n", + "0 Will Donald Trump appear as a guest on The Pat... 2026-01-01T00:00:00 \n", + "1 Will Donald Trump attend an NFL regular-season... 2026-01-06T00:00:00 \n", + "2 Will the 'National Center for Warrior Independ... 2025-12-15T00:00:00 \n", + "3 Will the United States federal government offi... 
2025-12-31T00:00:00 \n", + "4 Will Doug Collins be the confirmed and serving... 2025-12-01T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2025-11-11T00:00:00 The question resolves to 'Yes' if Donald Trump... \n", + "1 2025-11-11T00:00:00 The question resolves to 'Yes' if Donald Trump... \n", + "2 2025-11-12T00:00:00 The question resolves as 'Yes' if there is a v... \n", + "3 2025-11-12T00:00:00 A 'Yes' resolution requires an signed executiv... \n", + "4 2025-11-12T00:00:00 This question resolves as 'Yes' if Doug Collin... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2025-11-11T00:00:00 0 binary 0.95 ... \n", + "1 2025-11-11T00:00:00 0 binary 0.95 ... \n", + "2 2025-11-12T00:00:00 0 binary 1.00 ... \n", + "3 2025-11-12T00:00:00 0 binary 0.95 ... \n", + "4 2025-11-12T00:00:00 1 binary 1.00 ... \n", + "\n", + " reasoning \\\n", + "0 Donald Trump made his first appearance on The ... \n", + "1 Donald Trump attended one NFL regular-season g... \n", + "2 The 'National Center for Warrior Independence'... \n", + "3 The United States federal government did not o... \n", + "4 Doug Collins was confirmed by the United State... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... \n", + "1 https://vertexaisearch.cloud.google.com/ground... \n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Title: Pat McAfee's Interview With Trump On ES... \n", + "1 Title: Pat McAfee's Interview With Trump On ES... \n", + "2 On November 11, 2025, Veterans Day in the Unit... \n", + "3 On November 11, 2025, Veterans Day in the Unit... \n", + "4 On November 11, 2025, Veterans Day in the Unit... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://www.outkick.com/analysis/pat-mcafees-i... 
2025-11-11T00:00:00 \n", + "1 https://www.outkick.com/analysis/pat-mcafees-i... 2025-11-11T00:00:00 \n", + "2 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", + "3 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", + "4 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", + "\n", + " seed_search_query \\\n", + "0 Donald Trump interviews and press appearances \n", + "1 Donald Trump interviews and press appearances \n", + "2 Donald Trump interviews and press appearances \n", + "3 Donald Trump interviews and press appearances \n", + "4 Donald Trump interviews and press appearances \n", + "\n", + " context \\\n", + "0 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Donald... \n", + "1 [{'rendered_context': '', 'search_query': 'Don... \n", + "2 [{'rendered_context': '', 'search_query': 'Nat... \n", + "3 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Congre... \n", + "4 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Congre... 
\n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 c4eab320-fa8f-4ed0-afa4-c1d529213a0e 7865fd43-363a-467b-ab41-231e9dbe82d0 \n", + "1 23daab19-f704-498d-b2ba-51882ec525b4 7865fd43-363a-467b-ab41-231e9dbe82d0 \n", + "2 a533895c-cdae-4737-865f-a9cd51f43c92 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", + "3 2e7d1409-2e8e-4dd4-ad50-8daaba3c1814 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", + "4 49e6bfe8-dc16-4691-8258-45a190e788f1 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", + "\n", + " meta_processing_time_ms \n", + "0 1040341.956 \n", + "1 508128.710 \n", + "2 506992.193 \n", + "3 1238772.573 \n", + "4 1236460.043 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from lightningrod import filter_and_split\n", + "\n", + "train_dataset, test_dataset = filter_and_split(\n", + " dataset,\n", + " test_size=0.2,\n", + " split_strategy=\"temporal\",\n", + " days_to_resolution_range=(1, 60), # horizon within 2 months\n", + ")\n", + "\n", + "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n", + " data = ds.flattened()\n", + " print(len(data))\n", + " yes_count = sum(1 for s in data if s.get(\"label\") in (1, \"1\", 1.0))\n", + " print(f\"{name}: {len(data)} rows, {yes_count/len(data)*100:.1f}% yes\")\n", + " display(pd.DataFrame(data).head())" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1f06b9ae97284e6a805550a9da18a4e1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]" + "cell_type": "markdown", + "id": "49a3c7f8", + "metadata": {}, + "source": [ + "## Model Training\n", + "\n", + "Fine-tune a forecasting model on your dataset. For production training, generate more questions (increase `max_questions` or run without limit). 
Our reference experiments used 2,790 questions—see [Trump-Forecaster Model](https://huggingface.co/LightningRodLabs/Trump-Forecaster) and [Trump-Forecaster Dataset](https://huggingface.co/datasets/LightningRodLabs/WWTD-2025) for details." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5d211eb60a0f47fd86e98405674aefe8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing Files (0 / 0): | | 0.00B / 0.00B " + "cell_type": "markdown", + "id": "e29c5c4c", + "metadata": {}, + "source": [ + "## Estimate training cost\n", + "\n", + "Before starting a job, use `estimate_cost` to see the expected cost and token usage." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c46677e536fa4aca865f80855d2da88b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "New Data Upload: | | 0.00B / 0.00B " + "cell_type": "code", + "execution_count": 19, + "id": "24376274", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated cost: $0.32\n", + "Effective steps: 11\n", + "Train tokens: 1,073,959\n", + "Notes: Estimate uses per-answer-type output token estimates; actual may vary\n" + ] + } + ], + "source": [ + "from lightningrod import TrainingConfig\n", + "\n", + "config = TrainingConfig(\n", + " base_model=\"Qwen/Qwen3-4B-Instruct-2507\",\n", + " training_steps=50,\n", + ")\n", + "cost_estimate = lr.training.estimate_cost(config, dataset=train_dataset)\n", + "print(f\"Estimated cost: ${cost_estimate.total_cost_dollars:.2f}\")\n", + "print(f\"Effective steps: {cost_estimate.effective_steps}\")\n", + "print(f\"Train tokens: {cost_estimate.train_tokens:,}\")\n", + "print(f\"Notes: {cost_estimate.notes}\")" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "7a9be7fd4b7643f28b85bca3b6b2b3ff", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0.00B [00:00, ?B/s]" + "cell_type": "markdown", + "id": "cb7d8a3f", + "metadata": {}, + "source": [ + "## Start training\n", + "\n", + "`run` creates a job and polls until completion with a live progress display." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "CommitInfo(commit_url='https://huggingface.co/datasets/bart/wwtd-forecasting-demo/commit/cd7bfd6d7addc58cf2c3ac8f6677219a1ce91a91', commit_message='Upload dataset', commit_description='', oid='cd7bfd6d7addc58cf2c3ac8f6677219a1ce91a91', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bart/wwtd-forecasting-demo', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bart/wwtd-forecasting-demo'), pr_revision=None, pr_num=None)" + "cell_type": "code", + "execution_count": 20, + "id": "a8660faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Training COMPLETED</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; 
text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Job:</span> WWTD-2025 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Reward:</span> latest -0.8786 avg -0.6684 (11 steps) <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(higher is better)</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.18 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1;92m>> Training COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mJob:\u001b[0m WWTD-2025 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m 
\u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -0.8786 avg -0.6684 (11 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.18 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job 13fa02ec-27f4-47a9-84c9-762d91a1904a completed with status: COMPLETED\n", + "Trained model ID: checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a\n" + ] + } + ], + "source": [ + "job = lr.training.run(config, dataset=train_dataset, name=\"WWTD-2025\")\n", + "print(f\"Job {job.id} completed with status: {job.status}\")\n", + "print(f\"Trained model ID: {job.model_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "6487c1d9", + "metadata": {}, + "source": [ + "## Inference with your trained model\n", + "\n", + "Use `lr.predict()` to run inference with your trained model." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f013b514", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<answer>0.05</answer>\n" + ] + } + ], + "source": [ + "print(lr.predict(job.model_id, \"Will Trump impose 25% tariffs on all goods from Canada by February 1, 2027?\"))" + ] + }, + { + "cell_type": "markdown", + "id": "09e1c0d7", + "metadata": {}, + "source": [ + "## Run evals on trained model\n", + "\n", + "Run test evals on your trained model against the test dataset. The eval job runs the model on the dataset and reports metrics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "853e7904", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Eval COMPLETED</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">ID:</span> 3ca94dc1-24fe-46ff-b5a5-c4621d0e9b54 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Model:</span> checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Dataset:</span> 82186c26-a309-43a6-9543-37bdda38d41d <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> 
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Metric </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> base </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> trained </span>┃ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> brier_score </span>│ 0.2334 │ 0.1897 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ece </span>│ 0.1442 │ 0.0892 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_reward </span>│ -0.7850 │ -0.6088 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_valid_reward </span>│ -0.7850 │ -0.6088 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_samples </span>│ 113 │ 113 │ <span style=\"color: #0000ff; 
text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_valid </span>│ 113 │ 113 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> parse_rate </span>│ 1.0000 │ 1.0000 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_cost </span>│ 0.0068 │ 0.0068 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_input_tokens </span>│ 93344 │ 93344 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_output_tokens </span>│ 1111 │ 1101 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> └─────────────────────┴─────────┴─────────┘ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.01 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: 
#0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1;92m>> Eval COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mID:\u001b[0m 3ca94dc1-24fe-46ff-b5a5-c4621d0e9b54 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mModel:\u001b[0m checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mDataset:\u001b[0m 82186c26-a309-43a6-9543-37bdda38d41d \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mMetric \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m base\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mtrained\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mbrier_score \u001b[0m\u001b[2m \u001b[0m│ 0.2334 │ 0.1897 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mece \u001b[0m\u001b[2m \u001b[0m│ 0.1442 │ 0.0892 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7850 │ -0.6088 │ \u001b[94m│\u001b[0m\n", + 
"\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_valid_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7850 │ -0.6088 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_samples \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_valid \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mparse_rate \u001b[0m\u001b[2m \u001b[0m│ 1.0000 │ 1.0000 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_cost \u001b[0m\u001b[2m \u001b[0m│ 0.0068 │ 0.0068 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_input_tokens \u001b[0m\u001b[2m \u001b[0m│ 93344 │ 93344 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_output_tokens\u001b[0m\u001b[2m \u001b[0m│ 1111 │ 1101 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m └─────────────────────┴─────────┴─────────┘ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.01 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "96d20f89", + "metadata": {}, + "source": [ + "> Note: the trained model checkpoint will only be available for 7 days. If you wish to host this model long-term, reach out to us at support@lightningrod.ai." 
] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" } - ], - "source": [ - "%pip install datasets -q\n", - "\n", - "from datasets import Dataset, DatasetDict\n", - "from lightningrod.utils import config\n", - "\n", - "dataset = DatasetDict({\n", - " \"train\": Dataset.from_list(train_dataset.flattened()),\n", - " \"test\": Dataset.from_list(test_dataset.flattened()),\n", - "})\n", - "print(f\"Train: {len(dataset['train'])} rows, Test: {len(dataset['test'])} rows\")\n", - "print(\"Columns:\", dataset[\"train\"].column_names[:8], \"...\")\n", - "\n", - "DATASET_PATH = f\"{config.get_config_value('HF_USERNAME')}/wwtd-forecasting-demo\"\n", - "dataset.push_to_hub(DATASET_PATH, token=config.get_config_value(\"HF_ACCESS_TOKEN\"))" - ] - }, - { - "cell_type": "markdown", - "id": "49a3c7f8", - "metadata": {}, - "source": [ - "## Model Training\n", - "\n", - "We used the generated dataset above to fine-tune a forecasting model via RL on 2,790 questions, surpassing GPT-5 performance.\n", - "\n", - "**For more details on methods, results, and data:**\n", - "- **[Trump-Forecaster Model](https://huggingface.co/LightningRodLabs/Trump-Forecaster)**\n", - "- **[Trump-Forecaster Dataset](https://huggingface.co/datasets/LightningRodLabs/WWTD-2025)**\n", - "\n", - "![Brier Skill Score](https://huggingface.co/datasets/LightningRodLabs/WWTD-2025/resolve/main/brier_skill_score.png)\n", - "\n", - "**Coming Soon:** Seamlessly generate datasets, fine-tune, and evaluate your own forecasting models end-to-end on the Lightningrod platform.\n", - " \n", - "\ud83d\udc49 [Sign up to get early access and updates.](https://lightningrod.ai/)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (lightningrod-sdk)", - "language": "python", - "name": "lightningrod-sdk" + ], + "metadata": { + "kernelspec": { + "display_name": "Python (lightningrod-sdk)", + "language": "python", + "name": "lightningrod-sdk" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "nbformat": 4, + "nbformat_minor": 5 +} From c3577d80b2bc962ee9a7304591073d8cefb38cb2 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog <bartolomej.kozorog@gmail.com> Date: Thu, 19 Mar 2026 14:14:18 +0100 Subject: [PATCH 3/5] first pass at prepare_for_training linter --- .../fine_tuning/01_golf_forecasting.ipynb | 102 ++- .../fine_tuning/02_trump_forecasting.ipynb | 789 ++---------------- .../getting_started/05_fine_tuning.ipynb | 624 +++++++------- src/lightningrod/__init__.py | 7 +- src/lightningrod/_display.py | 89 +- src/lightningrod/training/__init__.py | 10 +- src/lightningrod/training/samples.py | 381 ++++++--- 7 files changed, 875 insertions(+), 1127 deletions(-) diff --git a/notebooks/fine_tuning/01_golf_forecasting.ipynb b/notebooks/fine_tuning/01_golf_forecasting.ipynb index a6d7def..51ee05c 100644 --- a/notebooks/fine_tuning/01_golf_forecasting.ipynb +++ b/notebooks/fine_tuning/01_golf_forecasting.ipynb @@ -22,7 +22,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -325,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "upload", "metadata": {}, "outputs": [ @@ -869,13 +869,12 @@ } ], "source": [ - "from lightningrod import filter_and_split\n", + "from lightningrod import prepare_for_training, FilterParams, SplitParams\n", "\n", - "train_dataset, test_dataset = filter_and_split(\n", + 
"train_dataset, test_dataset = prepare_for_training(\n", " dataset,\n", - " test_size=0.2,\n", - " split_strategy=\"temporal\",\n", - " days_to_resolution_range=(1, None), # at least 1 day to resolution\n", + " filter=FilterParams(days_to_resolution_range=(1, None)),\n", + " split=SplitParams(test_size=0.2),\n", ")\n", "\n", "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n", @@ -1014,15 +1013,24 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "id": "ba1dcfc5", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "<answer>0.35</answer>\n" + "ename": "InternalServerError", + "evalue": "Internal Server Error", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mInternalServerError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[22]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mlr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mjob\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmodel_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mWill Scottie Scheffler win the 2026 Masters?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/src/lightningrod/client.py:68\u001b[39m, in \u001b[36mLightningRod.predict\u001b[39m\u001b[34m(self, model_id, prompt, system_prompt, **kwargs)\u001b[39m\n\u001b[32m 66\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mRun `pip install openai` to use lr.predict().\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 67\u001b[39m client = 
OpenAI(api_key=\u001b[38;5;28mself\u001b[39m.api_key, base_url=\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.base_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/openai\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m68\u001b[39m response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompletions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcreate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 69\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmodel_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 70\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 71\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msystem\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43msystem_prompt\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 72\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 73\u001b[39m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 74\u001b[39m \u001b[43m 
\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 75\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response.choices[\u001b[32m0\u001b[39m].message.content\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/venv/lib/python3.11/site-packages/openai/_utils/_utils.py:286\u001b[39m, in \u001b[36mrequired_args.<locals>.inner.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 284\u001b[39m msg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMissing required argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquote(missing[\u001b[32m0\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 285\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(msg)\n\u001b[32m--> \u001b[39m\u001b[32m286\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/venv/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:1204\u001b[39m, in \u001b[36mCompletions.create\u001b[39m\u001b[34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, prompt_cache_key, prompt_cache_retention, reasoning_effort, response_format, safety_identifier, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, verbosity, web_search_options, extra_headers, extra_query, extra_body, timeout)\u001b[39m\n\u001b[32m 1157\u001b[39m 
\u001b[38;5;129m@required_args\u001b[39m([\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m], [\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 1158\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcreate\u001b[39m(\n\u001b[32m 1159\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 1201\u001b[39m timeout: \u001b[38;5;28mfloat\u001b[39m | httpx.Timeout | \u001b[38;5;28;01mNone\u001b[39;00m | NotGiven = not_given,\n\u001b[32m 1202\u001b[39m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[32m 1203\u001b[39m validate_response_format(response_format)\n\u001b[32m-> \u001b[39m\u001b[32m1204\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_post\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1205\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/chat/completions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1206\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaybe_transform\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1207\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 1208\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1209\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodel\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1210\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43maudio\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1211\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfrequency_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrequency_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1212\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunction_call\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunction_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1213\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunctions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunctions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1214\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogit_bias\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogit_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1215\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1216\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_completion_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_completion_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1217\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1218\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1219\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodalities\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodalities\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1220\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1221\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mparallel_tool_calls\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mparallel_tool_calls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1222\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprediction\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1223\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpresence_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpresence_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1224\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprompt_cache_key\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt_cache_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1225\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprompt_cache_retention\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt_cache_retention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1226\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreasoning_effort\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mreasoning_effort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1227\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mresponse_format\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1228\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msafety_identifier\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43msafety_identifier\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1229\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mseed\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1230\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mservice_tier\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mservice_tier\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1231\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstop\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1232\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstore\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1233\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1234\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1235\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemperature\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1236\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtool_choice\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtool_choice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1237\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtools\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtools\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1238\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_logprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_logprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1239\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_p\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1240\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1241\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mverbosity\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbosity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1242\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mweb_search_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mweb_search_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1243\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1244\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsStreaming\u001b[49m\n\u001b[32m 1245\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\n\u001b[32m 1246\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsNonStreaming\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1247\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1248\u001b[39m \u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmake_request_options\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1249\u001b[39m \u001b[43m \u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\n\u001b[32m 1250\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1251\u001b[39m \u001b[43m \u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m=\u001b[49m\u001b[43mChatCompletion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1252\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1253\u001b[39m 
\u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mStream\u001b[49m\u001b[43m[\u001b[49m\u001b[43mChatCompletionChunk\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1254\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/venv/lib/python3.11/site-packages/openai/_base_client.py:1297\u001b[39m, in \u001b[36mSyncAPIClient.post\u001b[39m\u001b[34m(self, path, cast_to, body, content, options, files, stream, stream_cls)\u001b[39m\n\u001b[32m 1288\u001b[39m warnings.warn(\n\u001b[32m 1289\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mPassing raw bytes as `body` is deprecated and will be removed in a future version. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1290\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mPlease pass raw bytes via the `content` parameter instead.\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1291\u001b[39m \u001b[38;5;167;01mDeprecationWarning\u001b[39;00m,\n\u001b[32m 1292\u001b[39m stacklevel=\u001b[32m2\u001b[39m,\n\u001b[32m 1293\u001b[39m )\n\u001b[32m 1294\u001b[39m opts = FinalRequestOptions.construct(\n\u001b[32m 1295\u001b[39m method=\u001b[33m\"\u001b[39m\u001b[33mpost\u001b[39m\u001b[33m\"\u001b[39m, url=path, json_data=body, content=content, files=to_httpx_files(files), **options\n\u001b[32m 1296\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1297\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m cast(ResponseT, \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m)\u001b[49m)\n", + "\u001b[36mFile 
\u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/venv/lib/python3.11/site-packages/openai/_base_client.py:1070\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, stream, stream_cls)\u001b[39m\n\u001b[32m 1067\u001b[39m err.response.read()\n\u001b[32m 1069\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRe-raising status error\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1070\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._make_status_error_from_response(err.response) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1072\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[32m 1074\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m response \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[33m\"\u001b[39m\u001b[33mcould not resolve response (should never happen)\u001b[39m\u001b[33m\"\u001b[39m\n", + "\u001b[31mInternalServerError\u001b[39m: Internal Server Error" ] } ], @@ -1042,10 +1050,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "0e81dc80", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Eval COMPLETED</span> <span style=\"color: 
#0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">ID:</span> cd970c00-d5b9-4db3-ac1f-f4815960abb0 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Model:</span> checkpoint:6c82c197-0627-4ee8-954c-d5ddb93e66f2 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Dataset:</span> 708f1623-6f06-4897-bb2e-dd58b7aebd45 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Metric </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> base </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> trained </span>┃ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span 
style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> brier_score </span>│ 0.2784 │ 0.2377 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ece </span>│ 0.2207 │ 0.1597 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_reward </span>│ -0.9210 │ -0.8026 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_valid_reward </span>│ -0.9210 │ -0.8026 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_samples </span>│ 143 │ 143 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_valid </span>│ 143 │ 143 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> parse_rate </span>│ 1.0000 │ 1.0000 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_cost </span>│ 0.0084 │ 0.0084 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: 
#0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_input_tokens </span>│ 115968 │ 115968 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_output_tokens </span>│ 1403 │ 1416 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> └─────────────────────┴─────────┴─────────┘ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.02 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1;92m>> Eval COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mID:\u001b[0m cd970c00-d5b9-4db3-ac1f-f4815960abb0 \u001b[94m│\u001b[0m\n", + 
"\u001b[94m│\u001b[0m \u001b[1mModel:\u001b[0m checkpoint:6c82c197-0627-4ee8-954c-d5ddb93e66f2 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mDataset:\u001b[0m 708f1623-6f06-4897-bb2e-dd58b7aebd45 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mMetric \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m base\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mtrained\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mbrier_score \u001b[0m\u001b[2m \u001b[0m│ 0.2784 │ 0.2377 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mece \u001b[0m\u001b[2m \u001b[0m│ 0.2207 │ 0.1597 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_reward \u001b[0m\u001b[2m \u001b[0m│ -0.9210 │ -0.8026 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_valid_reward \u001b[0m\u001b[2m \u001b[0m│ -0.9210 │ -0.8026 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_samples \u001b[0m\u001b[2m \u001b[0m│ 143 │ 143 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_valid \u001b[0m\u001b[2m \u001b[0m│ 143 │ 143 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mparse_rate \u001b[0m\u001b[2m \u001b[0m│ 1.0000 │ 1.0000 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_cost \u001b[0m\u001b[2m \u001b[0m│ 0.0084 │ 0.0084 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_input_tokens \u001b[0m\u001b[2m \u001b[0m│ 115968 │ 115968 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m 
│\u001b[2m \u001b[0m\u001b[2mtotal_output_tokens\u001b[0m\u001b[2m \u001b[0m│ 1403 │ 1416 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m └─────────────────────┴─────────┴─────────┘ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.02 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" ] diff --git a/notebooks/fine_tuning/02_trump_forecasting.ipynb b/notebooks/fine_tuning/02_trump_forecasting.ipynb index b2abe08..6447edc 100644 --- a/notebooks/fine_tuning/02_trump_forecasting.ipynb +++ b/notebooks/fine_tuning/02_trump_forecasting.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "id": "6f2c4443", "metadata": {}, "outputs": [ @@ -22,7 +22,7 @@ "True" ] }, - "execution_count": 13, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "id": "7ca71c31", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "id": "faccbe0a", "metadata": {}, "outputs": [], @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "4ce2d710", "metadata": {}, "outputs": [], @@ -166,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "4de4b87c", "metadata": {}, "outputs": [ @@ -177,54 +177,20 @@ "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", 
"<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Pipeline Completed</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"font-weight: bold\">Total cost:</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">$47.21</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"font-weight: bold\">Total cost:</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">$0.03</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Step </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Progress </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> In </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Out </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejected </span>┃<span style=\"color: #008080; text-decoration-color: 
#008080; font-weight: bold\"> Errors </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejection Reasons </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Duration </span>┃ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsSeedGenerator… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 20 │ 195 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 7s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> ForwardLookingQue… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 195 │ 912 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 57</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">date_close not </span> │ 12s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">after event_date </span> │ │ <span style=\"color: #00ff00; text-decoration-color: 
#00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(57) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> WebSearchLabelerT… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 912 │ 775 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 137</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Resolution date is</span> │ 1m 9s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">before seed </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">creation date </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(96), Undetermined</span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">label (40), Low </span> │ │ 
<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">confidence: 0.80 <</span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">0.9 (1) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsContextGenera… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 775 │ 756 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 19</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><failed_attempts> </span> │ 11m 32s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><generation </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; 
text-decoration-color: #7f7f7f\">number=\"1\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Connection </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">error. </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><completion> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> None </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> 
</span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></completion> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></generation> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><generation </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">number=\"2\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Connection </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: 
#00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">error. </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><completion> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> None </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></completion> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></generation> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span 
style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></failed_attempts></span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"><last_exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> Connection </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">error. 
</span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"></last_exception> </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(19) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Step </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Progress </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> In </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Out </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejected </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Errors </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejection Reasons </span>┃<span style=\"color: #008080; 
text-decoration-color: #008080; font-weight: bold\"> Duration </span>┃ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsSeedGenerator… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 1 │ 10 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 2s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> ForwardLookingQue… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 10 │ 50 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> WebSearchLabelerT… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 50 │ 45 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 5</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Resolution date 
is </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">before seed </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">creation date (4), </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Low confidence: </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">0.80 < 0.9 (1) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsContextGenera… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 45 │ 45 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> 
└────────────────────┴──────────────────────┴────┴─────┴──────────┴────────┴─────────────────────┴──────────┘ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", "</pre>\n" @@ -234,54 +200,20 @@ "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[1;92m>> Pipeline Completed\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m \u001b[1mTotal cost:\u001b[0m \u001b[92m$47.21\u001b[0m \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m \u001b[1mTotal cost:\u001b[0m \u001b[92m$0.03\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m In\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m 
\u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 20 │ 195 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 7s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 195 │ 912 │ \u001b[91m 57\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2mdate_close not \u001b[0m │ 12s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mafter event_date \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m(57) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 912 │ 775 │ \u001b[91m 137\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2mResolution date is\u001b[0m │ 1m 9s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mbefore seed \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mcreation date \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m(96), Undetermined\u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mlabel (40), Low \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mconfidence: 0.80 <\u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m0.9 (1) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 775 │ 756 │ \u001b[91m 19\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m<failed_attempts> \u001b[0m │ 11m 32s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ 
│ \u001b[2m \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<generation \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mnumber=\"1\"> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m Connection \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2merror. \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<completion> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m None \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</completion> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</generation> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<generation \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mnumber=\"2\"> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m Connection \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2merror. 
\u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<completion> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m None \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</completion> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</generation> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</failed_attempts>\u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m<last_exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m Connection \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2merror. 
\u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m</last_exception> \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m(19) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mIn\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 1 │ 10 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 2s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 10 │ 50 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete 
\u001b[0m │ 50 │ 45 │ \u001b[91m 5\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2mResolution date is \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mbefore seed \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mcreation date (4), \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mLow confidence: \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m0.80 < 0.9 (1) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 45 │ 45 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m └────────────────────┴──────────────────────┴────┴─────┴──────────┴────────┴─────────────────────┴──────────┘ \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" ] @@ -293,12 +225,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "969 samples (78.0% valid)\n" + "50 samples (90.0% valid)\n" ] } ], "source": [ - "dataset = lr.transforms.run(pipeline, max_questions=1000, name=\"WWTD-2025\")\n", + "dataset = lr.transforms.run(pipeline, max_questions=20, name=\"WWTD-2025\")\n", "\n", "samples = dataset.download()\n", "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n", @@ -317,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "5e4d3f2a", "metadata": {}, "outputs": [ @@ -325,577 +257,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "342\n", - "Train: 342 rows, 26.9% yes\n" + "[prepare_for_training] Starting with 50 samples\n", + 
"[filter] Dropped 5 invalid, 16 horizon → 29 remain\n", + "[dedup] 29 remain (0 duplicates)\n", + "[split] 23 train samples removed for leakage\n", + "[split] Temporal split: 0 train, 6 test\n" ] }, { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>sample_id</th>\n", - " <th>is_valid</th>\n", - " <th>question_text</th>\n", - " <th>date_close</th>\n", - " <th>event_date</th>\n", - " <th>resolution_criteria</th>\n", - " <th>prediction_date</th>\n", - " <th>label</th>\n", - " <th>answer_type</th>\n", - " <th>label_confidence</th>\n", - " <th>...</th>\n", - " <th>reasoning</th>\n", - " <th>answer_sources</th>\n", - " <th>seed_text</th>\n", - " <th>seed_url</th>\n", - " <th>seed_creation_date</th>\n", - " <th>seed_search_query</th>\n", - " <th>context</th>\n", - " <th>meta_sample_id</th>\n", - " <th>meta_parent_sample_id</th>\n", - " <th>meta_processing_time_ms</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>2835d25a-82fa-40e3-a706-d4b1cb202897</td>\n", - " <td>True</td>\n", - " <td>Will the 11th Circuit Court of Appeals issue a...</td>\n", - " <td>2025-02-15T00:00:00</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>This question resolves to 'Yes' if the U.S. Co...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>1</td>\n", - " <td>binary</td>\n", - " <td>1.00</td>\n", - " <td>...</td>\n", - " <td>On January 9, 2025, the U.S. 
Court of Appeals ...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", - " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>Donald Trump lawsuits and court rulings</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ...</td>\n", - " <td>8cae0711-1038-445c-9974-59615767f209</td>\n", - " <td>4346dfbf-438e-4860-9ce6-57f80f34844c</td>\n", - " <td>801080.135</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>8899746b-3c8f-4862-898e-1ad2cea7033e</td>\n", - " <td>True</td>\n", - " <td>Will Donald Trump grant a formal presidential ...</td>\n", - " <td>2025-02-28T00:00:00</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>This question resolves to 'Yes' if the White H...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>0.95</td>\n", - " <td>...</td>\n", - " <td>The close date for this question is 2025-02-28...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", - " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>Donald Trump lawsuits and court rulings</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Trump ...</td>\n", - " <td>a66dad42-22f8-4032-b9c6-b74d97342aa0</td>\n", - " <td>4346dfbf-438e-4860-9ce6-57f80f34844c</td>\n", - " <td>484199.213</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>990ae76f-9bcd-4e5d-9b45-70171de020c5</td>\n", - " <td>True</td>\n", - " <td>Will Justice Juan Merchan sentence Donald Trum...</td>\n", - " <td>2025-03-01T00:00:00</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>This question resolves to 'Yes' if Justice Jua...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " 
<td>1.00</td>\n", - " <td>...</td>\n", - " <td>The close date for this question is 2025-03-01...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", - " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>Donald Trump lawsuits and court rulings</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ...</td>\n", - " <td>6729b46e-fc9e-48fc-81b4-d72fd7bf0eff</td>\n", - " <td>4346dfbf-438e-4860-9ce6-57f80f34844c</td>\n", - " <td>844466.578</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>b1e6954e-ebfb-4ca3-898f-b5f37834e7c3</td>\n", - " <td>True</td>\n", - " <td>Will the criminal charges against Carlos De Ol...</td>\n", - " <td>2025-03-05T00:00:00</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>This question resolves to 'Yes' if a federal c...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>1</td>\n", - " <td>binary</td>\n", - " <td>1.00</td>\n", - " <td>...</td>\n", - " <td>The criminal charges against Carlos De Oliveir...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", - " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", - " <td>2025-01-08T00:00:00</td>\n", - " <td>Donald Trump lawsuits and court rulings</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ...</td>\n", - " <td>0969a617-210f-4e8f-a337-12bec1836ab9</td>\n", - " <td>4346dfbf-438e-4860-9ce6-57f80f34844c</td>\n", - " <td>156528.692</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>05cdc339-b9aa-4d88-8063-d841988ca680</td>\n", - " <td>True</td>\n", - " <td>Will Donald Trump announce a freeze on all new...</td>\n", - " <td>2025-03-01T00:00:00</td>\n", - " <td>2025-01-10T00:00:00</td>\n", - " <td>The question resolves to 'Yes' if Trump or the...</td>\n", - " 
<td>2025-01-10T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>0.95</td>\n", - " <td>...</td>\n", - " <td>Donald Trump was inaugurated on January 20, 20...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td><html lang=\"en-US\"><head><title>Just a moment....</td>\n", - " <td>https://www.politico.com/news/2025/01/10/spend...</td>\n", - " <td>2025-01-10T00:00:00</td>\n", - " <td>Donald Trump domestic policy agenda</td>\n", - " <td>[{'rendered_context': '', 'search_query': 'Tru...</td>\n", - " <td>b450791b-2a54-47b9-9258-eebe12232a83</td>\n", - " <td>8fab8604-34d6-46a1-9596-5de6247aa96e</td>\n", - " <td>517310.074</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>5 rows × 21 columns</p>\n", - "</div>" - ], - "text/plain": [ - " sample_id is_valid \\\n", - "0 2835d25a-82fa-40e3-a706-d4b1cb202897 True \n", - "1 8899746b-3c8f-4862-898e-1ad2cea7033e True \n", - "2 990ae76f-9bcd-4e5d-9b45-70171de020c5 True \n", - "3 b1e6954e-ebfb-4ca3-898f-b5f37834e7c3 True \n", - "4 05cdc339-b9aa-4d88-8063-d841988ca680 True \n", - "\n", - " question_text date_close \\\n", - "0 Will the 11th Circuit Court of Appeals issue a... 2025-02-15T00:00:00 \n", - "1 Will Donald Trump grant a formal presidential ... 2025-02-28T00:00:00 \n", - "2 Will Justice Juan Merchan sentence Donald Trum... 2025-03-01T00:00:00 \n", - "3 Will the criminal charges against Carlos De Ol... 2025-03-05T00:00:00 \n", - "4 Will Donald Trump announce a freeze on all new... 2025-03-01T00:00:00 \n", - "\n", - " event_date resolution_criteria \\\n", - "0 2025-01-08T00:00:00 This question resolves to 'Yes' if the U.S. Co... \n", - "1 2025-01-08T00:00:00 This question resolves to 'Yes' if the White H... \n", - "2 2025-01-08T00:00:00 This question resolves to 'Yes' if Justice Jua... \n", - "3 2025-01-08T00:00:00 This question resolves to 'Yes' if a federal c... \n", - "4 2025-01-10T00:00:00 The question resolves to 'Yes' if Trump or the... 
\n", - "\n", - " prediction_date label answer_type label_confidence ... \\\n", - "0 2025-01-08T00:00:00 1 binary 1.00 ... \n", - "1 2025-01-08T00:00:00 0 binary 0.95 ... \n", - "2 2025-01-08T00:00:00 0 binary 1.00 ... \n", - "3 2025-01-08T00:00:00 1 binary 1.00 ... \n", - "4 2025-01-10T00:00:00 0 binary 0.95 ... \n", - "\n", - " reasoning \\\n", - "0 On January 9, 2025, the U.S. Court of Appeals ... \n", - "1 The close date for this question is 2025-02-28... \n", - "2 The close date for this question is 2025-03-01... \n", - "3 The criminal charges against Carlos De Oliveir... \n", - "4 Donald Trump was inaugurated on January 20, 20... \n", - "\n", - " answer_sources \\\n", - "0 https://vertexaisearch.cloud.google.com/ground... \n", - "1 https://vertexaisearch.cloud.google.com/ground... \n", - "2 https://vertexaisearch.cloud.google.com/ground... \n", - "3 https://vertexaisearch.cloud.google.com/ground... \n", - "4 https://vertexaisearch.cloud.google.com/ground... \n", - "\n", - " seed_text \\\n", - "0 Title: The Situation: Ending the Trump Cases t... \n", - "1 Title: The Situation: Ending the Trump Cases t... \n", - "2 Title: The Situation: Ending the Trump Cases t... \n", - "3 Title: The Situation: Ending the Trump Cases t... \n", - "4 <html lang=\"en-US\"><head><title>Just a moment.... \n", - "\n", - " seed_url seed_creation_date \\\n", - "0 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", - "1 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", - "2 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", - "3 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", - "4 https://www.politico.com/news/2025/01/10/spend... 
2025-01-10T00:00:00 \n", - "\n", - " seed_search_query \\\n", - "0 Donald Trump lawsuits and court rulings \n", - "1 Donald Trump lawsuits and court rulings \n", - "2 Donald Trump lawsuits and court rulings \n", - "3 Donald Trump lawsuits and court rulings \n", - "4 Donald Trump domestic policy agenda \n", - "\n", - " context \\\n", - "0 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ... \n", - "1 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Trump ... \n", - "2 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ... \n", - "3 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Judge ... \n", - "4 [{'rendered_context': '', 'search_query': 'Tru... \n", - "\n", - " meta_sample_id meta_parent_sample_id \\\n", - "0 8cae0711-1038-445c-9974-59615767f209 4346dfbf-438e-4860-9ce6-57f80f34844c \n", - "1 a66dad42-22f8-4032-b9c6-b74d97342aa0 4346dfbf-438e-4860-9ce6-57f80f34844c \n", - "2 6729b46e-fc9e-48fc-81b4-d72fd7bf0eff 4346dfbf-438e-4860-9ce6-57f80f34844c \n", - "3 0969a617-210f-4e8f-a337-12bec1836ab9 4346dfbf-438e-4860-9ce6-57f80f34844c \n", - "4 b450791b-2a54-47b9-9258-eebe12232a83 8fab8604-34d6-46a1-9596-5de6247aa96e \n", - "\n", - " meta_processing_time_ms \n", - "0 801080.135 \n", - "1 484199.213 \n", - "2 844466.578 \n", - "3 156528.692 \n", - "4 517310.074 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "113\n", - "Test: 113 rows, 30.1% yes\n" + "ename": "ValueError", + "evalue": "[filter_and_split] Unhealthy split detected.\n\n23/23 train samples (100%) were removed for temporal leakage — the date_close or resolution_date of train questions extends into the test period.\n\nTips:\n - Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least 60 days before your last question date, giving train questions room to resolve before the test window.\n - Tighten days_to_resolution_range — the 
current max of 60 days means train resolution dates extend far into the test window. Reducing it shrinks the bleed-over zone.\n - Generate more samples across a wider date range. With questions spread over a longer period, the temporal split cutoff moves far enough back that earlier questions resolve well before the test window.\n - Set filter_leaky_train=False to disable leakage removal. Only do this if you are confident the resolution dates do not reveal information that was unavailable at prediction time.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlightningrod\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m filter_and_split\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m train_dataset, test_dataset = \u001b[43mfilter_and_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemporal\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m60\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# horizon within 2 months\u001b[39;49;00m\n\u001b[32m 
8\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, ds \u001b[38;5;129;01min\u001b[39;00m [(\u001b[33m\"\u001b[39m\u001b[33mTrain\u001b[39m\u001b[33m\"\u001b[39m, train_dataset), (\u001b[33m\"\u001b[39m\u001b[33mTest\u001b[39m\u001b[33m\"\u001b[39m, test_dataset)]:\n\u001b[32m 11\u001b[39m data = ds.flattened()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/src/lightningrod/training/samples.py:763\u001b[39m, in \u001b[36mfilter_and_split\u001b[39m\u001b[34m(dataset, test_size, split_strategy, test_start, drop_missing_context, days_to_resolution_range, random_state, filter_leaky_train, deduplicate_key_fn, verbose)\u001b[39m\n\u001b[32m 760\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m verbose:\n\u001b[32m 761\u001b[39m _print_stats(stats)\n\u001b[32m--> \u001b[39m\u001b[32m763\u001b[39m \u001b[43m_raise_if_unhealthy_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 764\u001b[39m \u001b[43m \u001b[49m\u001b[43mstats\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstats\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 765\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 766\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 767\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 768\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_start\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_start\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 769\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilter_leaky_train\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilter_leaky_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 770\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 771\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m dataset.subset(train_ids), dataset.subset(test_ids)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/src/lightningrod/training/samples.py:701\u001b[39m, in \u001b[36m_raise_if_unhealthy_split\u001b[39m\u001b[34m(stats, test_ids, split_strategy, test_size, test_start, filter_leaky_train, days_to_resolution_range)\u001b[39m\n\u001b[32m 695\u001b[39m test_tips.append(\n\u001b[32m 696\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIncrease the dataset size — with test_size=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_size\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m and only \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstats.dedup_kept\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m samples \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 697\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mafter filtering, the test set may round to zero.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 698\u001b[39m )\n\u001b[32m 699\u001b[39m msgs.append(\u001b[33m\"\u001b[39m\u001b[33mTips:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m + \u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m.join(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mt\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m test_tips))\n\u001b[32m--> \u001b[39m\u001b[32m701\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33m[filter_and_split] Unhealthy split detected.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m + 
\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m.join(msgs))\n", + "\u001b[31mValueError\u001b[39m: [filter_and_split] Unhealthy split detected.\n\n23/23 train samples (100%) were removed for temporal leakage — the date_close or resolution_date of train questions extends into the test period.\n\nTips:\n - Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least 60 days before your last question date, giving train questions room to resolve before the test window.\n - Tighten days_to_resolution_range — the current max of 60 days means train resolution dates extend far into the test window. Reducing it shrinks the bleed-over zone.\n - Generate more samples across a wider date range. With questions spread over a longer period, the temporal split cutoff moves far enough back that earlier questions resolve well before the test window.\n - Set filter_leaky_train=False to disable leakage removal. Only do this if you are confident the resolution dates do not reveal information that was unavailable at prediction time." 
] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>sample_id</th>\n", - " <th>is_valid</th>\n", - " <th>question_text</th>\n", - " <th>date_close</th>\n", - " <th>event_date</th>\n", - " <th>resolution_criteria</th>\n", - " <th>prediction_date</th>\n", - " <th>label</th>\n", - " <th>answer_type</th>\n", - " <th>label_confidence</th>\n", - " <th>...</th>\n", - " <th>reasoning</th>\n", - " <th>answer_sources</th>\n", - " <th>seed_text</th>\n", - " <th>seed_url</th>\n", - " <th>seed_creation_date</th>\n", - " <th>seed_search_query</th>\n", - " <th>context</th>\n", - " <th>meta_sample_id</th>\n", - " <th>meta_parent_sample_id</th>\n", - " <th>meta_processing_time_ms</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>9983c1d7-3e95-4355-92b0-50bfd5ecea42</td>\n", - " <td>True</td>\n", - " <td>Will Donald Trump appear as a guest on The Pat...</td>\n", - " <td>2026-01-01T00:00:00</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>The question resolves to 'Yes' if Donald Trump...</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>0.95</td>\n", - " <td>...</td>\n", - " <td>Donald Trump made his first appearance on The ...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: Pat McAfee's Interview With Trump On ES...</td>\n", - " <td>https://www.outkick.com/analysis/pat-mcafees-i...</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>Donald Trump interviews and press appearances</td>\n", - " <td>[{'rendered_context': '---\n", - 
"ARTICLES\n", - "[1] Donald...</td>\n", - " <td>c4eab320-fa8f-4ed0-afa4-c1d529213a0e</td>\n", - " <td>7865fd43-363a-467b-ab41-231e9dbe82d0</td>\n", - " <td>1040341.956</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>b95dc1c3-4166-4a15-a998-a50c3aa749e3</td>\n", - " <td>True</td>\n", - " <td>Will Donald Trump attend an NFL regular-season...</td>\n", - " <td>2026-01-06T00:00:00</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>The question resolves to 'Yes' if Donald Trump...</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>0.95</td>\n", - " <td>...</td>\n", - " <td>Donald Trump attended one NFL regular-season g...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>Title: Pat McAfee's Interview With Trump On ES...</td>\n", - " <td>https://www.outkick.com/analysis/pat-mcafees-i...</td>\n", - " <td>2025-11-11T00:00:00</td>\n", - " <td>Donald Trump interviews and press appearances</td>\n", - " <td>[{'rendered_context': '', 'search_query': 'Don...</td>\n", - " <td>23daab19-f704-498d-b2ba-51882ec525b4</td>\n", - " <td>7865fd43-363a-467b-ab41-231e9dbe82d0</td>\n", - " <td>508128.710</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>1d81af4a-4563-4e66-a8ac-db80996d2853</td>\n", - " <td>True</td>\n", - " <td>Will the 'National Center for Warrior Independ...</td>\n", - " <td>2025-12-15T00:00:00</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>The question resolves as 'Yes' if there is a v...</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>1.00</td>\n", - " <td>...</td>\n", - " <td>The 'National Center for Warrior Independence'...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", - " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>Donald Trump interviews and 
press appearances</td>\n", - " <td>[{'rendered_context': '', 'search_query': 'Nat...</td>\n", - " <td>a533895c-cdae-4737-865f-a9cd51f43c92</td>\n", - " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", - " <td>506992.193</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>294259ba-7364-471c-85b5-ef69a1b93257</td>\n", - " <td>True</td>\n", - " <td>Will the United States federal government offi...</td>\n", - " <td>2025-12-31T00:00:00</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>A 'Yes' resolution requires an signed executiv...</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>0</td>\n", - " <td>binary</td>\n", - " <td>0.95</td>\n", - " <td>...</td>\n", - " <td>The United States federal government did not o...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", - " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>Donald Trump interviews and press appearances</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Congre...</td>\n", - " <td>2e7d1409-2e8e-4dd4-ad50-8daaba3c1814</td>\n", - " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", - " <td>1238772.573</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>408de8f8-a4fe-4a5e-8185-74b21f577ae9</td>\n", - " <td>True</td>\n", - " <td>Will Doug Collins be the confirmed and serving...</td>\n", - " <td>2025-12-01T00:00:00</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>This question resolves as 'Yes' if Doug Collin...</td>\n", - " <td>2025-11-12T00:00:00</td>\n", - " <td>1</td>\n", - " <td>binary</td>\n", - " <td>1.00</td>\n", - " <td>...</td>\n", - " <td>Doug Collins was confirmed by the United State...</td>\n", - " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", - " <td>On November 11, 2025, Veterans Day in the Unit...</td>\n", - " <td>https://evrimagaci.org/gpt/trump-sparks-vetera...</td>\n", - " 
<td>2025-11-12T00:00:00</td>\n", - " <td>Donald Trump interviews and press appearances</td>\n", - " <td>[{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Congre...</td>\n", - " <td>49e6bfe8-dc16-4691-8258-45a190e788f1</td>\n", - " <td>6508a630-bdab-4880-b24d-baf8a3e85cb6</td>\n", - " <td>1236460.043</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>5 rows × 21 columns</p>\n", - "</div>" - ], - "text/plain": [ - " sample_id is_valid \\\n", - "0 9983c1d7-3e95-4355-92b0-50bfd5ecea42 True \n", - "1 b95dc1c3-4166-4a15-a998-a50c3aa749e3 True \n", - "2 1d81af4a-4563-4e66-a8ac-db80996d2853 True \n", - "3 294259ba-7364-471c-85b5-ef69a1b93257 True \n", - "4 408de8f8-a4fe-4a5e-8185-74b21f577ae9 True \n", - "\n", - " question_text date_close \\\n", - "0 Will Donald Trump appear as a guest on The Pat... 2026-01-01T00:00:00 \n", - "1 Will Donald Trump attend an NFL regular-season... 2026-01-06T00:00:00 \n", - "2 Will the 'National Center for Warrior Independ... 2025-12-15T00:00:00 \n", - "3 Will the United States federal government offi... 2025-12-31T00:00:00 \n", - "4 Will Doug Collins be the confirmed and serving... 2025-12-01T00:00:00 \n", - "\n", - " event_date resolution_criteria \\\n", - "0 2025-11-11T00:00:00 The question resolves to 'Yes' if Donald Trump... \n", - "1 2025-11-11T00:00:00 The question resolves to 'Yes' if Donald Trump... \n", - "2 2025-11-12T00:00:00 The question resolves as 'Yes' if there is a v... \n", - "3 2025-11-12T00:00:00 A 'Yes' resolution requires an signed executiv... \n", - "4 2025-11-12T00:00:00 This question resolves as 'Yes' if Doug Collin... \n", - "\n", - " prediction_date label answer_type label_confidence ... \\\n", - "0 2025-11-11T00:00:00 0 binary 0.95 ... \n", - "1 2025-11-11T00:00:00 0 binary 0.95 ... \n", - "2 2025-11-12T00:00:00 0 binary 1.00 ... \n", - "3 2025-11-12T00:00:00 0 binary 0.95 ... \n", - "4 2025-11-12T00:00:00 1 binary 1.00 ... 
\n", - "\n", - " reasoning \\\n", - "0 Donald Trump made his first appearance on The ... \n", - "1 Donald Trump attended one NFL regular-season g... \n", - "2 The 'National Center for Warrior Independence'... \n", - "3 The United States federal government did not o... \n", - "4 Doug Collins was confirmed by the United State... \n", - "\n", - " answer_sources \\\n", - "0 https://vertexaisearch.cloud.google.com/ground... \n", - "1 https://vertexaisearch.cloud.google.com/ground... \n", - "2 https://vertexaisearch.cloud.google.com/ground... \n", - "3 https://vertexaisearch.cloud.google.com/ground... \n", - "4 https://vertexaisearch.cloud.google.com/ground... \n", - "\n", - " seed_text \\\n", - "0 Title: Pat McAfee's Interview With Trump On ES... \n", - "1 Title: Pat McAfee's Interview With Trump On ES... \n", - "2 On November 11, 2025, Veterans Day in the Unit... \n", - "3 On November 11, 2025, Veterans Day in the Unit... \n", - "4 On November 11, 2025, Veterans Day in the Unit... \n", - "\n", - " seed_url seed_creation_date \\\n", - "0 https://www.outkick.com/analysis/pat-mcafees-i... 2025-11-11T00:00:00 \n", - "1 https://www.outkick.com/analysis/pat-mcafees-i... 2025-11-11T00:00:00 \n", - "2 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", - "3 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", - "4 https://evrimagaci.org/gpt/trump-sparks-vetera... 2025-11-12T00:00:00 \n", - "\n", - " seed_search_query \\\n", - "0 Donald Trump interviews and press appearances \n", - "1 Donald Trump interviews and press appearances \n", - "2 Donald Trump interviews and press appearances \n", - "3 Donald Trump interviews and press appearances \n", - "4 Donald Trump interviews and press appearances \n", - "\n", - " context \\\n", - "0 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Donald... \n", - "1 [{'rendered_context': '', 'search_query': 'Don... \n", - "2 [{'rendered_context': '', 'search_query': 'Nat... 
\n", - "3 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Congre... \n", - "4 [{'rendered_context': '---\n", - "ARTICLES\n", - "[1] Congre... \n", - "\n", - " meta_sample_id meta_parent_sample_id \\\n", - "0 c4eab320-fa8f-4ed0-afa4-c1d529213a0e 7865fd43-363a-467b-ab41-231e9dbe82d0 \n", - "1 23daab19-f704-498d-b2ba-51882ec525b4 7865fd43-363a-467b-ab41-231e9dbe82d0 \n", - "2 a533895c-cdae-4737-865f-a9cd51f43c92 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", - "3 2e7d1409-2e8e-4dd4-ad50-8daaba3c1814 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", - "4 49e6bfe8-dc16-4691-8258-45a190e788f1 6508a630-bdab-4880-b24d-baf8a3e85cb6 \n", - "\n", - " meta_processing_time_ms \n", - "0 1040341.956 \n", - "1 508128.710 \n", - "2 506992.193 \n", - "3 1238772.573 \n", - "4 1236460.043 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "from lightningrod import filter_and_split\n", + "from lightningrod import prepare_for_training, FilterParams, SplitParams\n", "\n", - "train_dataset, test_dataset = filter_and_split(\n", + "train_dataset, test_dataset = prepare_for_training(\n", " dataset,\n", - " test_size=0.2,\n", - " split_strategy=\"temporal\",\n", - " days_to_resolution_range=(1, 60), # horizon within 2 months\n", + " filter=FilterParams(days_to_resolution_range=(1, 60)),\n", + " split=SplitParams(test_size=0.2),\n", ")\n", "\n", "for name, ds in [(\"Train\", train_dataset), (\"Test\", test_dataset)]:\n", @@ -928,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "24376274", "metadata": {}, "outputs": [ @@ -969,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "a8660faf", "metadata": {}, "outputs": [ @@ -1035,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "f013b514", "metadata": {}, "outputs": [ @@ -1063,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 22, + 
"execution_count": null, "id": "853e7904", "metadata": {}, "outputs": [ @@ -1074,24 +463,24 @@ "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Eval COMPLETED</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">ID:</span> 3ca94dc1-24fe-46ff-b5a5-c4621d0e9b54 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">ID:</span> 00602447-5872-4732-93f9-b0d99459da1a <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Model:</span> checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Dataset:</span> 82186c26-a309-43a6-9543-37bdda38d41d <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: 
#0000ff\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Metric </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> base </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> trained </span>┃ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> brier_score </span>│ 0.2334 │ 0.1897 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ece </span>│ 0.1442 │ 0.0892 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_reward </span>│ -0.7850 │ -0.6088 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_valid_reward </span>│ -0.7850 │ -0.6088 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_samples </span>│ 113 │ 113 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 
n_valid </span>│ 113 │ 113 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> parse_rate </span>│ 1.0000 │ 1.0000 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_cost </span>│ 0.0068 │ 0.0068 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_input_tokens </span>│ 93344 │ 93344 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_output_tokens </span>│ 1111 │ 1101 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> └─────────────────────┴─────────┴─────────┘ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Metric </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> base </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> trained </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> benchmark 
</span>┃ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> brier_score </span>│ 0.2333 │ 0.1877 │ 0.1555 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ece </span>│ 0.1451 │ 0.0963 │ 0.0590 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_reward </span>│ -0.7840 │ -0.6048 │ -0.4966 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_valid_reward </span>│ -0.7840 │ -0.6048 │ -0.4966 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_samples </span>│ 113 │ 113 │ 113 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_valid </span>│ 113 │ 113 │ 113 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 
parse_rate </span>│ 1.0000 │ 1.0000 │ 1.0000 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_cost </span>│ 0.0068 │ 0.0068 │ — │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_input_tokens </span>│ 93344 │ 93344 │ 88060 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_output_tokens </span>│ 1111 │ 1101 │ 28947 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> └─────────────────────┴─────────┴─────────┴───────────┘ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.01 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", @@ -1104,24 +493,24 @@ "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[1;92m>> Eval COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mID:\u001b[0m 3ca94dc1-24fe-46ff-b5a5-c4621d0e9b54 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mID:\u001b[0m 
00602447-5872-4732-93f9-b0d99459da1a \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[1mModel:\u001b[0m checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[1mDataset:\u001b[0m 82186c26-a309-43a6-9543-37bdda38d41d \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mMetric \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m base\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mtrained\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mbrier_score \u001b[0m\u001b[2m \u001b[0m│ 0.2334 │ 0.1897 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mece \u001b[0m\u001b[2m \u001b[0m│ 0.1442 │ 0.0892 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7850 │ -0.6088 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_valid_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7850 │ -0.6088 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_samples \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_valid \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mparse_rate \u001b[0m\u001b[2m \u001b[0m│ 1.0000 │ 1.0000 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_cost \u001b[0m\u001b[2m \u001b[0m│ 0.0068 │ 0.0068 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_input_tokens \u001b[0m\u001b[2m \u001b[0m│ 93344 │ 93344 │ 
\u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_output_tokens\u001b[0m\u001b[2m \u001b[0m│ 1111 │ 1101 │ \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m └─────────────────────┴─────────┴─────────┘ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mMetric \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m base\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mtrained\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mbenchmark\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mbrier_score \u001b[0m\u001b[2m \u001b[0m│ 0.2333 │ 0.1877 │ 0.1555 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mece \u001b[0m\u001b[2m \u001b[0m│ 0.1451 │ 0.0963 │ 0.0590 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7840 │ -0.6048 │ -0.4966 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_valid_reward \u001b[0m\u001b[2m \u001b[0m│ -0.7840 │ -0.6048 │ -0.4966 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_samples \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ 113 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_valid \u001b[0m\u001b[2m \u001b[0m│ 113 │ 113 │ 113 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mparse_rate \u001b[0m\u001b[2m \u001b[0m│ 1.0000 │ 1.0000 │ 1.0000 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_cost \u001b[0m\u001b[2m \u001b[0m│ 0.0068 │ 0.0068 │ — │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m 
\u001b[0m\u001b[2mtotal_input_tokens \u001b[0m\u001b[2m \u001b[0m│ 93344 │ 93344 │ 88060 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_output_tokens\u001b[0m\u001b[2m \u001b[0m│ 1111 │ 1101 │ 28947 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m └─────────────────────┴─────────┴─────────┴───────────┘ \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.01 \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", @@ -1134,7 +523,7 @@ } ], "source": [ - "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" + "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset, benchmark_model_id=\"openai/gpt-5.2\")" ] }, { diff --git a/notebooks/getting_started/05_fine_tuning.ipynb b/notebooks/getting_started/05_fine_tuning.ipynb index 2f972da..21edf1c 100644 --- a/notebooks/getting_started/05_fine_tuning.ipynb +++ b/notebooks/getting_started/05_fine_tuning.ipynb @@ -1,290 +1,356 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "4dde071c", - "metadata": {}, - "source": [ - "# Training API\n", - "\n", - "Fine-tune forecasting models on your Lightning Rod datasets. This notebook walks through the full training workflow: generating a dataset, estimating cost, creating a training job, and monitoring progress.\n", - "\n", - "The training API supports LoRA fine-tuning with configurable base models, training steps, batch size, and rank." 
- ] - }, - { - "cell_type": "markdown", - "id": "fae3f735", - "metadata": {}, - "source": [ - "## Install the SDK" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1a6a1e2", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install lightningrod-ai python-dotenv openai\n", - "\n", - "from IPython.display import clear_output\n", - "clear_output()" - ] - }, - { - "cell_type": "markdown", - "id": "7490b222", - "metadata": {}, - "source": [ - "## Set up the client\n", - "\n", - "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**.\n", - "\n", - "- **Google Colab**: Go to the Secrets section (key icon in left sidebar) and add a secret named `LIGHTNINGROD_API_KEY`\n", - "- **Local Jupyter**: Set the `LIGHTNINGROD_API_KEY` environment variable, or you'll be prompted to enter it" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7a023c6c", - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "from lightningrod import LightningRod\n", - "from lightningrod.utils import config\n", - "\n", - "load_dotenv()\n", - "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n", - "\n", - "lr = LightningRod(api_key=api_key)" - ] - }, - { - "cell_type": "markdown", - "id": "9c49c320", - "metadata": {}, - "source": [ - "## Prepare the dataset\n", - "\n", - "Training requires a dataset ID from a pipeline run. Run one of the other notebooks first to generate a dataset - each one prints the **Dataset ID** after `transforms.run()` — copy it into the cell below." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da381a43", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_id = config.get_config_value(\"LIGHTNINGROD_DATASET_ID\")\n", - "\n", - "dataset = lr.datasets.get(dataset_id)\n", - "_ = dataset.download()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9418ddf", - "metadata": {}, - "outputs": [], - "source": [ - "from lightningrod import filter_and_split\n", - "\n", - "train_dataset, test_dataset = filter_and_split(\n", - " dataset,\n", - " test_size=0.2,\n", - " days_to_resolution_range=(90, None),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "01e98826", - "metadata": {}, - "source": [ - "## Estimate training cost\n", - "\n", - "Before starting a job, use `estimate_cost` to see the expected cost and token usage." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "4283478f", - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Estimated cost: $0.02\n", - "Effective steps: 3\n", - "Train tokens: 63,349\n", - "Notes: Estimate uses per-answer-type output token estimates; actual may vary\n" - ] - } - ], - "source": [ - "from lightningrod import TrainingConfig\n", - "\n", - "config = TrainingConfig(\n", - " base_model=\"Qwen/Qwen3-4B-Instruct-2507\",\n", - " training_steps=50,\n", - ")\n", - "cost_estimate = lr.training.estimate_cost(config, dataset=train_dataset)\n", - "print(f\"Estimated cost: ${cost_estimate.total_cost_dollars:.2f}\")\n", - "print(f\"Effective steps: {cost_estimate.effective_steps}\")\n", - "print(f\"Train tokens: {cost_estimate.train_tokens:,}\")\n", - "print(f\"Notes: {cost_estimate.notes}\")" - ] - }, - { - "cell_type": "markdown", - "id": "ef160c9c", - "metadata": {}, - "source": [ - "## Start training\n", - "\n", - "`run` creates a job and polls until completion with a live progress display.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, 
- "id": "b7800672", - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "id": "4dde071c", + "metadata": {}, + "source": [ + "# Training API\n", + "\n", + "Fine-tune forecasting models on your Lightning Rod datasets. This notebook walks through the full training workflow: generating a dataset, estimating cost, creating a training job, and monitoring progress.\n", + "\n", + "The training API supports LoRA fine-tuning with configurable base models, training steps, batch size, and rank." + ] + }, + { + "cell_type": "markdown", + "id": "fae3f735", + "metadata": {}, + "source": [ + "## Install the SDK" + ] + }, { - "data": { - "text/html": [ - "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Training COMPLETED</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Job:</span> Forecasting fine-tune <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span 
style=\"font-weight: bold\">Reward:</span> latest -0.4030 avg -0.8912 (3 steps) <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(higher is better)</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.01 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", - "</pre>\n" + "cell_type": "code", + "execution_count": 1, + "id": "c1a6a1e2", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install lightningrod-ai python-dotenv openai\n", + "\n", + "from IPython.display import clear_output\n", + "clear_output()" + ] + }, + { + "cell_type": "markdown", + "id": "7490b222", + "metadata": {}, + "source": [ + "## Set up the client\n", + "\n", + "Sign up at [dashboard.lightningrod.ai](https://dashboard.lightningrod.ai/?redirect=/api) to get your API key and **$50 of free credits**.\n", + "\n", + "- **Google Colab**: Go to the Secrets section (key icon in left sidebar) and add a secret named `LIGHTNINGROD_API_KEY`\n", + "- **Local Jupyter**: Set the `LIGHTNINGROD_API_KEY` environment variable, or you'll be prompted to enter it" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7a023c6c", + "metadata": {}, + "outputs": [], + 
"source": [ + "from dotenv import load_dotenv\n", + "from lightningrod import LightningRod\n", + "from lightningrod.utils import config\n", + "\n", + "load_dotenv()\n", + "api_key = config.get_config_value(\"LIGHTNINGROD_API_KEY\")\n", + "\n", + "lr = LightningRod(api_key=api_key)" + ] + }, + { + "cell_type": "markdown", + "id": "9c49c320", + "metadata": {}, + "source": [ + "## Prepare the dataset\n", + "\n", + "Training requires a dataset ID from a pipeline run. Run one of the other notebooks first to generate a dataset - each one prints the **Dataset ID** after `transforms.run()` — copy it into the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "da381a43", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_id = config.get_config_value(\"LIGHTNINGROD_DATASET_ID\")\n", + "\n", + "dataset = lr.datasets.get(dataset_id)\n", + "_ = dataset.download()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9418ddf", + "metadata": {}, + "outputs": [], + "source": [ + "from lightningrod import prepare_for_training, FilterParams, SplitParams\n", + "\n", + "train_dataset, test_dataset = prepare_for_training(\n", + " dataset,\n", + " filter=FilterParams(days_to_resolution_range=(90, None)),\n", + " split=SplitParams(test_size=0.2),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "01e98826", + "metadata": {}, + "source": [ + "## Estimate training cost\n", + "\n", + "Before starting a job, use `estimate_cost` to see the expected cost and token usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4283478f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated cost: $0.02\n", + "Effective steps: 3\n", + "Train tokens: 63,349\n", + "Notes: Estimate uses per-answer-type output token estimates; actual may vary\n" + ] + } ], - "text/plain": [ - "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1;92m>> Training COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mJob:\u001b[0m Forecasting fine-tune \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -0.4030 avg -0.8912 (3 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.01 \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + "source": [ + "from lightningrod import TrainingConfig\n", + "\n", + "config = TrainingConfig(\n", + " base_model=\"Qwen/Qwen3-4B-Instruct-2507\",\n", + " training_steps=50,\n", + ")\n", + "cost_estimate = lr.training.estimate_cost(config, dataset=train_dataset)\n", + "print(f\"Estimated cost: ${cost_estimate.total_cost_dollars:.2f}\")\n", + "print(f\"Effective steps: {cost_estimate.effective_steps}\")\n", + "print(f\"Train tokens: {cost_estimate.train_tokens:,}\")\n", + "print(f\"Notes: {cost_estimate.notes}\")" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - 
"text": [ - "Job bd114679-610a-4334-8802-13d047a7bc30 completed with status: COMPLETED\n", - "Trained model ID: checkpoint:bd114679-610a-4334-8802-13d047a7bc30\n" - ] - } - ], - "source": [ - "job = lr.training.run(config, dataset=train_dataset, name=\"Forecasting fine-tune\")\n", - "print(f\"Job {job.id} completed with status: {job.status}\")\n", - "print(f\"Trained model ID: {job.model_id}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4fe2792c", - "metadata": {}, - "source": [ - "## Inference with your trained model\n", - "\n", - "Use `lr.predict()` to run inference with your trained model. You can also use the OpenAI-compatible API directly — see [08_foresight_model.ipynb](08_foresight_model.ipynb) for the pre-trained foresight model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "50744b98", - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "id": "ef160c9c", + "metadata": {}, + "source": [ + "## Start training\n", + "\n", + "`run` creates a job and polls until completion with a live progress display.\n" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "<answer>0.3</answer>\n" - ] + "cell_type": "code", + "execution_count": 6, + "id": "b7800672", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Training COMPLETED</span> <span style=\"color: #0000ff; 
text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Job:</span> Forecasting fine-tune <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Reward:</span> latest -0.3752 avg -0.8808 (3 steps) <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(higher is better)</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.01 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m 
\u001b[1;92m>> Training COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mJob:\u001b[0m Forecasting fine-tune \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -0.3752 avg -0.8808 (3 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.01 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job 73184285-cd39-4afe-af8e-ceab345d80dc completed with status: COMPLETED\n", + "Trained model ID: checkpoint:73184285-cd39-4afe-af8e-ceab345d80dc\n" + ] + } + ], + "source": [ + "job = lr.training.run(config, dataset=train_dataset, name=\"Forecasting fine-tune\")\n", + "print(f\"Job {job.id} completed with status: {job.status}\")\n", + "print(f\"Trained model ID: {job.model_id}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4fe2792c", + "metadata": {}, + "source": [ + "## Inference with your trained model\n", + "\n", + "Use `lr.predict()` to run inference with your trained model. 
You can also use the OpenAI-compatible API directly — see [08_foresight_model.ipynb](08_foresight_model.ipynb) for the pre-trained foresight model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "50744b98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<answer>0.35</answer>\n" + ] + } + ], + "source": [ + "print(lr.predict(job.model_id, \"Will the Fed cut rates by 25bp in March 2026?\"))\n" + ] + }, + { + "cell_type": "markdown", + "id": "c8d360d3", + "metadata": {}, + "source": [ + "## Run evals on trained model\n", + "\n", + "Run test evals on your trained model against a test dataset. The eval job runs the model on the dataset and reports metrics. Use the same dataset for a quick check, or a separate test split for production." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9dd52fd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Eval COMPLETED</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">ID:</span> 
4a78c3a1-a1a6-4288-9cd1-974066ddcc66 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Model:</span> checkpoint:73184285-cd39-4afe-af8e-ceab345d80dc <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Dataset:</span> e87e04c3-4c0d-49ab-97bf-b30d724395d3 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Metric </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> base </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> trained </span>┃ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> brier_score </span>│ 0.1925 │ 0.1961 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> ece </span>│ 0.0671 
│ 0.0510 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_reward </span>│ -0.5715 │ -0.5797 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> mean_valid_reward </span>│ -0.5715 │ -0.5797 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_samples </span>│ 81 │ 81 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> n_valid </span>│ 81 │ 81 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> parse_rate </span>│ 1.0000 │ 1.0000 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_cost </span>│ 0.0016 │ 0.0016 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> total_input_tokens </span>│ 20154 │ 20154 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> │<span style=\"color: #7f7f7f; text-decoration-color: 
#7f7f7f\"> total_output_tokens </span>│ 787 │ 787 │ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> └─────────────────────┴─────────┴─────────┘ <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.00 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[94m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1;92m>> Eval COMPLETED\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mID:\u001b[0m 4a78c3a1-a1a6-4288-9cd1-974066ddcc66 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mModel:\u001b[0m checkpoint:73184285-cd39-4afe-af8e-ceab345d80dc \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mDataset:\u001b[0m e87e04c3-4c0d-49ab-97bf-b30d724395d3 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ 
\u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mMetric \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m base\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mtrained\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mbrier_score \u001b[0m\u001b[2m \u001b[0m│ 0.1925 │ 0.1961 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mece \u001b[0m\u001b[2m \u001b[0m│ 0.0671 │ 0.0510 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_reward \u001b[0m\u001b[2m \u001b[0m│ -0.5715 │ -0.5797 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mmean_valid_reward \u001b[0m\u001b[2m \u001b[0m│ -0.5715 │ -0.5797 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_samples \u001b[0m\u001b[2m \u001b[0m│ 81 │ 81 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mn_valid \u001b[0m\u001b[2m \u001b[0m│ 81 │ 81 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mparse_rate \u001b[0m\u001b[2m \u001b[0m│ 1.0000 │ 1.0000 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_cost \u001b[0m\u001b[2m \u001b[0m│ 0.0016 │ 0.0016 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_input_tokens \u001b[0m\u001b[2m \u001b[0m│ 20154 │ 20154 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m │\u001b[2m \u001b[0m\u001b[2mtotal_output_tokens\u001b[0m\u001b[2m \u001b[0m│ 787 │ 787 │ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m └─────────────────────┴─────────┴─────────┘ \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.00 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m 
\u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "72c2631e", + "metadata": {}, + "source": [ + "> Note: the trained model checkpoint will only be available for the period of 7 days. If you wish to host this model long-term, reach out to us at support@lightningrod.ai." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (lightningrod-sdk)", + "language": "python", + "name": "lightningrod-sdk" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" } - ], - "source": [ - "print(lr.predict(job.model_id, \"Will the Fed cut rates by 25bp in March 2026?\"))\n" - ] - }, - { - "cell_type": "markdown", - "id": "c8d360d3", - "metadata": {}, - "source": [ - "## Run evals on trained model\n", - "\n", - "Run test evals on your trained model against a test dataset. The eval job runs the model on the dataset and reports metrics. Use the same dataset for a quick check, or a separate test split for production." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9dd52fd4", - "metadata": {}, - "outputs": [], - "source": [ - "eval_job = lr.evals.run(model_id=job.model_id, dataset=test_dataset)" - ] - }, - { - "cell_type": "markdown", - "id": "72c2631e", - "metadata": {}, - "source": [ - "> Note: the trained model checkpoint will only be available for the period of 7 days. If you wish to host this model long-term, reach out to us at support@lightningrod.ai." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (lightningrod-sdk)", - "language": "python", - "name": "lightningrod-sdk" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/src/lightningrod/__init__.py b/src/lightningrod/__init__.py index dd1ecfe..3f3eda9 100644 --- a/src/lightningrod/__init__.py +++ b/src/lightningrod/__init__.py @@ -9,7 +9,7 @@ from lightningrod import preprocessing, training, utils from lightningrod.utils.sample import create_sample from lightningrod.utils.models import open_router_model -from lightningrod.training import filter_and_split +from lightningrod.training import prepare_for_training, FilterParams, DedupParams, SplitParams from lightningrod.training.client import TrainingConfig from lightningrod._generated.models import ( TransformJob, @@ -95,7 +95,10 @@ "Rollout", "RolloutScorer", "RolloutGenerator", - "filter_and_split", + "prepare_for_training", + "FilterParams", + "DedupParams", + "SplitParams", "TrainingConfig", "Sample", "SampleMeta", diff --git a/src/lightningrod/_display.py b/src/lightningrod/_display.py index ccd418c..9492fd7 100644 --- a/src/lightningrod/_display.py +++ b/src/lightningrod/_display.py @@ -466,13 +466,13 @@ def _build_invalid_samples_error_message(original_message: str) -> Group: renderables.append(_safe_markup("[bold]Next steps:[/bold]")) renderables.append(_safe_markup(" • Check the dataset samples to see specific failure reasons in the 'meta.filter_reason' field")) - renderables.append(_safe_markup(" • Adjust and retry the transform pipeline (e.g., lower confidence thresholds, relax filter criteria)")) + renderables.append(_safe_markup(" • Adjust and retry the transform pipeline 
(e.g., try a wider date range)")) renderables.append(_safe_markup(" • If the problem persists, contact support or open a GitHub issue: [link=https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues]https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues[/link]")) return Group(*renderables) -def display_error(message: str, title: str = "Error", job: Any = None) -> None: +def display_error(message: str, title: str = "Error", job: Any = None, response_body: str | None = None) -> None: console = Console() renderables: list[RenderableType] = [] @@ -484,6 +484,11 @@ def display_error(message: str, title: str = "Error", job: Any = None) -> None: else: renderables.append(_safe_markup(f"[bold]{message}[/bold]")) + if response_body is not None and response_body.strip(): + renderables.append(Text("")) + renderables.append(_safe_markup("[bold]Response body:[/bold]")) + renderables.append(Text(response_body.strip()[:2000], style="dim")) + if job is not None: cost_lines = _build_transform_cost_lines(job) if isinstance(job, TransformJob) else _build_cost_lines(job) if cost_lines: @@ -493,6 +498,86 @@ def display_error(message: str, title: str = "Error", job: Any = None) -> None: console.print(Panel(Group(*renderables), border_style="bright_red", padding=(1, 2))) +def display_prepare_report(report: Any, verbose: bool = True) -> None: + """Render a PrepareReport as a Rich panel. 
Used inside Jupyter notebooks.""" + from lightningrod.training.samples import PrepareReport + assert isinstance(report, PrepareReport) + stats = report.stats + console = Console() + renderables: list[RenderableType] = [] + + border = "bright_green" if report.is_healthy else "yellow" + header_style = "bold bright_green" if report.is_healthy else "bold yellow" + renderables.append(_safe_markup(f"[{header_style}]>> prepare_for_training[/{header_style}]")) + renderables.append(Text("")) + + if verbose or not report.is_healthy: + renderables.append(_safe_markup(f" [dim]Starting with {stats.total} samples[/dim]")) + renderables.append(Text("")) + + parts = [] + if stats.filter_invalid: + parts.append(f"{stats.filter_invalid} invalid") + if stats.filter_horizon: + part = f"{stats.filter_horizon} horizon" + if stats.filter_missing_resolution_date or stats.filter_missing_prediction_date: + sub = [] + if stats.filter_missing_resolution_date: + sub.append(f"{stats.filter_missing_resolution_date} missing resolution date") + if stats.filter_missing_prediction_date: + sub.append(f"{stats.filter_missing_prediction_date} missing prediction date") + part += f" ({', '.join(sub)})" + parts.append(part) + if stats.filter_context: + parts.append(f"{stats.filter_context} missing context") + filter_line = ( + f" [bold]Filter:[/bold] Dropped {', '.join(parts)} → {stats.filter_kept} remain" + if parts else + f" [bold]Filter:[/bold] {stats.filter_kept} remain (0 dropped)" + ) + renderables.append(_safe_markup(filter_line)) + + if stats.dedup_removed > 0: + renderables.append(_safe_markup( + f" [bold]Dedup:[/bold] Removed {stats.dedup_removed} duplicates " + f"({stats.dedup_kept + stats.dedup_removed} → {stats.dedup_kept})" + )) + for k, c in stats.dedup_top_collisions: + q = repr(k[0])[:60] + ("..." 
if len(repr(k[0])) > 60 else "") + renderables.append(Text(f" ({q}, {k[1]}): {c} samples → 1", style="dim")) + else: + renderables.append(_safe_markup(f" [bold]Dedup:[/bold] {stats.dedup_kept} remain (0 duplicates)")) + + if stats.split_strategy == "temporal": + split_detail = f"Temporal: {stats.split_train_after} train | {stats.split_test_after} test" + if stats.split_no_sort_key: + split_detail += f" ({stats.split_no_sort_key} dropped, no prediction_date)" + renderables.append(_safe_markup(f" [bold]Split:[/bold] {split_detail}")) + if stats.split_train_excluded: + renderables.append(_safe_markup( + f" [yellow]{stats.split_train_excluded} train samples removed for leakage[/yellow]" + )) + else: + renderables.append(_safe_markup( + f" [bold]Split:[/bold] Random (test_size={stats.split_test_size}): " + f"{stats.split_train_after} train | {stats.split_test_after} test" + )) + + if not report.is_healthy: + renderables.append(Text("")) + renderables.append(_safe_markup("[bold yellow]⚠ Unhealthy split[/bold yellow]")) + for issue in report.issues: + renderables.append(Text("")) + renderables.append(Text(issue.message)) + if issue.tips: + renderables.append(Text("")) + renderables.append(_safe_markup(" [dim]Tips:[/dim]")) + for tip in issue.tips: + renderables.append(Text(f" • {tip}")) + + console.print(Panel(Group(*renderables), border_style=border, padding=(1, 2))) + + def display_warning(message: str, title: str = "Warning", job: Any = None) -> None: console = Console() renderables: list[RenderableType] = [] diff --git a/src/lightningrod/training/__init__.py b/src/lightningrod/training/__init__.py index 2b7d7e4..40a7073 100644 --- a/src/lightningrod/training/__init__.py +++ b/src/lightningrod/training/__init__.py @@ -4,10 +4,13 @@ from lightningrod.training.samples import ( deduplicate_samples, filter_samples, - filter_and_split, + prepare_for_training, train_test_split, to_messages, to_record, + FilterParams, + DedupParams, + SplitParams, ) __all__ = [ @@ -15,10 
+18,13 @@ "print_eval", "TrainingClient", "TrainingConfig", - "filter_and_split", + "prepare_for_training", "train_test_split", "deduplicate_samples", "filter_samples", "to_record", "to_messages", + "FilterParams", + "DedupParams", + "SplitParams", ] diff --git a/src/lightningrod/training/samples.py b/src/lightningrod/training/samples.py index 4977dc2..d922677 100644 --- a/src/lightningrod/training/samples.py +++ b/src/lightningrod/training/samples.py @@ -32,26 +32,93 @@ @dataclass class PrepareStats: - """Tracks metrics collected during prepare_for_training.""" + """Tracks metrics collected during prepare_for_training. + + Fields are grouped by pipeline stage. Invariants: + filter_kept = total - filter_invalid - filter_horizon - filter_context + dedup_kept = filter_kept - dedup_removed + split_train_before + split_test_before + split_no_sort_key = dedup_kept + split_train_after = split_train_before - split_train_excluded + split_test_after = split_test_before - split_test_excluded + """ + + # ── Input ──────────────────────────────────────────────────────────────── total: int = 0 + # ── Stage 1: filter_samples ─────────────────────────────────────────────── + # Counts of samples dropped at each validity check; filter_kept is what survives. 
filter_invalid: int = 0 - filter_horizon: int = 0 - filter_context: int = 0 - filter_missing_resolution_date: int = 0 - filter_missing_prediction_date: int = 0 + filter_horizon: int = 0 # outside days_to_resolution_range + filter_context: int = 0 # missing / empty rendered_context + filter_missing_resolution_date: int = 0 # sub-count of filter_horizon + filter_missing_prediction_date: int = 0 # sub-count of filter_horizon filter_kept: int = 0 + # ── Stage 2: deduplicate_samples ───────────────────────────────────────── dedup_removed: int = 0 dedup_kept: int = 0 dedup_top_collisions: list[tuple[tuple[Any, ...], int]] = field(default_factory=list) + # ── Stage 3: train_test_split ───────────────────────────────────────────── split_strategy: str = "" - split_test_size: float | None = None - split_no_sort_key: int = 0 - split_leaky: int = 0 - split_train: int = 0 - split_test: int = 0 + split_test_size: float | None = None # only set for random splits + + # Samples dropped before the train/test buckets are formed (temporal only). 
+ split_no_sort_key: int = 0 # missing prediction_date; dropped entirely + + # Train side: before → excluded → after + split_train_before: int = 0 # initial train bucket size before leakage removal + split_train_excluded: int = 0 # removed from train for temporal leakage (filter_leaky_train=True) + split_train_after: int = 0 # final train count = split_train_before - split_train_excluded + + # Test side: before → excluded → after + split_test_before: int = 0 # initial test bucket size before any test-side filtering + split_test_excluded: int = 0 # removed from test (reserved for future test-side filtering) + split_test_after: int = 0 # final test count = split_test_before - split_test_excluded + + +@dataclass +class FilterParams: + """Parameters for :func:`filter_samples`.""" + days_to_resolution_range: DaysToResolutionRange = None + drop_missing_context: bool = False + + +@dataclass +class DedupParams: + """Parameters for :func:`deduplicate_samples`.""" + key_fn: Callable[[Sample], tuple[Any, ...]] | None = None + + +@dataclass +class SplitParams: + """Parameters for :func:`train_test_split`.""" + strategy: str = "temporal" + test_size: float | None = 0.2 + test_start: str | None = None + random_state: int = 196 + sort_key: Callable[[Sample], str | None] | None = None + leakage_keys: list[Callable[[Sample], str | None]] | None = None + filter_leaky_train: bool = True + + +@dataclass +class PrepareIssue: + """A single detected problem in the prepared dataset, with actionable tips.""" + message: str + tips: list[str] = field(default_factory=list) + + +@dataclass +class PrepareReport: + """Full report produced by :func:`prepare_for_training`, covering all pipeline stages.""" + stats: PrepareStats + issues: list[PrepareIssue] = field(default_factory=list) + + @property + def is_healthy(self) -> bool: + return not self.issues + def _validate_days_to_resolution_range(value: Any) -> None: if value is None: @@ -85,14 +152,14 @@ def _parse_date(value: Any) -> 
Optional[date]: def filter_samples( samples: list[Sample], - days_to_resolution_range: DaysToResolutionRange = None, - drop_missing_context: bool = True, + params: FilterParams | None = None, stats: PrepareStats | None = None, ) -> list[Sample]: """Filter samples by validity, horizon, and optional context presence.""" - _validate_days_to_resolution_range(days_to_resolution_range) - min_horizon = days_to_resolution_range[0] if days_to_resolution_range else None - max_horizon = days_to_resolution_range[1] if days_to_resolution_range else None + params = params or FilterParams() + _validate_days_to_resolution_range(params.days_to_resolution_range) + min_horizon = params.days_to_resolution_range[0] if params.days_to_resolution_range else None + max_horizon = params.days_to_resolution_range[1] if params.days_to_resolution_range else None n_invalid = n_horizon = n_context = n_missing_resolution_date = n_missing_prediction_date = 0 filtered: list[Sample] = [] @@ -133,7 +200,7 @@ def filter_samples( if max_horizon is not None and horizon_days > max_horizon: n_horizon += 1 continue - if drop_missing_context: + if params.drop_missing_context: if not sample.context: n_context += 1 continue @@ -236,28 +303,23 @@ def get_resolution_date(sample: Sample) -> str | None: def train_test_split( samples: list[Sample], - *, - split_strategy: str = "temporal", - test_start: str | None = None, - test_size: float | None = None, - random_state: int = 196, - sort_key: Callable[[Sample], str | None] | None = None, - leakage_keys: list[Callable[[Sample], str | None]] | None = None, - filter_leaky_train: bool = True, + params: SplitParams | None = None, stats: PrepareStats | None = None, ) -> tuple[list[str], list[str]]: """Split samples into train/test by temporal order or random shuffle, with optional leakage filtering. 
Returns (train_ids, test_ids) for memory efficiency.""" - temporal_split = split_strategy == "temporal" + params = params or SplitParams() + temporal_split = params.strategy == "temporal" if temporal_split: - if (test_start is None) == (test_size is None): - raise ValueError("Provide exactly one of test_start or test_size when split_strategy='temporal'") + if (params.test_start is None) == (params.test_size is None): + raise ValueError("Provide exactly one of test_start or test_size when strategy='temporal'") else: - if test_size is None: - raise ValueError("test_size is required when split_strategy='random'") - if test_start is not None: - raise ValueError("test_start is only valid when split_strategy='temporal'") + if params.test_size is None: + raise ValueError("test_size is required when strategy='random'") + if params.test_start is not None: + raise ValueError("test_start is only valid when strategy='temporal'") + sort_key = params.sort_key if sort_key is None: def default_sort_key(sample: Sample) -> str | None: if not sample.question: @@ -271,24 +333,23 @@ def default_sort_key(sample: Sample) -> str | None: return None sort_key = default_sort_key - if leakage_keys is None: - leakage_keys = _default_leakage_keys() + leakage_keys = params.leakage_keys or _default_leakage_keys() if temporal_split: valid_samples = [r for r in samples if sort_key(r) is not None] n_no_sort_key = len(samples) - len(valid_samples) sorted_samples = sorted(valid_samples, key=sort_key) - if test_size is not None: - split_idx = int(len(sorted_samples) * (1 - test_size)) + if params.test_size is not None: + split_idx = int(len(sorted_samples) * (1 - params.test_size)) train, test = sorted_samples[:split_idx], sorted_samples[split_idx:] else: - assert test_start is not None - train = [r for r in sorted_samples if sort_key(r) is not None and sort_key(r) < test_start] - test = [r for r in sorted_samples if sort_key(r) is not None and sort_key(r) >= test_start] + assert params.test_start is 
not None + train = [r for r in sorted_samples if sort_key(r) is not None and sort_key(r) < params.test_start] + test = [r for r in sorted_samples if sort_key(r) is not None and sort_key(r) >= params.test_start] n_leaky = 0 - if filter_leaky_train and test: + if params.filter_leaky_train and test: test_cutoff = sort_key(test[0]) if test_cutoff is not None: def is_safe(row: Sample) -> bool: @@ -303,28 +364,35 @@ def is_safe(row: Sample) -> bool: n_leaky = train_before - len(train) if stats is not None: - stats.split_strategy = "temporal" + stats.split_strategy = params.strategy stats.split_no_sort_key = n_no_sort_key - stats.split_leaky = n_leaky - stats.split_train = len(train) - stats.split_test = len(test) + stats.split_train_before = len(train) + n_leaky + stats.split_train_excluded = n_leaky + stats.split_train_after = len(train) + stats.split_test_before = len(test) + stats.split_test_excluded = 0 + stats.split_test_after = len(test) return [s.id for s in train], [s.id for s in test] shuffled = list(samples) - rng = random.Random(random_state) if random_state is not None else random + rng = random.Random(params.random_state) if params.random_state is not None else random rng.shuffle(shuffled) - assert test_size is not None - split_idx = int(len(shuffled) * (1 - test_size)) + assert params.test_size is not None + split_idx = int(len(shuffled) * (1 - params.test_size)) train = shuffled[:split_idx] test = shuffled[split_idx:] if stats is not None: - stats.split_strategy = "random" - stats.split_test_size = test_size - stats.split_train = len(train) - stats.split_test = len(test) + stats.split_strategy = params.strategy + stats.split_test_size = params.test_size + stats.split_train_before = len(train) + stats.split_train_excluded = 0 + stats.split_train_after = len(train) + stats.split_test_before = len(test) + stats.split_test_excluded = 0 + stats.split_test_after = len(test) return [s.id for s in train], [s.id for s in test] @@ -343,11 +411,12 @@ def 
_default_dedup_key(sample: Sample) -> tuple[Any, ...]: def deduplicate_samples( samples: list[Sample], - key_fn: Callable[[Sample], tuple[Any, ...]] | None = None, + params: DedupParams | None = None, stats: PrepareStats | None = None, ) -> list[Sample]: """Remove duplicate samples by (question_text, resolution_date) or custom key.""" - key_fn_local: Callable[[Sample], tuple[Any, ...]] = key_fn or _default_dedup_key + params = params or DedupParams() + key_fn_local: Callable[[Sample], tuple[Any, ...]] = params.key_fn or _default_dedup_key seen: set[tuple[Any, ...]] = set() key_counts: dict[tuple[Any, ...], int] = {} result: list[Sample] = [] @@ -582,59 +651,127 @@ def _render_context(context: list[Union[NewsContext, RAGContext]]) -> str: return "\n\n".join(rendered_sections) -def _print_stats(stats: PrepareStats) -> None: - print(f"[prepare_for_training] Starting with {stats.total} samples") - - parts = [] - if stats.filter_invalid: - parts.append(f"{stats.filter_invalid} invalid") - if stats.filter_horizon: - part = f"{stats.filter_horizon} horizon" - if stats.filter_missing_resolution_date or stats.filter_missing_prediction_date: - sub = [] - if stats.filter_missing_resolution_date: - sub.append(f"{stats.filter_missing_resolution_date} missing resolution date") - if stats.filter_missing_prediction_date: - sub.append(f"{stats.filter_missing_prediction_date} missing prediction date") - part += f" ({', '.join(sub)})" - parts.append(part) - if stats.filter_context: - parts.append(f"{stats.filter_context} missing context") - if parts: - print(f"[filter] Dropped {', '.join(parts)} → {stats.filter_kept} remain") - else: - print(f"[filter] {stats.filter_kept} remain (0 dropped)") +def _build_report(stats: PrepareStats, split: SplitParams, filter: FilterParams) -> PrepareReport: + """Build a structured PrepareReport from pipeline stats and params. 
Pure — no side effects.""" + issues: list[PrepareIssue] = [] - if stats.dedup_removed > 0: - print(f"[dedup] Removed {stats.dedup_removed} duplicates ({stats.dedup_kept + stats.dedup_removed} → {stats.dedup_kept}). Top colliding keys:") - for k, c in stats.dedup_top_collisions: - q = repr(k[0])[:60] + ("..." if len(repr(k[0])) > 60 else "") - print(f" ({q}, {k[1]}): {c} samples → 1") - else: - print(f"[dedup] {stats.dedup_kept} remain (0 duplicates)") - - if stats.split_strategy == "temporal": - if stats.split_no_sort_key: - print(f"[split] {stats.split_no_sort_key} samples had no prediction_date (dropped)") - if stats.split_leaky: - print(f"[split] {stats.split_leaky} train samples removed for leakage") - print(f"[split] Temporal split: {stats.split_train} train, {stats.split_test} test") - else: - print(f"[split] Random split (test_size={stats.split_test_size}): {stats.split_train} train, {stats.split_test} test") + majority_train_leaked = ( + split.filter_leaky_train + and split.strategy == "temporal" + and stats.split_train_before > 0 + and stats.split_train_excluded > stats.split_train_before // 2 + ) + if majority_train_leaked: + pct = int(100 * stats.split_train_excluded / stats.split_train_before) + tips: list[str] = [] + if split.test_start is None: + max_horizon = filter.days_to_resolution_range[1] if filter.days_to_resolution_range else None + buffer = f"{max_horizon}" if max_horizon else "your max resolution horizon" + tips.append( + f"Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least " + f"{buffer} days before your last question date, giving train questions room to resolve before the test window." + ) + if filter.days_to_resolution_range is not None and filter.days_to_resolution_range[1] is not None: + tips.append( + f"Tighten days_to_resolution_range — the current max of {filter.days_to_resolution_range[1]} days " + "means train resolution dates extend far into the test window. 
Reducing it shrinks the bleed-over zone." + ) + tips.append( + "Generate more samples across a wider date range. With questions spread over a longer period, the " + "temporal split cutoff moves far enough back that earlier questions resolve well before the test window." + ) + tips.append( + "Set filter_leaky_train=False to disable leakage removal. Only do this if you are confident the " + "resolution dates do not reveal information that was unavailable at prediction time." + ) + issues.append(PrepareIssue( + message=( + f"{stats.split_train_excluded}/{stats.split_train_before} train samples ({pct}%) were removed " + "for temporal leakage — the date_close or resolution_date of train questions extends into the test period." + ), + tips=tips, + )) + + all_test_excluded = stats.split_test_after == 0 and stats.split_test_excluded == 0 and stats.dedup_kept > 0 + if all_test_excluded: + test_tips: list[str] = [] + if split.strategy == "temporal": + if split.test_start is not None: + test_tips.append( + f"The test_start=\"{split.test_start}\" cutoff may be after all sample dates. " + "Choose an earlier date or switch to test_size=0.2." + ) + else: + test_tips.append("Increase the dataset size so there are enough samples to fill the test fraction.") + test_tips.append( + f"Decrease test_size (currently {split.test_size}) if too few samples survive to the test window." + ) + else: + test_tips.append( + f"Increase the dataset size — with test_size={split.test_size} and only {stats.dedup_kept} samples " + "after filtering, the test set may round to zero." 
+ ) + issues.append(PrepareIssue(message="The test set is empty after splitting.", tips=test_tips)) + + return PrepareReport(stats=stats, issues=issues) -def filter_and_split( +def _print_report(report: PrepareReport, verbose: bool) -> None: + """Print the report to stdout and raise if unhealthy (non-notebook path).""" + stats = report.stats + if verbose: + print(f"[prepare_for_training] Starting with {stats.total} samples") + + parts = [] + if stats.filter_invalid: + parts.append(f"{stats.filter_invalid} invalid") + if stats.filter_horizon: + part = f"{stats.filter_horizon} horizon" + if stats.filter_missing_resolution_date or stats.filter_missing_prediction_date: + sub = [] + if stats.filter_missing_resolution_date: + sub.append(f"{stats.filter_missing_resolution_date} missing resolution date") + if stats.filter_missing_prediction_date: + sub.append(f"{stats.filter_missing_prediction_date} missing prediction date") + part += f" ({', '.join(sub)})" + parts.append(part) + if stats.filter_context: + parts.append(f"{stats.filter_context} missing context") + print(f"[filter] Dropped {', '.join(parts)} → {stats.filter_kept} remain" if parts else f"[filter] {stats.filter_kept} remain (0 dropped)") + + if stats.dedup_removed > 0: + print(f"[dedup] Removed {stats.dedup_removed} duplicates ({stats.dedup_kept + stats.dedup_removed} → {stats.dedup_kept}). Top colliding keys:") + for k, c in stats.dedup_top_collisions: + q = repr(k[0])[:60] + ("..." 
if len(repr(k[0])) > 60 else "") + print(f" ({q}, {k[1]}): {c} samples → 1") + else: + print(f"[dedup] {stats.dedup_kept} remain (0 duplicates)") + + if stats.split_strategy == "temporal": + if stats.split_no_sort_key: + print(f"[split] {stats.split_no_sort_key} samples had no prediction_date (dropped)") + if stats.split_train_excluded: + print(f"[split] {stats.split_train_excluded} train samples removed for leakage") + print(f"[split] Temporal split: {stats.split_train_after} train, {stats.split_test_after} test") + else: + print(f"[split] Random split (test_size={stats.split_test_size}): {stats.split_train_after} train, {stats.split_test_after} test") + + if not report.is_healthy: + lines = ["[prepare_for_training] Unhealthy split detected."] + for issue in report.issues: + lines.append(issue.message) + if issue.tips: + lines.append("Tips:\n" + "\n".join(f" - {t}" for t in issue.tips)) + raise ValueError("\n\n".join(lines)) + + +def prepare_for_training( dataset: "SampleDataset", *, - test_size: float = 0.2, - split_strategy: str = "temporal", - test_start: str | None = None, - drop_missing_context: bool = False, - days_to_resolution_range: DaysToResolutionRange = None, - random_state: int = 196, - filter_leaky_train: bool = True, - deduplicate_key_fn: Callable[[Sample], tuple[Any, ...]] | None = None, - verbose: bool = False, + filter: FilterParams | None = None, + dedup: DedupParams | None = None, + split: SplitParams | None = None, + verbose: bool = True, ) -> tuple["SampleDataset", "SampleDataset"]: """Prepare a dataset for model training: filter, deduplicate, split into train/test. @@ -644,42 +781,30 @@ def filter_and_split( Args: dataset: SampleDataset to prepare (samples are fetched via dataset.samples()). - test_size: Fraction of samples for the test set (0.0–1.0). Default 0.2. - split_strategy: 'temporal' (default) or 'random'. - test_start: ISO date string for temporal splits. Provide exactly one of - test_start or test_size for temporal splits. 
- drop_missing_context: If True, exclude samples with no context. - days_to_resolution_range: Optional (min_days, max_days) tuple. - random_state: Seed for reproducible random splits. - filter_leaky_train: When True and temporal, remove temporal leakage. - deduplicate_key_fn: Optional function to customize deduplication key. + filter: Controls validity filtering and horizon range. See :class:`FilterParams`. + dedup: Controls deduplication key. See :class:`DedupParams`. + split: Controls train/test split strategy, size, and leakage filtering. See :class:`SplitParams`. verbose: When True, print step-by-step stats. Returns: (train_dataset, test_dataset): SampleDatasets ready for training/eval. """ + filter = filter or FilterParams() + dedup = dedup or DedupParams() + split = split or SplitParams() + samples = dataset.samples() stats = PrepareStats(total=len(samples)) - filtered = filter_samples( - samples, - days_to_resolution_range=days_to_resolution_range, - drop_missing_context=drop_missing_context, - stats=stats, - ) - deduped = deduplicate_samples(filtered, key_fn=deduplicate_key_fn, stats=stats) - - train_ids, test_ids = train_test_split( - deduped, - split_strategy=split_strategy, - test_start=test_start, - filter_leaky_train=filter_leaky_train, - test_size=test_size, - random_state=random_state, - stats=stats, - ) + filtered = filter_samples(samples, filter, stats=stats) + deduped = deduplicate_samples(filtered, dedup, stats=stats) + train_ids, test_ids = train_test_split(deduped, split, stats=stats) - if verbose: - _print_stats(stats) + report = _build_report(stats, split=split, filter=filter) + from lightningrod._display import _is_notebook, display_prepare_report + if _is_notebook(): + display_prepare_report(report, verbose=verbose) + else: + _print_report(report, verbose=verbose) return dataset.subset(train_ids), dataset.subset(test_ids) \ No newline at end of file From b479e979c1226dc8eaf6defa7b86149a9897e0b0 Mon Sep 17 00:00:00 2001 From: Bartolomej 
Kozorog <bartolomej.kozorog@gmail.com> Date: Thu, 19 Mar 2026 15:37:02 +0100 Subject: [PATCH 4/5] cleanup, more helpful issue tips --- src/lightningrod/_display.py | 21 +-- src/lightningrod/training/samples.py | 195 +++++++++++++++++---------- 2 files changed, 128 insertions(+), 88 deletions(-) diff --git a/src/lightningrod/_display.py b/src/lightningrod/_display.py index 9492fd7..790bc4e 100644 --- a/src/lightningrod/_display.py +++ b/src/lightningrod/_display.py @@ -548,27 +548,20 @@ def display_prepare_report(report: Any, verbose: bool = True) -> None: else: renderables.append(_safe_markup(f" [bold]Dedup:[/bold] {stats.dedup_kept} remain (0 duplicates)")) - if stats.split_strategy == "temporal": - split_detail = f"Temporal: {stats.split_train_after} train | {stats.split_test_after} test" - if stats.split_no_sort_key: - split_detail += f" ({stats.split_no_sort_key} dropped, no prediction_date)" - renderables.append(_safe_markup(f" [bold]Split:[/bold] {split_detail}")) - if stats.split_train_excluded: - renderables.append(_safe_markup( - f" [yellow]{stats.split_train_excluded} train samples removed for leakage[/yellow]" - )) - else: + split_detail = f"Splits: {stats.split_train_after} train | {stats.split_test_after} test ({stats.split_no_sort_key} dropped, no prediction_date)" + renderables.append(_safe_markup(f" [bold]Split:[/bold] {split_detail}")) + n_leaked = stats.split_train_before - stats.split_train_after + if n_leaked: renderables.append(_safe_markup( - f" [bold]Split:[/bold] Random (test_size={stats.split_test_size}): " - f"{stats.split_train_after} train | {stats.split_test_after} test" + f" [yellow]{n_leaked} train samples removed for leakage[/yellow]" )) if not report.is_healthy: renderables.append(Text("")) - renderables.append(_safe_markup("[bold yellow]⚠ Unhealthy split[/bold yellow]")) + renderables.append(_safe_markup("[bold yellow]⚠ Unhealthy dataset[/bold yellow]")) for issue in report.issues: renderables.append(Text("")) - 
renderables.append(Text(issue.message)) + renderables.append(Text(issue.message, style="bold")) if issue.tips: renderables.append(Text("")) renderables.append(_safe_markup(" [dim]Tips:[/dim]")) diff --git a/src/lightningrod/training/samples.py b/src/lightningrod/training/samples.py index d922677..1d90afd 100644 --- a/src/lightningrod/training/samples.py +++ b/src/lightningrod/training/samples.py @@ -37,21 +37,19 @@ class PrepareStats: Fields are grouped by pipeline stage. Invariants: filter_kept = total - filter_invalid - filter_horizon - filter_context dedup_kept = filter_kept - dedup_removed - split_train_before + split_test_before + split_no_sort_key = dedup_kept - split_train_after = split_train_before - split_train_excluded - split_test_after = split_test_before - split_test_excluded + split_train_before + split_test_after + split_no_sort_key = dedup_kept (temporal) + split_train_excluded = split_train_before - split_train_after (inferred; train leakage only) """ # ── Input ──────────────────────────────────────────────────────────────── total: int = 0 # ── Stage 1: filter_samples ─────────────────────────────────────────────── - # Counts of samples dropped at each validity check; filter_kept is what survives. 
filter_invalid: int = 0 - filter_horizon: int = 0 # outside days_to_resolution_range - filter_context: int = 0 # missing / empty rendered_context - filter_missing_resolution_date: int = 0 # sub-count of filter_horizon - filter_missing_prediction_date: int = 0 # sub-count of filter_horizon + filter_horizon: int = 0 + filter_context: int = 0 + filter_missing_resolution_date: int = 0 + filter_missing_prediction_date: int = 0 filter_kept: int = 0 # ── Stage 2: deduplicate_samples ───────────────────────────────────────── @@ -60,21 +58,11 @@ class PrepareStats: dedup_top_collisions: list[tuple[tuple[Any, ...], int]] = field(default_factory=list) # ── Stage 3: train_test_split ───────────────────────────────────────────── - split_strategy: str = "" - split_test_size: float | None = None # only set for random splits + split_no_sort_key: int = 0 - # Samples dropped before the train/test buckets are formed (temporal only). - split_no_sort_key: int = 0 # missing prediction_date; dropped entirely - - # Train side: before → excluded → after - split_train_before: int = 0 # initial train bucket size before leakage removal - split_train_excluded: int = 0 # removed from train for temporal leakage (filter_leaky_train=True) - split_train_after: int = 0 # final train count = split_train_before - split_train_excluded - - # Test side: before → excluded → after - split_test_before: int = 0 # initial test bucket size before any test-side filtering - split_test_excluded: int = 0 # removed from test (reserved for future test-side filtering) - split_test_after: int = 0 # final test count = split_test_before - split_test_excluded + split_train_before: int = 0 + split_train_after: int = 0 + split_test_after: int = 0 @dataclass @@ -364,13 +352,9 @@ def is_safe(row: Sample) -> bool: n_leaky = train_before - len(train) if stats is not None: - stats.split_strategy = params.strategy stats.split_no_sort_key = n_no_sort_key stats.split_train_before = len(train) + n_leaky - stats.split_train_excluded 
= n_leaky stats.split_train_after = len(train) - stats.split_test_before = len(test) - stats.split_test_excluded = 0 stats.split_test_after = len(test) return [s.id for s in train], [s.id for s in test] @@ -385,13 +369,8 @@ def is_safe(row: Sample) -> bool: test = shuffled[split_idx:] if stats is not None: - stats.split_strategy = params.strategy - stats.split_test_size = params.test_size stats.split_train_before = len(train) - stats.split_train_excluded = 0 stats.split_train_after = len(train) - stats.split_test_before = len(test) - stats.split_test_excluded = 0 stats.split_test_after = len(test) return [s.id for s in train], [s.id for s in test] @@ -654,64 +633,134 @@ def _render_context(context: list[Union[NewsContext, RAGContext]]) -> str: def _build_report(stats: PrepareStats, split: SplitParams, filter: FilterParams) -> PrepareReport: """Build a structured PrepareReport from pipeline stats and params. Pure — no side effects.""" issues: list[PrepareIssue] = [] + split_train_excluded = stats.split_train_before - stats.split_train_after + # Issue: majority of train samples leaked into test period majority_train_leaked = ( split.filter_leaky_train and split.strategy == "temporal" and stats.split_train_before > 0 - and stats.split_train_excluded > stats.split_train_before // 2 + and split_train_excluded > stats.split_train_before // 2 ) if majority_train_leaked: - pct = int(100 * stats.split_train_excluded / stats.split_train_before) + pct = int(100 * split_train_excluded / stats.split_train_before) tips: list[str] = [] - if split.test_start is None: - max_horizon = filter.days_to_resolution_range[1] if filter.days_to_resolution_range else None - buffer = f"{max_horizon}" if max_horizon else "your max resolution horizon" + max_horizon = filter.days_to_resolution_range[1] if filter.days_to_resolution_range else None + if max_horizon is not None: tips.append( - f"Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least " - f"{buffer} 
days before your last question date, giving train questions room to resolve before the test window." + f"Extend the seed generator date range to start earlier — the range should span at least " + f"{max_horizon * 2} days so questions generated near the start resolve well before the test window." ) - if filter.days_to_resolution_range is not None and filter.days_to_resolution_range[1] is not None: + else: tips.append( - f"Tighten days_to_resolution_range — the current max of {filter.days_to_resolution_range[1]} days " - "means train resolution dates extend far into the test window. Reducing it shrinks the bleed-over zone." + "Extend the seed generator (date) filter range to start earlier — questions generated near the start " + "will resolve well before the test window. Aim for at least 2× your max resolution horizon." ) tips.append( - "Generate more samples across a wider date range. With questions spread over a longer period, the " - "temporal split cutoff moves far enough back that earlier questions resolve well before the test window." + "Generate more samples by increasing max_questions in lr.transforms.run() or removing the limit, " + "or increase questions_per_seed in your question generator config. " + "A larger, temporally well-spread dataset naturally pushes the split cutoff far enough back." ) tips.append( - "Set filter_leaky_train=False to disable leakage removal. Only do this if you are confident the " - "resolution dates do not reveal information that was unavailable at prediction time." + "If very few seeds were returned by the pipeline (check the run summary table), the search queries " + "may not surface results across the full date range. Try more diverse search queries, increase " + "articles_per_search, or shorten interval_duration_days." 
) issues.append(PrepareIssue( message=( - f"{stats.split_train_excluded}/{stats.split_train_before} train samples ({pct}%) were removed " + f"{split_train_excluded}/{stats.split_train_before} train samples ({pct}%) were removed " "for temporal leakage — the date_close or resolution_date of train questions extends into the test period." ), tips=tips, )) - all_test_excluded = stats.split_test_after == 0 and stats.split_test_excluded == 0 and stats.dedup_kept > 0 - if all_test_excluded: - test_tips: list[str] = [] - if split.strategy == "temporal": - if split.test_start is not None: - test_tips.append( - f"The test_start=\"{split.test_start}\" cutoff may be after all sample dates. " - "Choose an earlier date or switch to test_size=0.2." - ) - else: - test_tips.append("Increase the dataset size so there are enough samples to fill the test fraction.") - test_tips.append( - f"Decrease test_size (currently {split.test_size}) if too few samples survive to the test window." - ) - else: - test_tips.append( - f"Increase the dataset size — with test_size={split.test_size} and only {stats.dedup_kept} samples " - "after filtering, the test set may round to zero." - ) - issues.append(PrepareIssue(message="The test set is empty after splitting.", tips=test_tips)) + # Issue: too few train samples for effective training + MIN_TRAIN_SAMPLES = 200 + if stats.split_train_after < MIN_TRAIN_SAMPLES and stats.split_train_after > 0: + issues.append(PrepareIssue( + message=( + f"Only {stats.split_train_after} train samples remain after preparation. " + f"This is below the recommended minimum of {MIN_TRAIN_SAMPLES} for effective training." + ), + tips=[ + "Increase max_questions in lr.transforms.run() to generate more samples.", + "Increase questions_per_seed in your question generator (ForwardLookingQuestionGenerator or QuestionGenerator) to produce more questions from each seed article.",
+ "Add more search queries to your seed generator to diversify seed sources.", + "Widen the seed generator date range (start_date to end_date) to capture more events.", + ], + )) + + # Issue: too few test samples for reliable evaluation + MIN_TEST_SAMPLES = 50 + if stats.split_test_after < MIN_TEST_SAMPLES and stats.split_test_after > 0: + issues.append(PrepareIssue( + message=( + f"Only {stats.split_test_after} test samples remain after preparation. " + f"This is below the recommended minimum of ~{MIN_TEST_SAMPLES} for reliable evaluation." + ), + tips=[ + "Generate more samples overall — test samples come from the most recent portion of your date range.", + "Ensure your seed generator date range extends close to the present so recent events appear in the test set.", + ], + )) + + # Issue: high invalid rate (>30% of samples were invalid) + HIGH_INVALID_THRESHOLD = 0.30 + if stats.total > 0 and stats.filter_invalid / stats.total > HIGH_INVALID_THRESHOLD: + pct = int(100 * stats.filter_invalid / stats.total) + issues.append(PrepareIssue( + message=( + f"{stats.filter_invalid}/{stats.total} samples ({pct}%) were marked invalid. " + "This suggests issues with dataset generation configuration." + ), + tips=[ + "Add more examples and bad_examples to guide the question generator toward more sensible questions.", + "Check the labeler configuration — if WebSearchLabeler can't find resolution info, samples are marked invalid.", + "Inspect a few invalid samples with dataset.flattened() to identify patterns.", + ], + )) + + # Issue: high dedup rate (>40% removed as duplicates) + HIGH_DEDUP_THRESHOLD = 0.40 + if stats.filter_kept > 0 and stats.dedup_removed / stats.filter_kept > HIGH_DEDUP_THRESHOLD: + pct = int(100 * stats.dedup_removed / stats.filter_kept) + issues.append(PrepareIssue( + message=( + f"{stats.dedup_removed}/{stats.filter_kept} samples ({pct}%) were duplicates. " + "The pipeline is generating repetitive or similar questions." 
+ ), + tips=[ + "Add more diverse search queries to your seed generator to surface different source articles.", + "Increase interval_duration_days to spread seeds across more time periods.", + "Add bad_examples to your question generator showing the repetitive patterns to avoid.", + "Use more specific instructions in your question generator to encourage variety.", + ], + )) + + # Issue: high horizon filter rate (>50% filtered out by horizon) + HIGH_HORIZON_THRESHOLD = 0.50 + if stats.total > 0 and stats.filter_horizon / stats.total > HIGH_HORIZON_THRESHOLD: + pct = int(100 * stats.filter_horizon / stats.total) + horizon_desc = "" + if filter.days_to_resolution_range: + min_h, max_h = filter.days_to_resolution_range + if min_h is not None and max_h is not None: + horizon_desc = f" (required: {min_h}-{max_h} days)" + elif min_h is not None: + horizon_desc = f" (required: ≥{min_h} days)" + elif max_h is not None: + horizon_desc = f" (required: ≤{max_h} days)" + issues.append(PrepareIssue( + message=( + f"{stats.filter_horizon}/{stats.total} samples ({pct}%) fell outside the resolution horizon{horizon_desc}. " + ), + tips=[ + "Widen your days_to_resolution_range in FilterParams if your use case allows longer/shorter horizons.", + "Adjust your seed generator date range — questions from very recent seeds may not have resolved yet, " + "while questions from old seeds may exceed your max horizon.", + "Check that your question generator is producing questions with appropriate resolution timelines for your target horizon." 
+ ], + )) return PrepareReport(stats=stats, issues=issues) @@ -747,14 +796,12 @@ def _print_report(report: PrepareReport, verbose: bool) -> None: else: print(f"[dedup] {stats.dedup_kept} remain (0 duplicates)") - if stats.split_strategy == "temporal": - if stats.split_no_sort_key: - print(f"[split] {stats.split_no_sort_key} samples had no prediction_date (dropped)") - if stats.split_train_excluded: - print(f"[split] {stats.split_train_excluded} train samples removed for leakage") - print(f"[split] Temporal split: {stats.split_train_after} train, {stats.split_test_after} test") - else: - print(f"[split] Random split (test_size={stats.split_test_size}): {stats.split_train_after} train, {stats.split_test_after} test") + if stats.split_no_sort_key: + print(f"[split] {stats.split_no_sort_key} samples had no prediction_date (dropped)") + n_leaked = stats.split_train_before - stats.split_train_after + if n_leaked: + print(f"[split] {n_leaked} train samples removed for leakage") + print(f"[split] Temporal split: {stats.split_train_after} train, {stats.split_test_after} test") if not report.is_healthy: lines = ["[prepare_for_training] Unhealthy split detected."] From 489add56202e7e6bfed712049808a0765e0dcb2b Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog <bartolomej.kozorog@gmail.com> Date: Thu, 19 Mar 2026 16:44:55 +0100 Subject: [PATCH 5/5] rerun trump forecasting --- .../fine_tuning/02_trump_forecasting.ipynb | 739 ++++++++++++++++-- 1 file changed, 674 insertions(+), 65 deletions(-) diff --git a/notebooks/fine_tuning/02_trump_forecasting.ipynb b/notebooks/fine_tuning/02_trump_forecasting.ipynb index 6447edc..31e5712 100644 --- a/notebooks/fine_tuning/02_trump_forecasting.ipynb +++ b/notebooks/fine_tuning/02_trump_forecasting.ipynb @@ -22,7 +22,7 @@ "True" ] }, - "execution_count": null, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -177,20 +177,24 @@ "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span 
style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">>> Pipeline Completed</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"font-weight: bold\">Total cost:</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">$0.03</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"font-weight: bold\">Total cost:</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">$1.90</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Step </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Progress </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> In </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Out </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> 
Rejected </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Errors </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejection Reasons </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Duration </span>┃ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsSeedGenerator… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 1 │ 10 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 2s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> ForwardLookingQue… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 10 │ 50 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> WebSearchLabelerT… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: 
bold\">Complete </span> │ 50 │ 45 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 5</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Resolution date is </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">before seed </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">creation date (4), </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Low confidence: </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">0.80 < 0.9 (1) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsContextGenera… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 45 │ 45 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: 
#7f7f7f\">- </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", - "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> └────────────────────┴──────────────────────┴────┴─────┴──────────┴────────┴─────────────────────┴──────────┘ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Step </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Progress </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> In </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Out </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejected </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Errors </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Rejection Reasons </span>┃<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\"> Duration </span>┃ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> NewsSeedGenerator… </span>│ <span style=\"color: #00ff00; 
text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 10 │ 95 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> ForwardLookingQue… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 95 │ 442 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 31</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">date_close not </span> │ 1s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">after event_date </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(31) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> WebSearchLabelerT… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 442 │ 380 │ <span style=\"color: #ff0000; text-decoration-color: #ff0000\"> 62</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: 
#7f7f7f\">Resolution date is</span> │ 4s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">before seed </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">creation date </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(36), Undetermined</span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">label (25), Low </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">confidence: 0.80 <</span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: bold\"> </span>│ │ │ │ │ │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">0.9 (1) </span> │ │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> │<span style=\"font-weight: 
bold\"> NewsContextGenera… </span>│ <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">Complete </span> │ 380 │ 380 │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0</span> │ <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">- </span> │ 57s │ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", + "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘ <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">│</span>\n", "<span style=\"color: #00ff00; text-decoration-color: #00ff00\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", "</pre>\n" @@ -200,20 +204,24 @@ "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[1;92m>> Pipeline Completed\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m \u001b[1mTotal cost:\u001b[0m \u001b[92m$0.03\u001b[0m \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m \u001b[1mTotal cost:\u001b[0m \u001b[92m$1.90\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mIn\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m 
\u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 1 │ 10 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 2s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 10 │ 50 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 50 │ 45 │ \u001b[91m 5\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2mResolution date is \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mbefore seed \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mcreation date (4), \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mLow confidence: \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m0.80 < 0.9 (1) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 45 │ 45 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", - "\u001b[92m│\u001b[0m 
└────────────────────┴──────────────────────┴────┴─────┴──────────┴────────┴─────────────────────┴──────────┘ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┃\u001b[1;36m \u001b[0m\u001b[1;36mStep \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mProgress \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36m In\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mOut\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejected\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mErrors\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mRejection Reasons \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mDuration\u001b[0m\u001b[1;36m \u001b[0m┃ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsSeedGenerator…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 10 │ 95 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mForwardLookingQue…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 95 │ 442 │ \u001b[91m 31\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2mdate_close not \u001b[0m │ 1s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mafter event_date \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m(31) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mWebSearchLabelerT…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 442 │ 380 │ \u001b[91m 62\u001b[0m │ 
\u001b[2m 0\u001b[0m │ \u001b[2mResolution date is\u001b[0m │ 4s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mbefore seed \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mcreation date \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m(36), Undetermined\u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mlabel (25), Low \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2mconfidence: 0.80 <\u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m│ │ │ │ │ │ \u001b[2m0.9 (1) \u001b[0m │ │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m │\u001b[1m \u001b[0m\u001b[1mNewsContextGenera…\u001b[0m\u001b[1m \u001b[0m│ \u001b[1;92mComplete \u001b[0m │ 380 │ 380 │ \u001b[2m 0\u001b[0m │ \u001b[2m 0\u001b[0m │ \u001b[2m- \u001b[0m │ 57s │ \u001b[92m│\u001b[0m\n", + "\u001b[92m│\u001b[0m └────────────────────┴──────────────────────┴─────┴─────┴──────────┴────────┴────────────────────┴──────────┘ \u001b[92m│\u001b[0m\n", "\u001b[92m│\u001b[0m \u001b[92m│\u001b[0m\n", "\u001b[92m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" ] @@ -225,12 +233,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "50 samples (90.0% valid)\n" + "473 samples (80.3% valid)\n" ] } ], "source": [ - "dataset = lr.transforms.run(pipeline, max_questions=20, name=\"WWTD-2025\")\n", + "dataset = lr.transforms.run(pipeline, max_questions=500, name=\"WWTD-2025\")\n", "\n", "samples = dataset.download()\n", "pct = (sum(1 for s in samples if s.is_valid is True) / len(samples) * 100) if samples else 0\n", @@ -249,33 +257,642 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5e4d3f2a", "metadata": {}, 
"outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000\">╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">>> prepare_for_training</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Starting with 473 samples</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"font-weight: bold\">Filter:</span> Dropped 93 invalid, 103 horizon → 277 remain <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"font-weight: bold\">Dedup:</span> 277 remain (0 duplicates) <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"font-weight: bold\">Split:</span> Splits: 167 train | 56 test (0 dropped, no prediction_date) <span 
style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">54 train samples removed for leakage</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">⚠ Unhealthy dataset</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"font-weight: bold\">Only 167 train samples remain after preparation. This is below the recommended minimum of 200 for effective </span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"font-weight: bold\">training.</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">Tips:</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> • Increase max_questions in lr.transforms.run() to generate more samples. 
<span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> • Increase questions_per_seed in your question generator (ForwardLookingQuestionGenerator or <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> QuestionGenerator) to produce more questions from each seed article.Add more search queries to your seed <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> generator to diversify seed sources. <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> • Widen the seed generator date range (start_date to end_date) to capture more events. <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">│</span> <span style=\"color: #808000; text-decoration-color: #808000\">│</span>\n", + "<span style=\"color: #808000; text-decoration-color: #808000\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[33m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1;33m>> prepare_for_training\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[2mStarting with 473 samples\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1mFilter:\u001b[0m Dropped 93 invalid, 103 horizon → 277 remain \u001b[33m│\u001b[0m\n", + 
"\u001b[33m│\u001b[0m \u001b[1mDedup:\u001b[0m 277 remain (0 duplicates) \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1mSplit:\u001b[0m Splits: 167 train | 56 test (0 dropped, no prediction_date) \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m54 train samples removed for leakage\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1;33m⚠ Unhealthy dataset\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1mOnly 167 train samples remain after preparation. This is below the recommended minimum of 200 for effective \u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1mtraining.\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[2mTips:\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m • Increase max_questions in lr.transforms.run() to generate more samples. \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m • Increase questions_per_seed in your question generator (ForwardLookingQuestionGenerator or \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m QuestionGenerator) to produce more questions from each seed article.Add more search queries to your seed \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m generator to diversify seed sources. \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m • Widen the seed generator date range (start_date to end_date) to capture more events. 
\u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "[prepare_for_training] Starting with 50 samples\n", - "[filter] Dropped 5 invalid, 16 horizon → 29 remain\n", - "[dedup] 29 remain (0 duplicates)\n", - "[split] 23 train samples removed for leakage\n", - "[split] Temporal split: 0 train, 6 test\n" + "167\n", + "Train: 167 rows, 21.0% yes\n" ] }, { - "ename": "ValueError", - "evalue": "[filter_and_split] Unhealthy split detected.\n\n23/23 train samples (100%) were removed for temporal leakage — the date_close or resolution_date of train questions extends into the test period.\n\nTips:\n - Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least 60 days before your last question date, giving train questions room to resolve before the test window.\n - Tighten days_to_resolution_range — the current max of 60 days means train resolution dates extend far into the test window. Reducing it shrinks the bleed-over zone.\n - Generate more samples across a wider date range. With questions spread over a longer period, the temporal split cutoff moves far enough back that earlier questions resolve well before the test window.\n - Set filter_leaky_train=False to disable leakage removal. 
Only do this if you are confident the resolution dates do not reveal information that was unavailable at prediction time.", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlightningrod\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m filter_and_split\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m train_dataset, test_dataset = \u001b[43mfilter_and_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemporal\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m60\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# horizon within 2 months\u001b[39;49;00m\n\u001b[32m 8\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, ds \u001b[38;5;129;01min\u001b[39;00m [(\u001b[33m\"\u001b[39m\u001b[33mTrain\u001b[39m\u001b[33m\"\u001b[39m, train_dataset), (\u001b[33m\"\u001b[39m\u001b[33mTest\u001b[39m\u001b[33m\"\u001b[39m, test_dataset)]:\n\u001b[32m 11\u001b[39m data = ds.flattened()\n", - "\u001b[36mFile 
\u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/src/lightningrod/training/samples.py:763\u001b[39m, in \u001b[36mfilter_and_split\u001b[39m\u001b[34m(dataset, test_size, split_strategy, test_start, drop_missing_context, days_to_resolution_range, random_state, filter_leaky_train, deduplicate_key_fn, verbose)\u001b[39m\n\u001b[32m 760\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m verbose:\n\u001b[32m 761\u001b[39m _print_stats(stats)\n\u001b[32m--> \u001b[39m\u001b[32m763\u001b[39m \u001b[43m_raise_if_unhealthy_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 764\u001b[39m \u001b[43m \u001b[49m\u001b[43mstats\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstats\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 765\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 766\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 767\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 768\u001b[39m \u001b[43m \u001b[49m\u001b[43mtest_start\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtest_start\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 769\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilter_leaky_train\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilter_leaky_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 770\u001b[39m \u001b[43m \u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdays_to_resolution_range\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 771\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m dataset.subset(train_ids), dataset.subset(test_ids)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Projects/lightningrod-python-sdk/src/lightningrod/training/samples.py:701\u001b[39m, in 
\u001b[36m_raise_if_unhealthy_split\u001b[39m\u001b[34m(stats, test_ids, split_strategy, test_size, test_start, filter_leaky_train, days_to_resolution_range)\u001b[39m\n\u001b[32m 695\u001b[39m test_tips.append(\n\u001b[32m 696\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIncrease the dataset size — with test_size=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_size\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m and only \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstats.dedup_kept\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m samples \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 697\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mafter filtering, the test set may round to zero.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 698\u001b[39m )\n\u001b[32m 699\u001b[39m msgs.append(\u001b[33m\"\u001b[39m\u001b[33mTips:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m + \u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m.join(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mt\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m test_tips))\n\u001b[32m--> \u001b[39m\u001b[32m701\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33m[filter_and_split] Unhealthy split detected.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m + \u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m.join(msgs))\n", - "\u001b[31mValueError\u001b[39m: [filter_and_split] Unhealthy split detected.\n\n23/23 train samples (100%) were removed for temporal leakage — the date_close or resolution_date of train questions extends into the test period.\n\nTips:\n - Use test_start=\"YYYY-MM-DD\" instead of test_size to set an explicit cutoff at least 60 days 
before your last question date, giving train questions room to resolve before the test window.\n - Tighten days_to_resolution_range — the current max of 60 days means train resolution dates extend far into the test window. Reducing it shrinks the bleed-over zone.\n - Generate more samples across a wider date range. With questions spread over a longer period, the temporal split cutoff moves far enough back that earlier questions resolve well before the test window.\n - Set filter_leaky_train=False to disable leakage removal. Only do this if you are confident the resolution dates do not reveal information that was unavailable at prediction time." + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sample_id</th>\n", + " <th>is_valid</th>\n", + " <th>question_text</th>\n", + " <th>date_close</th>\n", + " <th>event_date</th>\n", + " <th>resolution_criteria</th>\n", + " <th>prediction_date</th>\n", + " <th>label</th>\n", + " <th>answer_type</th>\n", + " <th>label_confidence</th>\n", + " <th>...</th>\n", + " <th>reasoning</th>\n", + " <th>answer_sources</th>\n", + " <th>seed_text</th>\n", + " <th>seed_url</th>\n", + " <th>seed_creation_date</th>\n", + " <th>seed_search_query</th>\n", + " <th>context</th>\n", + " <th>meta_sample_id</th>\n", + " <th>meta_parent_sample_id</th>\n", + " <th>meta_processing_time_ms</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>07511299-78d6-4020-8efe-d7b5b865f826</td>\n", + " <td>True</td>\n", + " <td>Will the 11th Circuit Court of Appeals issue a...</td>\n", + " <td>2025-02-15T00:00:00</td>\n", + " 
<td>2025-01-08T00:00:00</td>\n", + " <td>This question resolves to 'Yes' if the U.S. Co...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>1</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>On January 9, 2025, the U.S. Court of Appeals ...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", + " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>Donald Trump lawsuits and court rulings</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ...</td>\n", + " <td>59824a03-d075-4de4-bd89-17ec5f0651f1</td>\n", + " <td>da9d2197-889d-4fe4-ae4e-960ab3f9726f</td>\n", + " <td>16820.019</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>3618250c-9aa7-4e3e-b1f4-d59351e25415</td>\n", + " <td>True</td>\n", + " <td>Will the criminal charges against Carlos De Ol...</td>\n", + " <td>2025-03-05T00:00:00</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>This question resolves to 'Yes' if a federal c...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>1</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>The criminal charges against Carlos De Oliveir...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", + " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>Donald Trump lawsuits and court rulings</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ...</td>\n", + " <td>89a53581-5832-4797-9a76-4a85aefb8993</td>\n", + " <td>da9d2197-889d-4fe4-ae4e-960ab3f9726f</td>\n", + " <td>170592.880</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>6f5808f8-5fbe-40e3-a902-2959e0159960</td>\n", + " <td>True</td>\n", + " <td>Will Justice Juan Merchan sentence 
Donald Trum...</td>\n", + " <td>2025-03-01T00:00:00</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>This question resolves to 'Yes' if Justice Jua...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>The close date for this question is 2025-03-01...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", + " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>Donald Trump lawsuits and court rulings</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ...</td>\n", + " <td>da9d2197-889d-4fe4-ae4e-960ab3f9726f</td>\n", + " <td>b988692d-28ac-4e59-932f-089b30c1fdff</td>\n", + " <td>19099.822</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>811942a3-0ce0-4a23-8ccf-cb9b6b038a78</td>\n", + " <td>True</td>\n", + " <td>Will the full, unredacted Special Counsel repo...</td>\n", + " <td>2025-03-01T00:00:00</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>This question resolves to 'Yes' if the Departm...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>Special Counsel Jack Smith submitted a two-vol...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", + " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>Donald Trump lawsuits and court rulings</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ...</td>\n", + " <td>d53c18aa-e58f-47cc-ad8d-07284f3837b5</td>\n", + " <td>da9d2197-889d-4fe4-ae4e-960ab3f9726f</td>\n", + " <td>210318.406</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>abbac7a6-09fd-4d34-b588-0a3df04f2f37</td>\n", + 
" <td>True</td>\n", + " <td>Will Donald Trump grant a formal presidential ...</td>\n", + " <td>2025-02-28T00:00:00</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>This question resolves to 'Yes' if the White H...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>The close date for this question is 2025-02-28...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Situation: Ending the Trump Cases t...</td>\n", + " <td>https://www.lawfaremedia.org/article/the-situa...</td>\n", + " <td>2025-01-08T00:00:00</td>\n", + " <td>Donald Trump lawsuits and court rulings</td>\n", + " <td>[{'rendered_context': '', 'search_query': 'Tru...</td>\n", + " <td>c1f9b0fa-f364-4cda-a9ca-a41f1dcdcf3c</td>\n", + " <td>da9d2197-889d-4fe4-ae4e-960ab3f9726f</td>\n", + " <td>16623.478</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 21 columns</p>\n", + "</div>" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 07511299-78d6-4020-8efe-d7b5b865f826 True \n", + "1 3618250c-9aa7-4e3e-b1f4-d59351e25415 True \n", + "2 6f5808f8-5fbe-40e3-a902-2959e0159960 True \n", + "3 811942a3-0ce0-4a23-8ccf-cb9b6b038a78 True \n", + "4 abbac7a6-09fd-4d34-b588-0a3df04f2f37 True \n", + "\n", + " question_text date_close \\\n", + "0 Will the 11th Circuit Court of Appeals issue a... 2025-02-15T00:00:00 \n", + "1 Will the criminal charges against Carlos De Ol... 2025-03-05T00:00:00 \n", + "2 Will Justice Juan Merchan sentence Donald Trum... 2025-03-01T00:00:00 \n", + "3 Will the full, unredacted Special Counsel repo... 2025-03-01T00:00:00 \n", + "4 Will Donald Trump grant a formal presidential ... 2025-02-28T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2025-01-08T00:00:00 This question resolves to 'Yes' if the U.S. Co... \n", + "1 2025-01-08T00:00:00 This question resolves to 'Yes' if a federal c... 
\n", + "2 2025-01-08T00:00:00 This question resolves to 'Yes' if Justice Jua... \n", + "3 2025-01-08T00:00:00 This question resolves to 'Yes' if the Departm... \n", + "4 2025-01-08T00:00:00 This question resolves to 'Yes' if the White H... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2025-01-08T00:00:00 1 binary 1.00 ... \n", + "1 2025-01-08T00:00:00 1 binary 1.00 ... \n", + "2 2025-01-08T00:00:00 0 binary 1.00 ... \n", + "3 2025-01-08T00:00:00 0 binary 1.00 ... \n", + "4 2025-01-08T00:00:00 0 binary 0.95 ... \n", + "\n", + " reasoning \\\n", + "0 On January 9, 2025, the U.S. Court of Appeals ... \n", + "1 The criminal charges against Carlos De Oliveir... \n", + "2 The close date for this question is 2025-03-01... \n", + "3 Special Counsel Jack Smith submitted a two-vol... \n", + "4 The close date for this question is 2025-02-28... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... \n", + "1 https://vertexaisearch.cloud.google.com/ground... \n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Title: The Situation: Ending the Trump Cases t... \n", + "1 Title: The Situation: Ending the Trump Cases t... \n", + "2 Title: The Situation: Ending the Trump Cases t... \n", + "3 Title: The Situation: Ending the Trump Cases t... \n", + "4 Title: The Situation: Ending the Trump Cases t... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "1 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "2 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "3 https://www.lawfaremedia.org/article/the-situa... 2025-01-08T00:00:00 \n", + "4 https://www.lawfaremedia.org/article/the-situa... 
2025-01-08T00:00:00 \n", + "\n", + " seed_search_query \\\n", + "0 Donald Trump lawsuits and court rulings \n", + "1 Donald Trump lawsuits and court rulings \n", + "2 Donald Trump lawsuits and court rulings \n", + "3 Donald Trump lawsuits and court rulings \n", + "4 Donald Trump lawsuits and court rulings \n", + "\n", + " context \\\n", + "0 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ... \n", + "1 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ... \n", + "2 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Judge ... \n", + "3 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Trump ... \n", + "4 [{'rendered_context': '', 'search_query': 'Tru... \n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 59824a03-d075-4de4-bd89-17ec5f0651f1 da9d2197-889d-4fe4-ae4e-960ab3f9726f \n", + "1 89a53581-5832-4797-9a76-4a85aefb8993 da9d2197-889d-4fe4-ae4e-960ab3f9726f \n", + "2 da9d2197-889d-4fe4-ae4e-960ab3f9726f b988692d-28ac-4e59-932f-089b30c1fdff \n", + "3 d53c18aa-e58f-47cc-ad8d-07284f3837b5 da9d2197-889d-4fe4-ae4e-960ab3f9726f \n", + "4 c1f9b0fa-f364-4cda-a9ca-a41f1dcdcf3c da9d2197-889d-4fe4-ae4e-960ab3f9726f \n", + "\n", + " meta_processing_time_ms \n", + "0 16820.019 \n", + "1 170592.880 \n", + "2 19099.822 \n", + "3 210318.406 \n", + "4 16623.478 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "56\n", + "Test: 56 rows, 37.5% yes\n" ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " 
<th>sample_id</th>\n", + " <th>is_valid</th>\n", + " <th>question_text</th>\n", + " <th>date_close</th>\n", + " <th>event_date</th>\n", + " <th>resolution_criteria</th>\n", + " <th>prediction_date</th>\n", + " <th>label</th>\n", + " <th>answer_type</th>\n", + " <th>label_confidence</th>\n", + " <th>...</th>\n", + " <th>reasoning</th>\n", + " <th>answer_sources</th>\n", + " <th>seed_text</th>\n", + " <th>seed_url</th>\n", + " <th>seed_creation_date</th>\n", + " <th>seed_search_query</th>\n", + " <th>context</th>\n", + " <th>meta_sample_id</th>\n", + " <th>meta_parent_sample_id</th>\n", + " <th>meta_processing_time_ms</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>7960bd90-44c8-4cd0-bac0-3a17265101e5</td>\n", + " <td>True</td>\n", + " <td>Will Donald Trump announce a complete exemptio...</td>\n", + " <td>2025-12-01T00:00:00</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>The question is answered 'Yes' if the US Presi...</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>The close date for this question is 2025-12-01...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: 'We will get an even better deal,' Carn...</td>\n", + " <td>https://www.cbc.ca/news/politics/carney-even-b...</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>Donald Trump trade and tariff actions</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] U.S.-C...</td>\n", + " <td>ecd757d7-fa3c-47ec-bdc6-81cdf88c1b65</td>\n", + " <td>a3db9248-0a4f-4d51-81d1-7c21057856cf</td>\n", + " <td>12056.629</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>b562085f-b58d-4284-ae0b-fcfdeb90f90c</td>\n", + " <td>True</td>\n", + " <td>Will the United States and Canada sign a forma...</td>\n", + " <td>2025-12-01T00:00:00</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>A formal bilateral agreement or 
signed Memoran...</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>Based on the provided reports regarding the 20...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: 'We will get an even better deal,' Carn...</td>\n", + " <td>https://www.cbc.ca/news/politics/carney-even-b...</td>\n", + " <td>2025-10-08T00:00:00</td>\n", + " <td>Donald Trump trade and tariff actions</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Carney...</td>\n", + " <td>a3db9248-0a4f-4d51-81d1-7c21057856cf</td>\n", + " <td>993816e2-887d-4db0-99c7-f45e93776a8a</td>\n", + " <td>255122.468</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>63dc110c-6073-4f98-abcb-57b8933b8fbf</td>\n", + " <td>True</td>\n", + " <td>Will the Trump administration hold a new offsh...</td>\n", + " <td>2026-04-01T00:00:00</td>\n", + " <td>2025-11-26T00:00:00</td>\n", + " <td>The question resolves to 'Yes' if the Departme...</td>\n", + " <td>2025-11-26T00:00:00</td>\n", + " <td>1</td>\n", + " <td>binary</td>\n", + " <td>1.00</td>\n", + " <td>...</td>\n", + " <td>The Trump administration held the first new of...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: nytimes.com\\n\\nURL Source: https://www....</td>\n", + " <td>https://www.nytimes.com/2025/11/26/climate/tru...</td>\n", + " <td>2025-11-26T00:00:00</td>\n", + " <td>Donald Trump domestic policy agenda</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Interi...</td>\n", + " <td>87e57caa-9696-4eb8-a7ed-cd4060b0649e</td>\n", + " <td>b4bfbb84-63db-48d6-b63d-c809b20ed5f0</td>\n", + " <td>6996.226</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2581c035-5519-434f-aed1-c51326b11e73</td>\n", + " <td>True</td>\n", + " <td>Will Donald Trump and Javier Milei hold a join...</td>\n", + " <td>2026-01-01T00:00:00</td>\n", + " 
<td>2025-11-27T00:00:00</td>\n", + " <td>The question resolves to 'Yes' if Donald Trump...</td>\n", + " <td>2025-11-27T00:00:00</td>\n", + " <td>0</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>The close date for this question is 2026-01-01...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Paradox of Europe's Trumpian Right:...</td>\n", + " <td>https://www.foreignaffairs.com/europe/paradox-...</td>\n", + " <td>2025-11-27T00:00:00</td>\n", + " <td>Donald Trump domestic policy agenda</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Milei ...</td>\n", + " <td>3a88f43d-6a2c-422f-b7dd-5b6039a29375</td>\n", + " <td>7977998c-8ccb-444f-a382-1fcd8e322c23</td>\n", + " <td>11967.560</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3bd13f7e-25a8-40c0-8b2a-d9f77ef9f827</td>\n", + " <td>True</td>\n", + " <td>Will the United States official executive bran...</td>\n", + " <td>2026-01-20T00:00:00</td>\n", + " <td>2025-11-27T00:00:00</td>\n", + " <td>The question resolves to 'Yes' if the U.S. 
gov...</td>\n", + " <td>2025-11-27T00:00:00</td>\n", + " <td>1</td>\n", + " <td>binary</td>\n", + " <td>0.95</td>\n", + " <td>...</td>\n", + " <td>Between the question date (2025-11-27) and the...</td>\n", + " <td>https://vertexaisearch.cloud.google.com/ground...</td>\n", + " <td>Title: The Paradox of Europe's Trumpian Right:...</td>\n", + " <td>https://www.foreignaffairs.com/europe/paradox-...</td>\n", + " <td>2025-11-27T00:00:00</td>\n", + " <td>Donald Trump domestic policy agenda</td>\n", + " <td>[{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Presid...</td>\n", + " <td>d15a72f1-d8ac-4fc9-9bde-732514e9008c</td>\n", + " <td>7977998c-8ccb-444f-a382-1fcd8e322c23</td>\n", + " <td>66899.224</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 21 columns</p>\n", + "</div>" + ], + "text/plain": [ + " sample_id is_valid \\\n", + "0 7960bd90-44c8-4cd0-bac0-3a17265101e5 True \n", + "1 b562085f-b58d-4284-ae0b-fcfdeb90f90c True \n", + "2 63dc110c-6073-4f98-abcb-57b8933b8fbf True \n", + "3 2581c035-5519-434f-aed1-c51326b11e73 True \n", + "4 3bd13f7e-25a8-40c0-8b2a-d9f77ef9f827 True \n", + "\n", + " question_text date_close \\\n", + "0 Will Donald Trump announce a complete exemptio... 2025-12-01T00:00:00 \n", + "1 Will the United States and Canada sign a forma... 2025-12-01T00:00:00 \n", + "2 Will the Trump administration hold a new offsh... 2026-04-01T00:00:00 \n", + "3 Will Donald Trump and Javier Milei hold a join... 2026-01-01T00:00:00 \n", + "4 Will the United States official executive bran... 2026-01-20T00:00:00 \n", + "\n", + " event_date resolution_criteria \\\n", + "0 2025-10-08T00:00:00 The question is answered 'Yes' if the US Presi... \n", + "1 2025-10-08T00:00:00 A formal bilateral agreement or signed Memoran... \n", + "2 2025-11-26T00:00:00 The question resolves to 'Yes' if the Departme... \n", + "3 2025-11-27T00:00:00 The question resolves to 'Yes' if Donald Trump... \n", + "4 2025-11-27T00:00:00 The question resolves to 'Yes' if the U.S. 
gov... \n", + "\n", + " prediction_date label answer_type label_confidence ... \\\n", + "0 2025-10-08T00:00:00 0 binary 1.00 ... \n", + "1 2025-10-08T00:00:00 0 binary 0.95 ... \n", + "2 2025-11-26T00:00:00 1 binary 1.00 ... \n", + "3 2025-11-27T00:00:00 0 binary 0.95 ... \n", + "4 2025-11-27T00:00:00 1 binary 0.95 ... \n", + "\n", + " reasoning \\\n", + "0 The close date for this question is 2025-12-01... \n", + "1 Based on the provided reports regarding the 20... \n", + "2 The Trump administration held the first new of... \n", + "3 The close date for this question is 2026-01-01... \n", + "4 Between the question date (2025-11-27) and the... \n", + "\n", + " answer_sources \\\n", + "0 https://vertexaisearch.cloud.google.com/ground... \n", + "1 https://vertexaisearch.cloud.google.com/ground... \n", + "2 https://vertexaisearch.cloud.google.com/ground... \n", + "3 https://vertexaisearch.cloud.google.com/ground... \n", + "4 https://vertexaisearch.cloud.google.com/ground... \n", + "\n", + " seed_text \\\n", + "0 Title: 'We will get an even better deal,' Carn... \n", + "1 Title: 'We will get an even better deal,' Carn... \n", + "2 Title: nytimes.com\\n\\nURL Source: https://www.... \n", + "3 Title: The Paradox of Europe's Trumpian Right:... \n", + "4 Title: The Paradox of Europe's Trumpian Right:... \n", + "\n", + " seed_url seed_creation_date \\\n", + "0 https://www.cbc.ca/news/politics/carney-even-b... 2025-10-08T00:00:00 \n", + "1 https://www.cbc.ca/news/politics/carney-even-b... 2025-10-08T00:00:00 \n", + "2 https://www.nytimes.com/2025/11/26/climate/tru... 2025-11-26T00:00:00 \n", + "3 https://www.foreignaffairs.com/europe/paradox-... 2025-11-27T00:00:00 \n", + "4 https://www.foreignaffairs.com/europe/paradox-... 
2025-11-27T00:00:00 \n", + "\n", + " seed_search_query \\\n", + "0 Donald Trump trade and tariff actions \n", + "1 Donald Trump trade and tariff actions \n", + "2 Donald Trump domestic policy agenda \n", + "3 Donald Trump domestic policy agenda \n", + "4 Donald Trump domestic policy agenda \n", + "\n", + " context \\\n", + "0 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] U.S.-C... \n", + "1 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Carney... \n", + "2 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Interi... \n", + "3 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Milei ... \n", + "4 [{'rendered_context': '---\n", + "ARTICLES\n", + "[1] Presid... \n", + "\n", + " meta_sample_id meta_parent_sample_id \\\n", + "0 ecd757d7-fa3c-47ec-bdc6-81cdf88c1b65 a3db9248-0a4f-4d51-81d1-7c21057856cf \n", + "1 a3db9248-0a4f-4d51-81d1-7c21057856cf 993816e2-887d-4db0-99c7-f45e93776a8a \n", + "2 87e57caa-9696-4eb8-a7ed-cd4060b0649e b4bfbb84-63db-48d6-b63d-c809b20ed5f0 \n", + "3 3a88f43d-6a2c-422f-b7dd-5b6039a29375 7977998c-8ccb-444f-a382-1fcd8e322c23 \n", + "4 d15a72f1-d8ac-4fc9-9bde-732514e9008c 7977998c-8ccb-444f-a382-1fcd8e322c23 \n", + "\n", + " meta_processing_time_ms \n", + "0 12056.629 \n", + "1 255122.468 \n", + "2 6996.226 \n", + "3 11967.560 \n", + "4 66899.224 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -317,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "24376274", "metadata": {}, "outputs": [ @@ -325,9 +942,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Estimated cost: $0.32\n", - "Effective steps: 11\n", - "Train tokens: 1,073,959\n", + "Estimated cost: $0.18\n", + "Effective steps: 6\n", + "Train tokens: 617,089\n", "Notes: Estimate uses per-answer-type output token estimates; actual may vary\n" ] } @@ -358,7 +975,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": 
"a8660faf", "metadata": {}, "outputs": [ @@ -371,9 +988,9 @@ "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Job:</span> WWTD-2025 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Reward:</span> latest -0.8786 avg -0.6684 (11 steps) <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(higher is better)</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Reward:</span> latest -1.3368 avg -0.8626 (6 steps) <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">(higher is better)</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", - "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.18 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", + "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"font-weight: bold\">Cost:</span> $0.11 <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span> 
<span style=\"color: #0000ff; text-decoration-color: #0000ff\">│</span>\n", "<span style=\"color: #0000ff; text-decoration-color: #0000ff\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n", @@ -386,9 +1003,9 @@ "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[1mJob:\u001b[0m WWTD-2025 \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -0.8786 avg -0.6684 (11 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mReward:\u001b[0m latest -1.3368 avg -0.8626 (6 steps) \u001b[2m(higher is better)\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", - "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.18 \u001b[94m│\u001b[0m\n", + "\u001b[94m│\u001b[0m \u001b[1mCost:\u001b[0m $0.11 \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\n", "\u001b[94m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" @@ -401,8 +1018,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Job 13fa02ec-27f4-47a9-84c9-762d91a1904a completed with status: COMPLETED\n", - "Trained model ID: checkpoint:13fa02ec-27f4-47a9-84c9-762d91a1904a\n" + "Job 371e5ff4-ebf5-43af-8809-d26c14edb8e2 completed with status: COMPLETED\n", + "Trained model ID: checkpoint:371e5ff4-ebf5-43af-8809-d26c14edb8e2\n" ] } ], @@ -427,15 +1044,7 @@ "execution_count": null, "id": "f013b514", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "<answer>0.05</answer>\n" - ] - } - ], + "outputs": [], "source": [ "print(lr.predict(job.model_id, \"Will Trump impose 25% tariffs on all goods from Canada by February 1, 2027?\"))" ]