From eace75cf875abe47fb9562aaa51a4ced7525bc9b Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Thu, 12 Mar 2026 15:44:14 +0100 Subject: [PATCH] use foresight-v3 in model consensus notebook --- notebooks/e2e/model_consensus.ipynb | 423 +++++++++++++++------------- src/lightningrod/__init__.py | 3 +- src/lightningrod/utils/models.py | 9 + 3 files changed, 246 insertions(+), 189 deletions(-) diff --git a/notebooks/e2e/model_consensus.ipynb b/notebooks/e2e/model_consensus.ipynb index 9f5ef13..e978e0c 100644 --- a/notebooks/e2e/model_consensus.ipynb +++ b/notebooks/e2e/model_consensus.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -77,10 +77,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": "from datetime import datetime\nfrom lightningrod import (\n NewsSeedGenerator,\n ForwardLookingQuestionGenerator,\n WebSearchLabeler,\n QuestionPipeline,\n NewsContextGenerator,\n QuestionRenderer,\n RolloutGenerator,\n RolloutScorer,\n BinaryAnswerType,\n open_router_model,\n)\n\n# Date range — adjust these to a period ~2-3 months in the past\nSTART_DATE = datetime(2025, 11, 1)\nEND_DATE = datetime(2025, 12, 1)\n\nseed_generator = NewsSeedGenerator(\n start_date=START_DATE,\n end_date=END_DATE,\n search_query=\"technology announcements\",\n)\n\nanswer_type = BinaryAnswerType()\n\nquestion_generator = ForwardLookingQuestionGenerator(\n instructions=\"Generate forward-looking yes/no questions about technology announcements. 
\"\n \"Questions should be clearly resolvable within 1-2 months.\",\n answer_type=answer_type,\n)\n\nlabeler = WebSearchLabeler(answer_type=answer_type)\n\nrenderer = QuestionRenderer(answer_type=answer_type)\n\nmodels = [\n open_router_model(\"openai/gpt-4.1-mini\"),\n open_router_model(\"anthropic/claude-sonnet-4\"),\n open_router_model(\"google/gemini-2.5-flash\"),\n]\n\ncontext_generator = NewsContextGenerator()\n\nrollout_generator = RolloutGenerator(models=models)\n\nscorer = RolloutScorer(answer_type=answer_type)\n\npipeline = QuestionPipeline(\n seed_generator=seed_generator,\n question_generator=question_generator,\n context_generators=[context_generator],\n labeler=labeler,\n renderer=renderer,\n rollout_generator=rollout_generator,\n scorer=scorer,\n)" + "source": [ + "from datetime import datetime\n", + "from lightningrod import (\n", + " NewsSeedGenerator,\n", + " ForwardLookingQuestionGenerator,\n", + " WebSearchLabeler,\n", + " QuestionPipeline,\n", + " NewsContextGenerator,\n", + " QuestionRenderer,\n", + " RolloutGenerator,\n", + " RolloutScorer,\n", + " BinaryAnswerType,\n", + " open_router_model,\n", + " lightningrod_model,\n", + ")\n", + "\n", + "# Date range — adjust these to a period ~2-3 months in the past\n", + "START_DATE = datetime(2025, 11, 6)\n", + "END_DATE = datetime(2026, 3, 1)\n", + "\n", + "seed_generator = NewsSeedGenerator(\n", + " start_date=START_DATE,\n", + " end_date=END_DATE,\n", + " search_query=\"technology announcements\",\n", + ")\n", + "\n", + "answer_type = BinaryAnswerType()\n", + "\n", + "question_generator = ForwardLookingQuestionGenerator(\n", + " instructions=\"Generate forward-looking yes/no questions about tech announcements. 
\"\n", + " \"Questions should be clearly resolvable within 1-2 months.\",\n", + " answer_type=answer_type,\n", + ")\n", + "\n", + "labeler = WebSearchLabeler(answer_type=answer_type)\n", + "\n", + "renderer = QuestionRenderer(answer_type=answer_type)\n", + "\n", + "models = [\n", + " open_router_model(\"openai/gpt-5.2\"),\n", + " open_router_model(\"anthropic/claude-sonnet-4.6\"),\n", + " open_router_model(\"google/gemini-3.1-pro-preview\"),\n", + " lightningrod_model(\"foresight-v3\"),\n", + "]\n", + "\n", + "context_generator = NewsContextGenerator()\n", + "\n", + "rollout_generator = RolloutGenerator(models=models)\n", + "\n", + "scorer = RolloutScorer(answer_type=answer_type)\n", + "\n", + "pipeline = QuestionPipeline(\n", + " seed_generator=seed_generator,\n", + " question_generator=question_generator,\n", + " context_generators=[context_generator],\n", + " labeler=labeler,\n", + " renderer=renderer,\n", + " rollout_generator=rollout_generator,\n", + " scorer=scorer,\n", + ")" + ] }, { "cell_type": "markdown", @@ -93,13 +153,38 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "                                                                                                                 \n",
+       "  >> Warning                                                                                                     \n",
+       "                                                                                                                 \n",
+       "  Estimated cost ($60.33) exceeds current balance ($10.31). Consider adding credits before running this job.     \n",
+       "                                                                                                                 \n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[33m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1;33m>> Warning\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m Estimated cost ($60.33) exceeds current balance ($10.31). Consider adding credits before running this job. \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ebd1c29a14fc48669015ce5ef0c0e2e9", + "model_id": "b4e3e81c1ead4708902f6792c55ab49b", "version_major": 2, "version_minor": 0 }, @@ -122,7 +207,7 @@ } ], "source": [ - "dataset = lr.transforms.run(pipeline, max_questions=20, name=\"News Forecasting Benchmark\")" + "dataset = lr.transforms.run(pipeline, max_questions=600, name=\"News Forecasting Benchmark\")" ] }, { @@ -136,14 +221,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Generated 20 samples (80.0% valid)\n", + "Generated 120 samples (79.2% valid)\n", "\n" ] } @@ -167,16 +252,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Consensus: 6/16 questions have full agreement (38%)\n", - "Disagreement: 10/16 questions have models on opposite sides of 0.5\n", - "Mean spread: 0.400\n", + "Consensus: 58/95 questions have full agreement (61%)\n", + "Disagreement: 37/95 questions have models on opposite sides of 0.5\n", + "Mean spread: 0.308\n", "\n" ] }, @@ -205,215 
+290,170 @@ " Label\n", " Spread\n", " Agree\n", - " gpt-4.1-mini\n", - " claude-sonnet-4\n", - " gemini-2.5-flash\n", + " gpt-5.2\n", + " claude-sonnet-4.6\n", + " gemini-3.1-pro-preview\n", + " foresight-v3\n", " \n", " \n", " \n", " \n", " 0\n", - " Will the New Frontiers in Research Fund (NFRF)...\n", - " 1\n", - " 0.719\n", + " Will Apple Inc. officially announce a new gene...\n", + " 0\n", + " 0.93\n", " False\n", - " 0.001\n", - " 0.72\n", - " 0.01\n", + " 0.08\n", + " 0.95\n", + " 0.02\n", + " 0.17\n", " \n", " \n", " 1\n", - " Will Autozi Internet Technology (AZI) complete...\n", - " 1\n", - " 0.670\n", + " Will Apple Inc. announce or release a new gene...\n", + " 0\n", + " 0.87\n", " False\n", - " 0.050\n", - " 0.72\n", - " 0.50\n", + " 0.90\n", + " 0.04\n", + " 0.03\n", + " 0.12\n", " \n", " \n", " 2\n", - " Is the 13th Seoul Mediacity Biennale catalogue...\n", + " Will the 'motorola signature' smartphone, anno...\n", " 1\n", - " 0.670\n", + " 0.76\n", " False\n", - " 0.050\n", - " 0.72\n", - " 0.50\n", + " 0.55\n", + " 0.88\n", + " 0.85\n", + " 0.12\n", " \n", " \n", " 3\n", - " Will Mercury Ev-Tech Limited hold its 39th Ann...\n", + " Will the European Commission or European Parli...\n", " 0\n", - " 0.620\n", + " 0.70\n", " False\n", - " 0.600\n", - " 0.72\n", - " 0.10\n", + " 0.18\n", + " 0.35\n", + " 0.88\n", + " 0.22\n", " \n", " \n", " 4\n", - " Will Supermicro announce or list at least one ...\n", + " By January 15, 2026, will Supermicro appear in...\n", " 1\n", - " 0.500\n", + " 0.67\n", " False\n", - " 0.150\n", + " 0.35\n", " 0.65\n", - " 0.60\n", + " 0.76\n", + " 0.09\n", " \n", " \n", - " 5\n", - " Will the Commonwealth Bank of Australia (CBA) ...\n", - " 0\n", - " 0.450\n", - " False\n", - " 0.200\n", - " 0.65\n", - " 0.60\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 6\n", - " By January 31, 2026, will Trimble have officia...\n", + " 90\n", + " Will the 
Rwandan Ministry of ICT and Innovatio...\n", " 1\n", - " 0.420\n", - " False\n", - " 0.450\n", - " 0.72\n", - " 0.30\n", - " \n", - " \n", - " 7\n", - " Will the application period for Innovate UK's ...\n", - " 0\n", - " 0.400\n", + " 0.05\n", " True\n", - " 0.900\n", - " 0.65\n", - " 0.50\n", - " \n", - " \n", - " 8\n", - " Will the weekly U.S. initial jobless claims (s...\n", - " 0\n", - " 0.400\n", - " False\n", - " 0.200\n", - " 0.35\n", - " 0.60\n", + " 0.25\n", + " 0.20\n", + " NaN\n", + " 0.22\n", " \n", " \n", - " 9\n", - " By December 31, 2025, will Google officially r...\n", + " 91\n", + " Will the solo exhibition of 'Calculating Empir...\n", " 1\n", - " 0.400\n", - " False\n", - " 0.200\n", - " 0.35\n", - " 0.60\n", - " \n", - " \n", - " 10\n", - " Will Straumann Holding (SWX:STMN) report a yea...\n", - " 0\n", - " 0.250\n", + " 0.05\n", " True\n", - " 0.200\n", - " 0.25\n", - " 0.45\n", + " 0.93\n", + " 0.90\n", + " 0.95\n", + " 0.93\n", " \n", " \n", - " 11\n", - " Will the Northern Virginia Technology Council ...\n", + " 92\n", + " Will the U.S. 
Attorney's Office for the Distri...\n", " 0\n", - " 0.250\n", - " True\n", - " 0.600\n", - " 0.72\n", - " 0.85\n", - " \n", - " \n", - " 12\n", - " Will Amazon Web Services (AWS) officially anno...\n", - " 1\n", - " 0.250\n", - " False\n", - " 0.600\n", - " 0.35\n", - " 0.60\n", - " \n", - " \n", - " 13\n", - " By December 31, 2025, will IBM officially anno...\n", - " 1\n", - " 0.200\n", + " 0.04\n", " True\n", - " 0.700\n", - " 0.75\n", - " 0.90\n", + " 0.01\n", + " 0.05\n", + " 0.01\n", + " 0.03\n", " \n", " \n", - " 14\n", - " Will the United States and South Korea formall...\n", + " 93\n", + " Will IBM publicly announce the deployment of a...\n", " 0\n", - " 0.100\n", + " 0.04\n", " True\n", - " 0.100\n", - " 0.15\n", - " 0.20\n", + " 0.05\n", + " 0.03\n", + " 0.01\n", + " 0.04\n", " \n", " \n", - " 15\n", - " By December 1, 2025, will AI-generated voice a...\n", - " 1\n", - " 0.100\n", + " 94\n", + " Will Robin Mooldijk remain an active employee ...\n", + " 0\n", + " 0.03\n", " True\n", - " 0.150\n", - " 0.05\n", - " 0.05\n", + " 0.03\n", + " 0.04\n", + " 0.01\n", + " 0.03\n", " \n", " \n", "\n", + "

95 rows × 8 columns

\n", "" ], "text/plain": [ " Question Label Spread Agree \\\n", - "0 Will the New Frontiers in Research Fund (NFRF)... 1 0.719 False \n", - "1 Will Autozi Internet Technology (AZI) complete... 1 0.670 False \n", - "2 Is the 13th Seoul Mediacity Biennale catalogue... 1 0.670 False \n", - "3 Will Mercury Ev-Tech Limited hold its 39th Ann... 0 0.620 False \n", - "4 Will Supermicro announce or list at least one ... 1 0.500 False \n", - "5 Will the Commonwealth Bank of Australia (CBA) ... 0 0.450 False \n", - "6 By January 31, 2026, will Trimble have officia... 1 0.420 False \n", - "7 Will the application period for Innovate UK's ... 0 0.400 True \n", - "8 Will the weekly U.S. initial jobless claims (s... 0 0.400 False \n", - "9 By December 31, 2025, will Google officially r... 1 0.400 False \n", - "10 Will Straumann Holding (SWX:STMN) report a yea... 0 0.250 True \n", - "11 Will the Northern Virginia Technology Council ... 0 0.250 True \n", - "12 Will Amazon Web Services (AWS) officially anno... 1 0.250 False \n", - "13 By December 31, 2025, will IBM officially anno... 1 0.200 True \n", - "14 Will the United States and South Korea formall... 0 0.100 True \n", - "15 By December 1, 2025, will AI-generated voice a... 1 0.100 True \n", + "0 Will Apple Inc. officially announce a new gene... 0 0.93 False \n", + "1 Will Apple Inc. announce or release a new gene... 0 0.87 False \n", + "2 Will the 'motorola signature' smartphone, anno... 1 0.76 False \n", + "3 Will the European Commission or European Parli... 0 0.70 False \n", + "4 By January 15, 2026, will Supermicro appear in... 1 0.67 False \n", + ".. ... ... ... ... \n", + "90 Will the Rwandan Ministry of ICT and Innovatio... 1 0.05 True \n", + "91 Will the solo exhibition of 'Calculating Empir... 1 0.05 True \n", + "92 Will the U.S. Attorney's Office for the Distri... 0 0.04 True \n", + "93 Will IBM publicly announce the deployment of a... 0 0.04 True \n", + "94 Will Robin Mooldijk remain an active employee ... 
0 0.03 True \n", + "\n", + " gpt-5.2 claude-sonnet-4.6 gemini-3.1-pro-preview foresight-v3 \n", + "0 0.08 0.95 0.02 0.17 \n", + "1 0.90 0.04 0.03 0.12 \n", + "2 0.55 0.88 0.85 0.12 \n", + "3 0.18 0.35 0.88 0.22 \n", + "4 0.35 0.65 0.76 0.09 \n", + ".. ... ... ... ... \n", + "90 0.25 0.20 NaN 0.22 \n", + "91 0.93 0.90 0.95 0.93 \n", + "92 0.01 0.05 0.01 0.03 \n", + "93 0.05 0.03 0.01 0.04 \n", + "94 0.03 0.04 0.01 0.03 \n", "\n", - " gpt-4.1-mini claude-sonnet-4 gemini-2.5-flash \n", - "0 0.001 0.72 0.01 \n", - "1 0.050 0.72 0.50 \n", - "2 0.050 0.72 0.50 \n", - "3 0.600 0.72 0.10 \n", - "4 0.150 0.65 0.60 \n", - "5 0.200 0.65 0.60 \n", - "6 0.450 0.72 0.30 \n", - "7 0.900 0.65 0.50 \n", - "8 0.200 0.35 0.60 \n", - "9 0.200 0.35 0.60 \n", - "10 0.200 0.25 0.45 \n", - "11 0.600 0.72 0.85 \n", - "12 0.600 0.35 0.60 \n", - "13 0.700 0.75 0.90 \n", - "14 0.100 0.15 0.20 \n", - "15 0.150 0.05 0.05 " + "[95 rows x 8 columns]" ] }, - "execution_count": 7, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -455,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -492,36 +532,43 @@ " \n", " \n", " \n", - " openai/gpt-4.1-mini\n", - " -0.443781\n", - " 1.0\n", - " 16\n", + " openai/gpt-5.2\n", + " -0.230492\n", + " 1.000000\n", + " 95\n", + " \n", + " \n", + " anthropic/claude-sonnet-4.6\n", + " -0.222194\n", + " 1.000000\n", + " 95\n", " \n", " \n", - " anthropic/claude-sonnet-4\n", - " -0.270962\n", - " 1.0\n", - " 16\n", + " google/gemini-3.1-pro-preview\n", + " -0.279008\n", + " 0.978947\n", + " 95\n", " \n", " \n", - " google/gemini-2.5-flash\n", - " -0.331725\n", - " 1.0\n", - " 16\n", + " LightningRodLabs/foresight-v3\n", + " -0.298932\n", + " 1.000000\n", + " 95\n", " \n", " \n", "\n", "" ], "text/plain": [ - " mean_reward parse_rate n_total\n", - "model \n", - "openai/gpt-4.1-mini -0.443781 1.0 16\n", - "anthropic/claude-sonnet-4 -0.270962 1.0 16\n", - 
"google/gemini-2.5-flash -0.331725 1.0 16" + " mean_reward parse_rate n_total\n", + "model \n", + "openai/gpt-5.2 -0.230492 1.000000 95\n", + "anthropic/claude-sonnet-4.6 -0.222194 1.000000 95\n", + "google/gemini-3.1-pro-preview -0.279008 0.978947 95\n", + "LightningRodLabs/foresight-v3 -0.298932 1.000000 95" ] }, - "execution_count": 8, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -550,9 +597,9 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python (lightningrod-sdk)", "language": "python", - "name": "python3" + "name": "lightningrod-sdk" }, "language_info": { "codemirror_mode": { @@ -564,9 +611,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/src/lightningrod/__init__.py b/src/lightningrod/__init__.py index f7e1a45..201f4ae 100644 --- a/src/lightningrod/__init__.py +++ b/src/lightningrod/__init__.py @@ -8,7 +8,7 @@ from lightningrod.datasets.dataset import Dataset from lightningrod import preprocessing, utils from lightningrod.utils.sample import create_sample -from lightningrod.utils.models import open_router_model +from lightningrod.utils.models import open_router_model, lightningrod_model from lightningrod import preprocessing, training, utils from lightningrod.training import to_messages from lightningrod._generated.models import ( @@ -81,6 +81,7 @@ "QuestionRenderer", "create_sample", "open_router_model", + "lightningrod_model", "render_sample", "Rollout", "RolloutScorer", diff --git a/src/lightningrod/utils/models.py b/src/lightningrod/utils/models.py index 9c1d61f..16a96fd 100644 --- a/src/lightningrod/utils/models.py +++ b/src/lightningrod/utils/models.py @@ -11,3 +11,12 @@ def open_router_model(model_name: str) -> ModelConfig: model_source=ModelSourceType.OPEN_ROUTER, use_pipeline_key=True, ) + +def 
lightningrod_model(model_name: str = "foresight-v3") -> ModelConfig: + """Build a ModelConfig for the Lightning Rod-hosted model ``LightningRodLabs/<model_name>``.""" + return ModelConfig( + model_name=f"LightningRodLabs/{model_name}", + model_source=ModelSourceType.VLLM, + reasoning_effort="high", + is_lightningrod_model=True, + ) \ No newline at end of file