diff --git a/notebooks/e2e/model_consensus.ipynb b/notebooks/e2e/model_consensus.ipynb index 9f5ef13..e978e0c 100644 --- a/notebooks/e2e/model_consensus.ipynb +++ b/notebooks/e2e/model_consensus.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -77,10 +77,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": "from datetime import datetime\nfrom lightningrod import (\n NewsSeedGenerator,\n ForwardLookingQuestionGenerator,\n WebSearchLabeler,\n QuestionPipeline,\n NewsContextGenerator,\n QuestionRenderer,\n RolloutGenerator,\n RolloutScorer,\n BinaryAnswerType,\n open_router_model,\n)\n\n# Date range — adjust these to a period ~2-3 months in the past\nSTART_DATE = datetime(2025, 11, 1)\nEND_DATE = datetime(2025, 12, 1)\n\nseed_generator = NewsSeedGenerator(\n start_date=START_DATE,\n end_date=END_DATE,\n search_query=\"technology announcements\",\n)\n\nanswer_type = BinaryAnswerType()\n\nquestion_generator = ForwardLookingQuestionGenerator(\n instructions=\"Generate forward-looking yes/no questions about technology announcements. \"\n \"Questions should be clearly resolvable within 1-2 months.\",\n answer_type=answer_type,\n)\n\nlabeler = WebSearchLabeler(answer_type=answer_type)\n\nrenderer = QuestionRenderer(answer_type=answer_type)\n\nmodels = [\n open_router_model(\"openai/gpt-4.1-mini\"),\n open_router_model(\"anthropic/claude-sonnet-4\"),\n open_router_model(\"google/gemini-2.5-flash\"),\n]\n\ncontext_generator = NewsContextGenerator()\n\nrollout_generator = RolloutGenerator(models=models)\n\nscorer = RolloutScorer(answer_type=answer_type)\n\npipeline = QuestionPipeline(\n seed_generator=seed_generator,\n question_generator=question_generator,\n context_generators=[context_generator],\n labeler=labeler,\n renderer=renderer,\n rollout_generator=rollout_generator,\n scorer=scorer,\n)" + "source": [ + "from datetime import datetime\n", + "from lightningrod import (\n", + " NewsSeedGenerator,\n", + " ForwardLookingQuestionGenerator,\n", + " WebSearchLabeler,\n", + " QuestionPipeline,\n", + " NewsContextGenerator,\n", + " QuestionRenderer,\n", + " RolloutGenerator,\n", + " RolloutScorer,\n", + " BinaryAnswerType,\n", + " open_router_model,\n", + " lightningrod_model,\n", + ")\n", + "\n", + "# Date range — adjust these to a period ~2-3 months in the past\n", + "START_DATE = datetime(2025, 11, 6)\n", + "END_DATE = datetime(2026, 3, 1)\n", + "\n", + "seed_generator = NewsSeedGenerator(\n", + " start_date=START_DATE,\n", + " end_date=END_DATE,\n", + " search_query=\"technology announcements\",\n", + ")\n", + "\n", + "answer_type = BinaryAnswerType()\n", + "\n", + "question_generator = ForwardLookingQuestionGenerator(\n", + " instructions=\"Generate forward-looking yes/no questions about tech announcements. \"\n", + " \"Questions should be clearly resolvable within 1-2 months.\",\n", + " answer_type=answer_type,\n", + ")\n", + "\n", + "labeler = WebSearchLabeler(answer_type=answer_type)\n", + "\n", + "renderer = QuestionRenderer(answer_type=answer_type)\n", + "\n", + "models = [\n", + " open_router_model(\"openai/gpt-5.2\"),\n", + " open_router_model(\"anthropic/claude-sonnet-4.6\"),\n", + " open_router_model(\"google/gemini-3.1-pro-preview\"),\n", + " lightningrod_model(\"foresight-v3\"),\n", + "]\n", + "\n", + "context_generator = NewsContextGenerator()\n", + "\n", + "rollout_generator = RolloutGenerator(models=models)\n", + "\n", + "scorer = RolloutScorer(answer_type=answer_type)\n", + "\n", + "pipeline = QuestionPipeline(\n", + " seed_generator=seed_generator,\n", + " question_generator=question_generator,\n", + " context_generators=[context_generator],\n", + " labeler=labeler,\n", + " renderer=renderer,\n", + " rollout_generator=rollout_generator,\n", + " scorer=scorer,\n", + ")" + ] }, { "cell_type": "markdown", @@ -93,13 +153,38 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ │\n", + "│ >> Warning │\n", + "│ │\n", + "│ Estimated cost ($60.33) exceeds current balance ($10.31). Consider adding credits before running this job. │\n", + "│ │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", + "\n" + ], + "text/plain": [ + "\u001b[33m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[1;33m>> Warning\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m Estimated cost ($60.33) exceeds current balance ($10.31). Consider adding credits before running this job. \u001b[33m│\u001b[0m\n", + "\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", + "\u001b[33m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ebd1c29a14fc48669015ce5ef0c0e2e9", + "model_id": "b4e3e81c1ead4708902f6792c55ab49b", "version_major": 2, "version_minor": 0 }, @@ -122,7 +207,7 @@ } ], "source": [ - "dataset = lr.transforms.run(pipeline, max_questions=20, name=\"News Forecasting Benchmark\")" + "dataset = lr.transforms.run(pipeline, max_questions=600, name=\"News Forecasting Benchmark\")" ] }, { @@ -136,14 +221,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Generated 20 samples (80.0% valid)\n", + "Generated 120 samples (79.2% valid)\n", "\n" ] } @@ -167,16 +252,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Consensus: 6/16 questions have full agreement (38%)\n", - "Disagreement: 10/16 questions have models on opposite sides of 0.5\n", - "Mean spread: 0.400\n", + "Consensus: 58/95 questions have full agreement (61%)\n", + "Disagreement: 37/95 questions have models on opposite sides of 0.5\n", + "Mean spread: 0.308\n", "\n" ] }, @@ -205,215 +290,170 @@ "
95 rows × 8 columns
\n", "" ], "text/plain": [ " Question Label Spread Agree \\\n", - "0 Will the New Frontiers in Research Fund (NFRF)... 1 0.719 False \n", - "1 Will Autozi Internet Technology (AZI) complete... 1 0.670 False \n", - "2 Is the 13th Seoul Mediacity Biennale catalogue... 1 0.670 False \n", - "3 Will Mercury Ev-Tech Limited hold its 39th Ann... 0 0.620 False \n", - "4 Will Supermicro announce or list at least one ... 1 0.500 False \n", - "5 Will the Commonwealth Bank of Australia (CBA) ... 0 0.450 False \n", - "6 By January 31, 2026, will Trimble have officia... 1 0.420 False \n", - "7 Will the application period for Innovate UK's ... 0 0.400 True \n", - "8 Will the weekly U.S. initial jobless claims (s... 0 0.400 False \n", - "9 By December 31, 2025, will Google officially r... 1 0.400 False \n", - "10 Will Straumann Holding (SWX:STMN) report a yea... 0 0.250 True \n", - "11 Will the Northern Virginia Technology Council ... 0 0.250 True \n", - "12 Will Amazon Web Services (AWS) officially anno... 1 0.250 False \n", - "13 By December 31, 2025, will IBM officially anno... 1 0.200 True \n", - "14 Will the United States and South Korea formall... 0 0.100 True \n", - "15 By December 1, 2025, will AI-generated voice a... 1 0.100 True \n", + "0 Will Apple Inc. officially announce a new gene... 0 0.93 False \n", + "1 Will Apple Inc. announce or release a new gene... 0 0.87 False \n", + "2 Will the 'motorola signature' smartphone, anno... 1 0.76 False \n", + "3 Will the European Commission or European Parli... 0 0.70 False \n", + "4 By January 15, 2026, will Supermicro appear in... 1 0.67 False \n", + ".. ... ... ... ... \n", + "90 Will the Rwandan Ministry of ICT and Innovatio... 1 0.05 True \n", + "91 Will the solo exhibition of 'Calculating Empir... 1 0.05 True \n", + "92 Will the U.S. Attorney's Office for the Distri... 0 0.04 True \n", + "93 Will IBM publicly announce the deployment of a... 0 0.04 True \n", + "94 Will Robin Mooldijk remain an active employee ... 0 0.03 True \n", + "\n", + " gpt-5.2 claude-sonnet-4.6 gemini-3.1-pro-preview foresight-v3 \n", + "0 0.08 0.95 0.02 0.17 \n", + "1 0.90 0.04 0.03 0.12 \n", + "2 0.55 0.88 0.85 0.12 \n", + "3 0.18 0.35 0.88 0.22 \n", + "4 0.35 0.65 0.76 0.09 \n", + ".. ... ... ... ... \n", + "90 0.25 0.20 NaN 0.22 \n", + "91 0.93 0.90 0.95 0.93 \n", + "92 0.01 0.05 0.01 0.03 \n", + "93 0.05 0.03 0.01 0.04 \n", + "94 0.03 0.04 0.01 0.03 \n", "\n", - " gpt-4.1-mini claude-sonnet-4 gemini-2.5-flash \n", - "0 0.001 0.72 0.01 \n", - "1 0.050 0.72 0.50 \n", - "2 0.050 0.72 0.50 \n", - "3 0.600 0.72 0.10 \n", - "4 0.150 0.65 0.60 \n", - "5 0.200 0.65 0.60 \n", - "6 0.450 0.72 0.30 \n", - "7 0.900 0.65 0.50 \n", - "8 0.200 0.35 0.60 \n", - "9 0.200 0.35 0.60 \n", - "10 0.200 0.25 0.45 \n", - "11 0.600 0.72 0.85 \n", - "12 0.600 0.35 0.60 \n", - "13 0.700 0.75 0.90 \n", - "14 0.100 0.15 0.20 \n", - "15 0.150 0.05 0.05 " + "[95 rows x 8 columns]" ] }, - "execution_count": 7, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -455,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -492,36 +532,43 @@ " \n", " \n", "