class AutoResolver(ABC):
    """
    Base class for automatic question resolvers.

    An auto resolver is given a Metaculus question and aggregates research on:
      - whether the question has resolved or not
      - whether it has resolved conclusively or not
      - what conclusion it has reached

    ``resolve_question`` returns ``None`` when no conclusive resolution can
    be determined.
    """

    @abstractmethod
    def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]:
        """Return the question's resolution, or ``None`` if inconclusive."""
        raise NotImplementedError()


class CommunityForecastResolver(AutoResolver):
    """
    Resolves a question when the community forecast has reached a consensus.
    Only binary questions are supported at the moment.

    This should not be used alone: there can be extremely slim-chance
    theoretical questions (e.g. "will a meteor hit the earth in the next
    year?") whose community forecast sits near an extreme even though the
    question has not actually resolved.

    It is also unable to determine whether a question should resolve as
    annulled or ambiguous.
    """

    def __init__(self, binary_threshold: float = 1, mc_threshold: float = 1):
        """
        Args:
            binary_threshold: how close (in percentage points) the community
                forecast must be to 0 or 100 to count as consensus.
                NOTE(review): the comparison below assumes
                ``community_prediction_at_access_time`` is on a 0-100 scale --
                confirm against the client, which may report a 0-1 probability.
            mc_threshold: reserved for future multiple-choice support (unused).

        Raises:
            TypeError: if ``binary_threshold`` is not a number. (Previously an
                invalid threshold was silently swallowed at resolution time,
                making every question look unresolvable.)
        """
        if not isinstance(binary_threshold, (int, float)):
            raise TypeError("binary_threshold must be an int or float")
        self.binary_threshold = binary_threshold
        # Deprecated misspelled alias kept so any existing readers of the old
        # attribute name keep working; prefer ``binary_threshold``.
        self.binary_theshold = binary_threshold
        self.mc_threshold = mc_threshold

    # Bug fix: this concrete override was decorated with @abstractmethod,
    # which made CommunityForecastResolver itself abstract and therefore
    # impossible to instantiate. The decorator has been removed.
    def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]:
        """Refresh the question from Metaculus and attempt to resolve it.

        Returns None for unsupported question types and for binary questions
        without a consensus.
        """
        # Re-fetch so the community prediction reflects the current state.
        question = MetaculusClient.get_question_by_post_id(question.id_of_post)
        if isinstance(question, BinaryQuestion):
            return self._resolve_binary_question(question)
        return None

    def _resolve_binary_question(
        self, question: BinaryQuestion
    ) -> Optional[BinaryResolution]:
        """Resolve True/False when the community forecast is within
        ``binary_threshold`` of an extreme; otherwise return ``None``."""
        prediction = question.community_prediction_at_access_time
        if prediction is None:
            return None
        if prediction + self.binary_threshold >= 100:
            return True
        if prediction - self.binary_threshold <= 0:
            return False
        return None
"async_lru-2.0.5.tar.gz", hash = "sha256:481d52ccdd27275f42c43a928b4a50c3bfb2d67af4e78b170e3e0bb39c66e5bb"}, ] -[[package]] -name = "asyncio" -version = "4.0.0" -description = "Deprecated backport of asyncio; use the stdlib package instead" -optional = false -python-versions = ">=3.4" -groups = ["main"] -files = [ - {file = "asyncio-4.0.0-py3-none-any.whl", hash = "sha256:c1eddb0659231837046809e68103969b2bef8b0400d59cfa6363f6b5ed8cc88b"}, - {file = "asyncio-4.0.0.tar.gz", hash = "sha256:570cd9e50db83bc1629152d4d0b7558d6451bb1bfd5dfc2e935d96fc2f40329b"}, -] - [[package]] name = "attrs" version = "25.4.0" @@ -6579,4 +6567,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "2a96003391909665f518e8cab4863d7e64b456444804f1441830ad51b393750b" +content-hash = "2a47fec44b5f12100f2aaee440bfe0bdbd1fb8b73b6b4e28d167f92f6bdf7457" From 7e0551c6cfefed74d2da7453607a07de949ab954 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sat, 21 Feb 2026 17:00:55 -0700 Subject: [PATCH 04/14] Stashing changes for creating perplexity auto resolver --- .../auto_resolver/__init__.py | 20 +- .../agents_and_tools/auto_resolver/assess.py | 207 +++++++ .../auto_resolver/openai_resolver.py | 541 ++++++++++++++++++ .../auto_resolver/resolution_models.py | 36 ++ 4 files changed, 795 insertions(+), 9 deletions(-) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/assess.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py diff --git a/forecasting_tools/agents_and_tools/auto_resolver/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/__init__.py index d8ea4569..273fbc6a 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/__init__.py @@ -11,15 +11,18 @@ class AutoResolver(ABC): """ - Auto resolvers are provided a metaculus 
question, and aggregate research on - - whether they have resolved or not - - whether they have resolved conclusively or not - - what conclusion they have reaches + Auto resolvers are provided a metaculus question, and return a resolution if it was able to + conclusively resolve a question. + + It should be noted that "ambiguous" and "annulled" resolutions + ARE conclusive (as they are a final resolution). An inconclusive, or null, resolution, means + that the deciding event or deadline has not occured, or there is presently not enough + information for the resolver to come to a conclusion. """ @abstractmethod - def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]: - pass + async def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]: + raise NotImplementedError() class CommunityForecastResolver(AutoResolver): """ @@ -37,13 +40,13 @@ def __init__(self, binary_threshold: float = 1, mc_threshold: float = 1): self.mc_threshold = mc_threshold @abstractmethod - def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]: + async def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]: # Update the question question = MetaculusClient.get_question_by_post_id(question.id_of_post) if isinstance(question, BinaryQuestion): return self._resolve_binary_question(question) else: - return None + return NotImplemented def _resolve_binary_question(self, question: BinaryQuestion) -> Optional[BinaryResolution]: @@ -55,4 +58,3 @@ def _resolve_binary_question(self, question: BinaryQuestion) -> Optional[BinaryR return False else: return None - diff --git a/forecasting_tools/agents_and_tools/auto_resolver/assess.py b/forecasting_tools/agents_and_tools/auto_resolver/assess.py new file mode 100644 index 00000000..bf89a624 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/assess.py @@ -0,0 +1,207 @@ +from forecasting_tools.agents_and_tools.auto_resolver import 
AutoResolver +import asyncio +from forecasting_tools.data_models.questions import QuestionBasicType, BinaryResolution, CanceledResolution +from forecasting_tools.helpers.metaculus_client import MetaculusClient +from forecasting_tools import MetaculusQuestion, ApiFilter, BinaryQuestion +from dataclasses import dataclass, field + +@dataclass +class ResolutionAssessmentReport: + """ + Table for binary assessment (y-axis is true value, x-axis is predicted value): + + | True | False | Unresolvable | Cancelled + ------|------|-------|--------------|---------- + True | n_tp | n_fn | n_mp | n_ct + False | n_fp | n_tn | n_mn | n_cf + Cancelled | n_tc | n_fc | n_mc | n_cc + """ + + tp: list[int] = field(default_factory=list) + fp: list[int] = field(default_factory=list) + fn: list[int] = field(default_factory=list) + tn: list[int] = field(default_factory=list) + mp: list[int] = field(default_factory=list) + mn: list[int] = field(default_factory=list) + ct: list[int] = field(default_factory=list) # True actual, Cancelled predicted + cf: list[int] = field(default_factory=list) # False actual, Cancelled predicted + tc: list[int] = field(default_factory=list) # Cancelled actual, True predicted + fc: list[int] = field(default_factory=list) # Cancelled actual, False predicted + mc: list[int] = field(default_factory=list) # Cancelled actual, Unresolvable predicted + cc: list[int] = field(default_factory=list) # Cancelled actual, Cancelled predicted + + @property + def n_tp(self) -> int: + return len(self.tp) + + @property + def n_fp(self) -> int: + return len(self.fp) + + @property + def n_fn(self) -> int: + return len(self.fn) + + @property + def n_tn(self) -> int: + return len(self.tn) + + @property + def n_mp(self) -> int: + return len(self.mp) + + @property + def n_mn(self) -> int: + return len(self.mn) + + @property + def n_ct(self) -> int: + return len(self.ct) + + @property + def n_cf(self) -> int: + return len(self.cf) + + @property + def n_tc(self) -> int: + return 
len(self.tc) + + @property + def n_fc(self) -> int: + return len(self.fc) + + @property + def n_mc(self) -> int: + return len(self.mc) + + @property + def n_cc(self) -> int: + return len(self.cc) + + def binary_results_table(self) -> str: + """ + Returns a markdown table representation of the binary assessment report. + + Returns: + str: A markdown formatted confusion matrix table + """ + return f"""\ +| Actual \\ Predicted | True | False | Unresolvable | Cancelled | +|--------------------|------|-------|--------------|-----------| +| True | {str(self.n_tp).rjust(3)} | {str(self.n_fn).rjust(3)} | {str(self.n_mp).rjust(3)} | {str(self.n_ct).rjust(3)} | +| False | {str(self.n_fp).rjust(3)} | {str(self.n_tn).rjust(3)} | {str(self.n_mn).rjust(3)} | {str(self.n_cf).rjust(3)} | +| Cancelled | {str(self.n_tc).rjust(3)} | {str(self.n_fc).rjust(3)} | {str(self.n_mc).rjust(3)} | {str(self.n_cc).rjust(3)} |""" + + + def __str__(self): + return self.binary_results_table() + +class ResolutionAssesser: + """ + Utility for assessing how an auto resolver behaves on a set of already resolved questions. 
+ """ + + def __init__(self, resolver: AutoResolver, allowed_types: list[QuestionBasicType], questions: list[int | str] = [], tournaments: list[int | str] = []): + self.client = MetaculusClient() + self.questions: dict[int, MetaculusQuestion] = {} + self.allowed_types = allowed_types + self.resolver = resolver + self.tournament_ids = tournaments + self.question_ids = questions + + + def _insert_question(self, question: MetaculusQuestion): + if question.actual_resolution_time is None: + raise ValueError("Question is not yet resolved") + if question.get_question_type() not in self.allowed_types: + raise ValueError("Question is not of allowed type") + if question.id_of_post is None: + raise ValueError("Question does not have a post id") + self.questions[question.id_of_post] = question + + def _load_question(self, question: int | str): + loaded = None + if type(question) is int: + loaded = self.client.get_question_by_post_id(question) + elif type(question) is str: + loaded = self.client.get_question_by_url(question) + else: + return NotImplemented + + if loaded is None or not isinstance(loaded, MetaculusQuestion): + raise ValueError("unable to find question") + + self._insert_question(loaded) + + def _load_questions(self, questions: list[int | str]): + for question in questions: + try: + self._load_question(question) + except ValueError: + continue + + async def _load_tournaments(self, tournament_ids: list[str | int]): + filter = ApiFilter( + allowed_tournaments=tournament_ids, + allowed_statuses=["resolved"], + allowed_types=self.allowed_types, + group_question_mode="exclude", + order_by="-published_time" + ) + + questions: list[MetaculusQuestion] = await self.client.get_questions_matching_filter(filter) + + for question in questions: + try: + self._insert_question(question) + except ValueError: + continue + + async def _load_tournament(self, tournament_id: str | int): + await self._load_tournaments([tournament_id]) + + async def assess_resolver(self) -> 
ResolutionAssessmentReport: + # Load tournaments if not already loaded + if self.tournament_ids: + await self._load_tournaments(self.tournament_ids) + if self.question_ids: + self._load_questions(self.question_ids) + report = ResolutionAssessmentReport() + for question_id, question in self.questions.items(): + true_resolution = question.typed_resolution + test_resolution = await self.resolver.resolve_question(question) + if isinstance(true_resolution, BinaryResolution): + # Check if test_resolution is CanceledResolution + test_is_cancelled = isinstance(test_resolution, CanceledResolution) + true_is_cancelled = isinstance(true_resolution, CanceledResolution) + + match true_resolution, test_resolution: + # True actual resolution cases + case True, True: + report.tp.append(question_id) + case True, False: + report.fn.append(question_id) + case True, None: + report.mp.append(question_id) + # False actual resolution cases + case False, True: + report.fp.append(question_id) + case False, False: + report.tn.append(question_id) + case False, None: + report.mn.append(question_id) + # Cancelled actual resolution cases + case CanceledResolution(), True: + report.tc.append(question_id) + case CanceledResolution(), False: + report.fc.append(question_id) + case CanceledResolution(), None: + report.mc.append(question_id) + case CanceledResolution(), CanceledResolution(): + report.cc.append(question_id) + # True/False actual with Cancelled predicted + case True, CanceledResolution(): + report.ct.append(question_id) + case False, CanceledResolution(): + report.cf.append(question_id) + return report diff --git a/forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py b/forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py new file mode 100644 index 00000000..22de8656 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py @@ -0,0 +1,541 @@ +"""Multi-agent question resolver using OpenAI Agents SDK. 
+ +This module implements a multi-agent architecture for resolving Metaculus +forecasting questions. It uses a pluggable researcher pattern with a default +Perplexity-based implementation. + +Architecture: + 1. Orchestrator (minimal) coordinates handoffs between agents + 2. Researcher agent performs multiple strategic searches + 3. Resolver agent analyzes research and determines resolution + 4. Structured output parsing converts to typed resolution +""" + +import logging +from typing import Optional, Callable + +from forecasting_tools.data_models.questions import ( + ResolutionType, + CanceledResolution, + BinaryResolution, +) +from forecasting_tools import MetaculusQuestion, BinaryQuestion +from forecasting_tools.agents_and_tools.auto_resolver import AutoResolver +from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import ( + BinaryResolutionResult, +) +from forecasting_tools.agents_and_tools.minor_tools import ( + perplexity_reasoning_pro_search, +) +from forecasting_tools.ai_models.agent_wrappers import ( + AgentRunner, + AgentSdkLlm, + AiAgent, +) +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.helpers.structure_output import structure_output +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + + +class StructuredOutputParsingError(Exception): + """Raised when structured output parsing fails. + + This exception preserves the raw unparsed output for debugging purposes. 
+ + Attributes: + raw_output: The unparsed output that failed parsing + original_error: The original exception that caused the failure + """ + + def __init__(self, raw_output: str, original_error: Exception): + self.raw_output = raw_output + self.original_error = original_error + super().__init__( + f"Failed to parse structured output: {original_error}\n" + f"Raw output available in exception.raw_output" + ) + + +class OpenAIResolver(AutoResolver): + """Multi-agent resolver using OpenAI Agents SDK. + + This resolver uses a multi-agent architecture to resolve Metaculus questions: + 1. A minimal orchestrator coordinates the workflow + 2. A researcher agent performs multiple Perplexity searches + 3. A resolver agent analyzes the research and determines the resolution + 4. Structured output parsing converts the result to a typed resolution + + The researcher agent is pluggable, allowing for custom implementations. + + Attributes: + model: LLM model identifier for the agents + researcher_agent_factory: Factory function to create researcher agents + structure_output_model: Model used for structured output parsing + timeout: Timeout for LLM calls in seconds + """ + + def __init__( + self, + model: str = "openrouter/anthropic/claude-sonnet-4", + researcher_agent_factory: Optional[Callable[[BinaryQuestion], AiAgent]] = None, + structure_output_model: Optional[GeneralLlm] = None, + timeout: int = 480, + ): + """Initialize OpenAI Resolver. + + Args: + model: LLM model for agents (default: claude-sonnet-4) + researcher_agent_factory: Optional factory function that takes a + BinaryQuestion and returns a custom researcher AiAgent. + If None, uses default Perplexity-based researcher. + structure_output_model: Model for structured output parsing. + If None, uses gpt-5.2 mini with low temperature. 
+ timeout: Timeout for LLM calls in seconds + """ + self.model = model + self.researcher_agent_factory = ( + researcher_agent_factory or self._create_default_researcher + ) + self.structure_output_model = structure_output_model or GeneralLlm( + "openrouter/openai/gpt-5.2", + temperature=0.2, + timeout=timeout, + ) + self.timeout = timeout + logger.info( + f"Initialized OpenAIResolver with model={model}, timeout={timeout}s" + ) + + async def resolve_question( + self, question: MetaculusQuestion + ) -> Optional[ResolutionType]: + """Resolve a Metaculus question. + + Args: + question: The question to resolve + + Returns: + Typed resolution or None if not yet resolvable + + Raises: + StructuredOutputParsingError: If output parsing fails (includes raw output) + Other exceptions: Propagated from agent execution + """ + logger.info( + f"Starting resolution for question {question.id_of_post}: " + f"{question.question_text[:100]}..." + ) + + if isinstance(question, BinaryQuestion): + return await self._resolve_binary(question) + else: + logger.warning(f"Question type {type(question)} not yet supported") + return NotImplemented + + async def _resolve_binary( + self, question: BinaryQuestion + ) -> Optional[BinaryResolution]: + """Resolve a binary question using multi-agent workflow. + + Workflow: + 1. Create researcher, resolver, and orchestrator agents + 2. Run orchestrator (which coordinates handoffs) + 3. Parse final output with structured output + 4. 
Convert to typed resolution + + Args: + question: Binary question to resolve + + Returns: + BinaryResolution (True/False/AMBIGUOUS/ANNULLED) or None + + Raises: + StructuredOutputParsingError: If parsing fails + """ + logger.info(f"Creating agent workflow for binary question {question.id_of_post}") + + # Create agents + researcher = self.researcher_agent_factory(question) + resolver = self._create_resolver_agent(question) + orchestrator = self._create_orchestrator_agent(researcher, resolver) + + logger.info("Running orchestrator agent (max_turns=10)") + + # Run the workflow (non-streaming) + result = await AgentRunner.run( + orchestrator, "Please begin the resolution process.", max_turns=10 + ) + + logger.info( + f"Agent workflow completed. Final output length: " + f"{len(result.final_output)} chars" + ) + logger.debug(f"Final output preview: {result.final_output[:200]}...") + + # Parse structured output with error handling + try: + resolution_result = await structure_output( + result.final_output, + BinaryResolutionResult, + model=self.structure_output_model, + ) + logger.info( + f"Successfully parsed resolution: {resolution_result.resolution_status}" + ) + except Exception as e: + logger.error(f"Failed to parse structured output: {e}", exc_info=True) + raise StructuredOutputParsingError( + raw_output=result.final_output, original_error=e + ) from e + + # Convert to typed resolution + typed_resolution = self._convert_to_binary_resolution(resolution_result) + logger.info(f"Final resolution: {typed_resolution}") + + return typed_resolution + + def _create_default_researcher(self, question: BinaryQuestion) -> AiAgent: + """Create default Perplexity-based researcher agent. + + This agent performs multiple strategic searches to gather + comprehensive information about the question's resolution status. 
+ + Args: + question: The question to research + + Returns: + Configured researcher AiAgent + """ + logger.debug("Creating default Perplexity-based researcher agent") + + instructions = self._build_researcher_instructions(question) + + return AiAgent( + name="Resolution Researcher", + instructions=instructions, + model=AgentSdkLlm(model=self.model), + tools=[perplexity_reasoning_pro_search], + handoffs=["resolver"], + ) + + def _create_resolver_agent(self, question: BinaryQuestion) -> AiAgent: + """Create resolver agent that determines final resolution. + + This agent receives research from the researcher agent and + makes the final resolution determination. + + Args: + question: The question being resolved + + Returns: + Configured resolver AiAgent + """ + logger.debug("Creating resolver agent") + + instructions = self._build_resolver_instructions(question) + + return AiAgent( + name="resolver", + instructions=instructions, + model=AgentSdkLlm(model=self.model), + tools=[], # No tools - only analyzes research + handoffs=[], # Terminal agent + ) + + def _create_orchestrator_agent( + self, researcher: AiAgent, resolver: AiAgent + ) -> AiAgent: + """Create minimal orchestrator that enables handoffs. + + This is a simple coordinator that connects the researcher + and resolver agents. + + Args: + researcher: The researcher agent + resolver: The resolver agent + + Returns: + Minimal orchestrator AiAgent + """ + logger.debug("Creating minimal orchestrator agent") + + instructions = clean_indents( + """ + You are coordinating a question resolution process. + + Your task is simple: + 1. Hand off to the Resolution Researcher to gather information + 2. The researcher will hand off to the resolver when ready + 3. The resolver will provide the final resolution + + Begin by handing off to the researcher. 
+ """ + ) + + return AiAgent( + name="Resolution Orchestrator", + instructions=instructions, + model=AgentSdkLlm(model=self.model), + tools=[], + handoffs=[researcher, resolver], + ) + + def _build_researcher_instructions(self, question: BinaryQuestion) -> str: + """Build detailed instructions for the researcher agent. + + Args: + question: The question being researched + + Returns: + Formatted instruction string + """ + logger.debug("Building researcher instructions") + + return clean_indents( + f""" + # Your Role + + You are a research assistant gathering information to resolve a forecasting question. + + # The Question + + {question.give_question_details_as_markdown()} + + # Your Task + + Perform multiple strategic Perplexity searches to thoroughly investigate: + + 1. **Current Status**: What is the current state of affairs related to this question? + 2. **Resolution Criteria**: Have the resolution criteria been met? + 3. **Timeline Check**: Consider the scheduled resolution date and current date + 4. **Verification**: Cross-check information from multiple sources + 5. **Edge Cases**: Look for any ambiguities, disputes, or complications + + # Search Strategy Guidelines + + - Run 3-5 searches total (don't overdo it) + - Run searches in parallel when they're independent + - Use follow-up searches based on initial findings + - Focus on authoritative and recent sources + - Note any contradictions or uncertainties you find + - Pay special attention to dates and timelines + + # Example Search Sequence + + 1. Broad search: "Current status of [topic] as of [current date]" + 2. Specific search: "Has [specific criterion] occurred?" + 3. Verification: "Latest news about [topic]" + 4. 
(Optional) Follow-up based on findings + + # Important Reminders + + - Be thorough but efficient + - Document your findings clearly + - Note the sources and dates of information + - If you find conflicting information, document both sides + - When ready, hand off your research to the resolver + + # Handoff + + When you've gathered sufficient information, hand off to the resolver + with a comprehensive summary of your research findings. + """ + ) + + def _build_resolver_instructions(self, question: BinaryQuestion) -> str: + """Build detailed instructions for the resolver agent. + + Args: + question: The question being resolved + + Returns: + Formatted instruction string + """ + logger.debug("Building resolver instructions") + + return clean_indents( + f""" + # Your Role + + You are a resolution analyst determining the final resolution status + of a forecasting question based on research provided to you. + + # The Question + + {question.give_question_details_as_markdown()} + + # Resolution Options + + You must determine one of the following resolutions: + + ## TRUE + - Resolution criteria have been definitively met + - The outcome is YES/positive + - There is strong evidence supporting this + + ## FALSE + - Resolution criteria have been definitively met + - The outcome is NO/negative + - There is strong evidence supporting this + + ## AMBIGUOUS + - The resolution criteria occurred + - BUT the outcome is unclear or disputed + - Multiple interpretations are reasonable + - Example: A law passed but its scope is unclear + + ## ANNULLED + - A fundamental assumption of the question is false + - The question itself is invalid or malformed + - Example: Question asks about a company that never existed + + ## NOT_YET_RESOLVABLE + - Insufficient information currently available + - OR the resolution date/event hasn't occurred yet + - OR you cannot confidently determine the resolution + - **BE CONSERVATIVE: Default to this when uncertain** + + # Analysis Guidelines + + 1. 
**Review the research** provided by the researcher carefully + 2. **Check the timeline**: Has the scheduled resolution date passed? + 3. **Assess the evidence**: Is it strong enough for a definitive resolution? + 4. **Consider ambiguity**: Is the outcome clear or disputed? + 5. **Be conservative**: If uncertain, return NOT_YET_RESOLVABLE + + # Critical Distinctions + + **AMBIGUOUS vs ANNULLED:** + - AMBIGUOUS: Question is valid, but answer is unclear + - ANNULLED: Question itself is invalid/malformed + + **FALSE vs NOT_YET_RESOLVABLE:** + - FALSE: Definitively did NOT happen + - NOT_YET_RESOLVABLE: Might still happen or unclear if it happened + + # Output Format + + Provide your analysis in the following format: + + **Resolution Status**: [Your chosen status] + + **Reasoning**: [2-4 sentences explaining your decision] + + **Key Evidence**: + - [Evidence point 1] + - [Evidence point 2] + - [Evidence point 3] + - [Evidence point 4 - optional] + - [Evidence point 5 - optional] + + # Important + + - Be thorough in your reasoning + - Cite specific information from the research + - Acknowledge uncertainties when present + - Your output will be parsed programmatically, so follow the format exactly + """ + ) + + def _convert_to_binary_resolution( + self, result: BinaryResolutionResult + ) -> Optional[BinaryResolution]: + """Convert structured result to typed binary resolution. 
+ + Args: + result: Parsed resolution result + + Returns: + Typed BinaryResolution or None + + Raises: + ValueError: If resolution status is unexpected + """ + logger.debug(f"Converting result status: {result.resolution_status}") + + match result.resolution_status: + case "TRUE": + return True + case "FALSE": + return False + case "AMBIGUOUS": + return CanceledResolution.AMBIGUOUS + case "ANNULLED": + return CanceledResolution.ANNULLED + case "NOT_YET_RESOLVABLE": + return None + case _: + raise ValueError( + f"Unexpected resolution status: {result.resolution_status}" + ) + + +async def main(): + """Test harness for OpenAI Resolver. + + Runs the resolver against 10 random binary questions from the AIB Fall 2025 tournament + and outputs a confusion matrix comparing predicted vs actual resolutions. + """ + from forecasting_tools.agents_and_tools.auto_resolver.assess import ( + ResolutionAssesser, + ) + from forecasting_tools import MetaculusClient, ApiFilter + from dotenv import load_dotenv + import random + + load_dotenv() + + logger.info("Starting OpenAI Resolver assessment") + + # Fetch all resolved binary questions from AIB Fall 2025 + client = MetaculusClient() + filter = ApiFilter( + allowed_tournaments=[MetaculusClient.AIB_FALL_2025_ID], + allowed_statuses=["resolved"], + allowed_types=["binary"], + group_question_mode="exclude", + order_by="-published_time" + ) + + logger.info("Fetching resolved binary questions from AIB Fall 2025...") + all_questions = await client.get_questions_matching_filter(filter) + + # Randomly sample 10 questions + sample_size = min(20, len(all_questions)) + sampled_questions = random.sample(all_questions, sample_size) + question_ids = [q.id_of_post for q in sampled_questions if q.id_of_post is not None] + + logger.info(f"Selected {len(question_ids)} random questions for assessment") + + # Create resolver + resolver = OpenAIResolver() + + # Create assessor with specific question IDs + assesser = ResolutionAssesser( + resolver, 
allowed_types=["binary"], questions=question_ids + ) + + logger.info(f"Running assessment on {len(question_ids)} questions") + + # Run assessment + report = await assesser.assess_resolver() + + # Print results + print("\n" + "=" * 60) + print("OpenAI Resolver Assessment Results") + print(f"Tested on {len(question_ids)} random questions from AIB Fall 2025") + print("=" * 60) + print(report) + print("=" * 60) + + logger.info("Assessment complete") + + +if __name__ == "__main__": + import asyncio + from dotenv import load_dotenv + + load_dotenv() + asyncio.run(main()) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py new file mode 100644 index 00000000..cf85dbef --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py @@ -0,0 +1,36 @@ +"""Pydantic models for structured resolution output parsing. + +This module defines the data models used to parse the output from the resolver +agent into typed resolution decisions. +""" + +from pydantic import BaseModel, Field +from typing import Literal + + +class BinaryResolutionResult(BaseModel): + """Structured output for binary question resolution. + + This model is used to parse the final output from the resolver agent + into a typed resolution decision with supporting evidence and reasoning. 
+ + Attributes: + resolution_status: The final resolution determination (TRUE, FALSE, + AMBIGUOUS, ANNULLED, or NOT_YET_RESOLVABLE) + reasoning: A 2-4 sentence explanation of why this resolution was chosen + key_evidence: A list of 3-5 key pieces of evidence supporting this resolution + """ + + resolution_status: Literal[ + "TRUE", "FALSE", "AMBIGUOUS", "ANNULLED", "NOT_YET_RESOLVABLE" + ] = Field(description="The final resolution determination") + + reasoning: str = Field( + description="2-4 sentence explanation of why this resolution was chosen" + ) + + key_evidence: list[str] = Field( + description="3-5 key pieces of evidence supporting this resolution", + min_length=3, + max_length=5, + ) From 85149aa3bf855dcb30bdc9175fb9f473a527c3e5 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Fri, 27 Feb 2026 15:47:33 -0700 Subject: [PATCH 05/14] Geting more detailed logs on failure --- .../auto_resolver/__init__.py | 12 + .../agents_and_tools/auto_resolver/assess.py | 285 ++++++++++++++++-- .../{openai_resolver.py => perplexity.py} | 58 +++- 3 files changed, 329 insertions(+), 26 deletions(-) rename forecasting_tools/agents_and_tools/auto_resolver/{openai_resolver.py => perplexity.py} (85%) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/__init__.py index 273fbc6a..df6961e3 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/__init__.py @@ -23,6 +23,18 @@ class AutoResolver(ABC): @abstractmethod async def resolve_question(self, question: MetaculusQuestion) -> Optional[ResolutionType]: raise NotImplementedError() + + def get_last_resolution_metadata(self) -> dict | None: + """ + Returns metadata from the last resolution attempt, such as chain of thought or key evidence. + + Subclasses should override this to provide additional context about how the resolution + was determined. 
@dataclass
class QuestionAssessmentResult:
    """Per-question outcome of a resolver assessment run.

    Holds the question's identifying details alongside the actual and
    predicted resolutions, plus the optional supporting evidence and the
    human-readable outcome label assigned during assessment.
    """

    question_id: int
    question_title: str
    question_text: str
    question_url: str
    actual_resolution: ResolutionType
    predicted_resolution: Optional[ResolutionType]
    key_evidence: Optional[list[str]] = None
    outcome_category: Optional[str] = None
def binary_results_table(self) -> str:
    """
    Render the binary confusion matrix as a markdown table.

    Rows are the actual resolutions (True / False / Cancelled); columns are
    the resolver's predictions. The "Unmatched" column counts cases where the
    resolver returned an unexpected value (e.g., NotImplemented for
    unsupported question types, or other edge cases); those cases are logged
    as warnings for debugging.

    Returns:
        str: A markdown formatted confusion matrix table
    """
    header_lines = [
        "| Actual \\ Predicted | True | False | Unresolvable | Cancelled | Unmatched |",
        "|--------------------|------|-------|--------------|-----------|-----------|",
    ]
    # (label, predicted-True, predicted-False, unresolvable, cancelled, unmatched)
    count_rows = [
        ("True", self.n_tp, self.n_fn, self.n_mp, self.n_ct, self.n_um_true),
        ("False", self.n_fp, self.n_tn, self.n_mn, self.n_cf, self.n_um_false),
        ("Cancelled", self.n_tc, self.n_fc, self.n_mc, self.n_cc, self.n_um_ca),
    ]
    body_lines = [
        "| {} | {} |".format(label, " | ".join(str(count).rjust(3) for count in counts))
        for label, *counts in count_rows
    ]
    return "\n".join(header_lines + body_lines)
+ + Returns: + str: A complete markdown formatted assessment report + """ + lines = [] + lines.append("# Auto Resolver Assessment Report\n") + lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + + # Summary section + lines.append("## Summary\n") + lines.append(self.binary_results_table()) + lines.append("") + + # Results calculation + total = self.n_tp + self.n_fp + self.n_fn + self.n_tn + self.n_mp + self.n_mn + \ + self.n_ct + self.n_cf + self.n_tc + self.n_fc + self.n_mc + self.n_cc + \ + self.n_um_true + self.n_um_false + self.n_um_ca + correct = self.n_tp + self.n_tn + self.n_cc + accuracy = (correct / total * 100) if total > 0 else 0 + + lines.append(f"**Total Questions:** {total}") + lines.append(f"**Correct Predictions:** {correct} ({accuracy:.1f}%)") + lines.append("") + + # Detailed results section + lines.append("## Detailed Results\n") + lines.append("") + + # Sort question results by category, then by question ID + sorted_results = sorted( + self.question_results.values(), + key=lambda x: (x.outcome_category or "", x.question_id) + ) + + for result in sorted_results: + lines.append(f"### Question {result.question_id}\n") + lines.append(f"**Title:** {result.question_title}") + lines.append(f"**URL:** {result.question_url}") + lines.append("") + + # Question contents + lines.append(f"**Question Contents:**\n") + lines.append("> " + result.question_text.replace("\n", "\n> ")) + lines.append("") + + # Resolution comparison table + lines.append("| Output Resolution | Correct Resolution |") + lines.append("|-------------------|--------------------|") + actual_str = self._resolution_to_str(result.actual_resolution) + predicted_str = self._resolution_to_str(result.predicted_resolution) + lines.append(f"| {predicted_str} | {actual_str} |") + lines.append("") + + # Key evidence section + if result.key_evidence: + lines.append("**Key Evidence:**") + for evidence in result.key_evidence: + lines.append(f"- {evidence}") + 
def write_to_file(self, directory: str = "reports") -> Path:
    """
    Writes the detailed report to a markdown file in the specified directory.

    The directory is created if it does not already exist, and the filename
    embeds a second-resolution timestamp so repeated runs do not collide.

    Args:
        directory: Directory path to write the report to (default: "reports")

    Returns:
        Path object pointing to the written file
    """
    # FIX: removed the redundant function-local `from pathlib import Path`;
    # Path is already imported at module level in this file.
    reports_dir = Path(directory)
    reports_dir.mkdir(parents=True, exist_ok=True)

    # Timestamped filename, e.g. assessment_report_20260227_155703.md
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = reports_dir / f"assessment_report_{timestamp}.md"

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(self.detailed_report())

    # Lazy %-formatting: the message is only built if INFO logging is enabled.
    logging.info("Assessment report written to %s", filepath)
    return filepath
+ + Args: + resolver: AutoResolver to assess + allowed_types: List of question types to include + questions: Optional list of question IDs or URLs to assess + tournaments: Optional list of tournament IDs to load + max_concurrency: Maximum number of questions to resolve concurrently (default: 3) + """ self.client = MetaculusClient() self.questions: dict[int, MetaculusQuestion] = {} self.allowed_types = allowed_types self.resolver = resolver self.tournament_ids = tournaments self.question_ids = questions + self._concurrency_limiter = asyncio.Semaphore(max_concurrency) def _insert_question(self, question: MetaculusQuestion): @@ -137,7 +296,8 @@ def _load_questions(self, questions: list[int | str]): for question in questions: try: self._load_question(question) - except ValueError: + except ValueError as e: + logging.warning(f"Skipping question {question}: {e}") continue async def _load_tournaments(self, tournament_ids: list[str | int]): @@ -154,54 +314,129 @@ async def _load_tournaments(self, tournament_ids: list[str | int]): for question in questions: try: self._insert_question(question) - except ValueError: + except ValueError as e: + question_id = getattr(question, 'id_of_post', 'unknown') + logging.warning(f"Skipping question {question_id}: {e}") continue async def _load_tournament(self, tournament_id: str | int): await self._load_tournaments([tournament_id]) + async def _resolve_single_question( + self, question: MetaculusQuestion, index: int + ) -> tuple[int, Optional[ResolutionType], dict | None]: + async with self._concurrency_limiter: + resolution = await self.resolver.resolve_question(question) + metadata = self.resolver.get_last_resolution_metadata() + return (index, resolution, metadata) + async def assess_resolver(self) -> ResolutionAssessmentReport: - # Load tournaments if not already loaded + """ + Assess the resolver against the loaded questions. 
+ + Uses concurrent execution (via asyncio.gather) to resolve multiple + questions in parallel, controlled by a semaphore to limit concurrency. + + Returns: + ResolutionAssessmentReport: Detailed report of resolver performance + """ + report = ResolutionAssessmentReport() + if self.tournament_ids: await self._load_tournaments(self.tournament_ids) if self.question_ids: self._load_questions(self.question_ids) - report = ResolutionAssessmentReport() - for question_id, question in self.questions.items(): + + question_items = list(self.questions.items()) + + tasks = [ + self._resolve_single_question(question, idx) + for idx, (question_id, question) in enumerate(question_items) + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + + for result, (question_id, question) in zip(results, question_items): + if isinstance(result, Exception): + logging.error(f"Exception resolving question {question_id}: {result}") + continue + + index, test_resolution, metadata = result true_resolution = question.typed_resolution - test_resolution = await self.resolver.resolve_question(question) + + key_evidence = metadata.get("key_evidence") if metadata else None + if isinstance(true_resolution, BinaryResolution): - # Check if test_resolution is CanceledResolution - test_is_cancelled = isinstance(test_resolution, CanceledResolution) - true_is_cancelled = isinstance(true_resolution, CanceledResolution) - + outcome_category = None + match true_resolution, test_resolution: # True actual resolution cases case True, True: report.tp.append(question_id) + outcome_category = "True Positive" case True, False: report.fn.append(question_id) + outcome_category = "False Negative" case True, None: report.mp.append(question_id) + outcome_category = "Missed Positive" # False actual resolution cases case False, True: report.fp.append(question_id) + outcome_category = "False Positive" case False, False: report.tn.append(question_id) + outcome_category = "True Negative" case False, None: 
report.mn.append(question_id) + outcome_category = "Missed Negative" # Cancelled actual resolution cases case CanceledResolution(), True: report.tc.append(question_id) + outcome_category = "True Incorrectly Predicted as True" case CanceledResolution(), False: report.fc.append(question_id) + outcome_category = "Cancelled Incorrectly Predicted as False" case CanceledResolution(), None: report.mc.append(question_id) + outcome_category = "Cancelled Predicted as Unresolvable" case CanceledResolution(), CanceledResolution(): report.cc.append(question_id) + outcome_category = "Correct Cancel" # True/False actual with Cancelled predicted case True, CanceledResolution(): report.ct.append(question_id) + outcome_category = "True Incorrectly Predicted as Cancelled" case False, CanceledResolution(): report.cf.append(question_id) - return report + outcome_category = "False Incorrectly Predicted as Cancelled" + # Catch-all for unmatched cases (edge cases, errors, NotImplemented, etc.) + case _: + logging.warning( + f"Question {question_id} had unmatched prediction: " + f"true_resolution={true_resolution}, test_resolution={test_resolution}" + ) + if true_resolution == True: + report.um_true.append(question_id) + outcome_category = "Unmatched - True" + elif true_resolution == False: + report.um_false.append(question_id) + outcome_category = "Unmatched - False" + elif isinstance(true_resolution, CanceledResolution): + report.um_ca.append(question_id) + outcome_category = "Unmatched - Cancelled" + else: + continue + + # Create detailed result + question_result = QuestionAssessmentResult( + question_id=question_id, + question_title=question.question_text[:100] if question.question_text else "No title", + question_text=question.question_text or "No text available", + question_url=question.page_url or f"https://metaculus.com/{question_id}", + actual_resolution=true_resolution, + predicted_resolution=test_resolution, + key_evidence=key_evidence, + outcome_category=outcome_category, + ) + 
report.question_results[question_id] = question_result + return report \ No newline at end of file diff --git a/forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py b/forecasting_tools/agents_and_tools/auto_resolver/perplexity.py similarity index 85% rename from forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py rename to forecasting_tools/agents_and_tools/auto_resolver/perplexity.py index 22de8656..81b38c9d 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/openai_resolver.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/perplexity.py @@ -1,4 +1,5 @@ -"""Multi-agent question resolver using OpenAI Agents SDK. +""" +Multi-agent question resolver using OpenAI Agents SDK and perplexity. This module implements a multi-agent architecture for resolving Metaculus forecasting questions. It uses a pluggable researcher pattern with a default @@ -104,6 +105,7 @@ def __init__( timeout=timeout, ) self.timeout = timeout + self._last_resolution_metadata = None logger.info( f"Initialized OpenAIResolver with model={model}, timeout={timeout}s" ) @@ -144,6 +146,7 @@ async def _resolve_binary( 2. Run orchestrator (which coordinates handoffs) 3. Parse final output with structured output 4. Convert to typed resolution + 5. Store metadata (key_evidence) for later retrieval Args: question: Binary question to resolve @@ -190,12 +193,26 @@ async def _resolve_binary( raw_output=result.final_output, original_error=e ) from e + # Store metadata for later retrieval + self._last_resolution_metadata = { + "reasoning": resolution_result.reasoning, + "key_evidence": resolution_result.key_evidence, + } + # Convert to typed resolution typed_resolution = self._convert_to_binary_resolution(resolution_result) logger.info(f"Final resolution: {typed_resolution}") return typed_resolution + def get_last_resolution_metadata(self) -> dict | None: + """Returns metadata from the last resolution attempt. 
+ + Returns: + dict with 'reasoning' and 'key_evidence' keys, or None if no resolution has been attempted + """ + return self._last_resolution_metadata + def _create_default_researcher(self, question: BinaryQuestion) -> AiAgent: """Create default Perplexity-based researcher agent. @@ -312,6 +329,7 @@ def _build_researcher_instructions(self, question: BinaryQuestion) -> str: 3. **Timeline Check**: Consider the scheduled resolution date and current date 4. **Verification**: Cross-check information from multiple sources 5. **Edge Cases**: Look for any ambiguities, disputes, or complications + 6. **Validity Check**: Investigate whether the question's subject is valid/possible (for potential annulment) # Search Strategy Guidelines @@ -322,12 +340,40 @@ def _build_researcher_instructions(self, question: BinaryQuestion) -> str: - Note any contradictions or uncertainties you find - Pay special attention to dates and timelines + # Decomposition Strategy for Multi-Entity Questions + + If a question involves multiple entities (companies, people, organizations, etc.): + + 1. **First attempt**: Search for comprehensive data about all entities + 2. **If comprehensive search fails**: Decompose the question and search each entity individually + 3. **Example**: For "Did all Magnificent Seven stocks decline 50%?", search: + - "Microsoft MSFT all-time high 2025" + - "Nvidia NVDA 2025 stock performance" + - "Apple AAPL 2025 pricing" + - And so on for each company + 4. **Then aggregate**: Combine individual findings to answer the comprehensive question + + # Detecting Annulled/Invalid Questions + + Some questions may be fundamentally invalid (annulled). 
Look for: + + - Studies/experiments that were never conducted, cancelled, or abandoned + - Questions about entities that never existed or were fundamentally misconceived + - Research projects that lost funding, had impossible criteria, or were invalid from the start + - Any indication that the question's subject is impossible or doesn't exist + + **When you cannot find evidence of an event occurring:** + - Search specifically for: "[subject] cancelled", "[subject] abandoned", "[subject] never conducted" + - Search for: "[subject] funding withdrawn", "[subject] fundamental problems", "[subject] invalid" + - If you find evidence the subject was never valid/possible, note this for potential ANNULLED resolution + # Example Search Sequence 1. Broad search: "Current status of [topic] as of [current date]" 2. Specific search: "Has [specific criterion] occurred?" 3. Verification: "Latest news about [topic]" 4. (Optional) Follow-up based on findings + 5. (If no results found) Validity check: "[topic] cancelled", "[topic] validity", "[topic] problems" # Important Reminders @@ -335,6 +381,8 @@ def _build_researcher_instructions(self, question: BinaryQuestion) -> str: - Document your findings clearly - Note the sources and dates of information - If you find conflicting information, document both sides + - Decompose multi-entity questions when comprehensive searches fail + - Actively search for evidence of annulment/invalidity when no results are found - When ready, hand off your research to the resolver # Handoff @@ -530,6 +578,14 @@ async def main(): print(report) print("=" * 60) + # Save detailed report to reports directory + try: + report_path = report.write_to_file(directory="reports") + print(f"\nDetailed report saved to: {report_path}") + except Exception as e: + logger.error(f"Failed to save report to reports directory: {e}", exc_info=True) + print(f"\nWarning: Could not save report to reports directory: {e}") + logger.info("Assessment complete") From 
9465ed856776c92811c8e354bb8654361fdd2bb3 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Fri, 27 Feb 2026 16:45:21 -0700 Subject: [PATCH 06/14] Pushing a report --- .../assessment_report_20260227_155703.md | 444 ++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md diff --git a/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md b/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md new file mode 100644 index 00000000..8ce49ac8 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md @@ -0,0 +1,444 @@ +# Auto Resolver Assessment Report + +**Generated:** 2026-02-27 15:57:03 + +## Summary + +| Actual \ Predicted | True | False | Unresolvable | Cancelled | Unmatched | +|--------------------|------|-------|--------------|-----------|-----------| +| True | 8 | 1 | 0 | 0 | 0 | +| False | 1 | 10 | 0 | 0 | 0 | +| Cancelled | 0 | 0 | 0 | 0 | 0 | + +**Total Questions:** 20 +**Correct Predictions:** 18 (90.0%) + +## Detailed Results + + +### Question 39996 + +**Title:** Will any of these listed US federal government officials be out of their position before October 1, +**URL:** https://www.metaculus.com/questions/39996 + +**Question Contents:** + +> Will any of these listed US federal government officials be out of their position before October 1, 2025? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | TRUE | + +**Key Evidence:** +- “My searches found no evidence of any cabinet secretaries or cabinet-level officials from the specified list departing during September 14 - October 1, 2025.” +- “While there were some departures at federal agencies during this period, they were not from the positions specified in the question:” +- “One source specifically noted that ‘Trump ends 2025 with no cabinet turnover,’ indicating the core cabinet remained stable through the end of 2025.” +- “Despite extensive searches using various terms and date ranges, I found no evidence of any of the 100 specifically listed officials leaving their positions during the September 14 - October 1, 2025 timeframe.” + +--- + +### Question 40939 + +**Title:** Will Iran announce a new capital location before January 3, 2026? +**URL:** https://www.metaculus.com/questions/40939 + +**Question Contents:** + +> Will Iran announce a new capital location before January 3, 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | FALSE | + +**Key Evidence:** +- “Iran officially announced a new capital location on January 8, 2025 - well before the January 3, 2026 deadline.” +- “Official Announcement Date: January 8, 2025” +- “Government spokeswoman Fatemeh Mohajerani made the official announcement” +- “Specific Location Announced: Makran Region” +- “This meets the resolution criteria requiring ‘at least at the level of a specific county’” + +--- + +### Question 39559 + +**Title:** Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? +**URL:** https://www.metaculus.com/questions/39559 + +**Question Contents:** + +> Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “Alexandre de Moraes remained a minister of Brazil's Supreme Federal Court through January 1, 2026” +- “No evidence found of any departure, resignation, retirement, or impeachment before the January 1, 2026 deadline” +- “He continued serving actively as a justice well into 2026” +- “He was issuing judicial orders as late as January 30, 2026” +- “No resignation occurred” + +--- + +### Question 39564 + +**Title:** Will the United Nations have more than 193 member states before January 1, 2026? +**URL:** https://www.metaculus.com/questions/39564 + +**Question Contents:** + +> Will the United Nations have more than 193 member states before January 1, 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- The United Nations has 193 member states as of February 2026 +- This number has remained unchanged since South Sudan joined as the 193rd member on July 14, 2011 +- No new member states were admitted in 2024-2025 +- The US vetoed the Security Council recommendation on April 18, 2024 +- Palestine remains a non-member observer state + +--- + +### Question 39572 + +**Title:** Will the US pass a federal bill on AI regulation before January 1, 2026? +**URL:** https://www.metaculus.com/questions/39572 + +**Question Contents:** + +> Will the US pass a federal bill on AI regulation before January 1, 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “No comprehensive federal AI legislation was passed: Despite multiple legislative efforts in 2024 and 2025, no federal bill primarily focused on AI regulation was successfully passed by both the House and Senate and signed into law before January 1, 2026.” +- “Executive Orders, not legislation: The main federal AI actions were Executive Orders: - Biden's Executive Order 14141 (January 14, 2025) on AI infrastructure - Trump's Executive Order 14365 (December 11, 2025) on national AI policy framework” +- “Limited enacted legislation: The only AI-related federal statute was the TAKE IT DOWN Act (signed May 2025), which specifically addressed nonconsensual distribution of intimate images - this does not meet the criteria as it's not primarily focused on general AI regulation.” +- “None of these criteria were met.” + +--- + +### Question 39584 + +**Title:** Will a new war or a substantial escalation to a previous war kill at least 5,000 people in 2025? +**URL:** https://www.metaculus.com/questions/39584 + +**Question Contents:** + +> Will a new war or a substantial escalation to a previous war kill at least 5,000 people in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- "The India-Pakistan conflict was genuinely new but appears to have been brief with limited casualties" +- "However, the search results do NOT indicate casualties reached 5,000" +- "Sudan and Gaza had massive death tolls but were continuations of conflicts that began in 2023" +- "Syrian Network for Human Rights documented 3,338 deaths in 2025" + +--- + +### Question 39588 + +**Title:** Will Boris Johnson return as the leader of the UK Conservative Party before 2026? 
+**URL:** https://www.metaculus.com/questions/39588 + +**Question Contents:** + +> Will Boris Johnson return as the leader of the UK Conservative Party before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “The resolution criteria have NOT been met.” +- “Boris Johnson did not return as leader of the UK Conservative Party at any time between October 24, 2022 and January 1, 2026.” +- “October 25, 2022: Rishi Sunak became party leader after Truss's resignation (Johnson briefly considered a comeback but withdrew)” +- “Lost MP Status: Johnson resigned as MP for Uxbridge and South Ruislip on June 12, 2023, following the Privileges Committee investigation into ‘Partygate’” +- “The question should resolve as No.” + +--- + +### Question 39591 + +**Title:** Will an application to ban AfD be filed at the Federal Constitutional Court before 2026? +**URL:** https://www.metaculus.com/questions/39591 + +**Question Contents:** + +> Will an application to ban AfD be filed at the Federal Constitutional Court before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “No Federal Constitutional Court Application Filed: Despite the parliamentary initiative, no formal application was actually filed at the Federal Constitutional Court by any of the authorized entities (Federal Government, Bundestag, or Bundesrat) before January 1, 2026.” +- “Bundestag Debate (January 30, 2025): The Bundestag held its first-ever plenary debate on banning the AfD. 
However, the motion was NOT voted upon and was instead referred back to committee to avoid political risks and potential benefit to the AfD.” +- “Metaculus Confirmation: A Metaculus forecasting question asking the identical question resolved with ‘No,’ confirming that no such application was submitted before 2026.” +- “Conclusion: The resolution criteria have NOT been met. No application to ban AfD was filed at the Federal Constitutional Court by the Federal Government, Bundestag, or Bundesrat before January 1, 2026.” + +--- + +### Question 39608 + +**Title:** Will bitcoin reach $200k in 2025? +**URL:** https://www.metaculus.com/questions/39608 + +**Question Contents:** + +> Will bitcoin reach $200k in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- The question asks whether Bitcoin reached $200,000 USD as a "High" price on any single day in 2025, according to Yahoo Finance historical data. +- Bitcoin's **maximum price in 2025 was approximately $126,296**, reached on **October 6, 2025** +- Bitcoin never reached $200,000 at any point during 2025 +- **Resolution Status**: The criteria for a "Yes" resolution have NOT been met. + +--- + +### Question 39613 + +**Title:** Will the Nonlinear Fund have more than eight full-time equivalent employees on January 1, 2026? +**URL:** https://www.metaculus.com/questions/39613 + +**Question Contents:** + +> Will the Nonlinear Fund have more than eight full-time equivalent employees on January 1, 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “Key Observation: Throughout all my searches, I found no evidence that Nonlinear Fund ever had or was approaching 9+ full-time equivalent employees.” +- “The organization appears to have always operated as a small team of 4-5 core members plus advisors.” +- “Evidence suggests NO: - The organization's core team consists of approximately 4-5 people - No evidence of expansion to 9+ employees was found” +- “Based on the available evidence, the Nonlinear Fund appears to have remained a small organization (4-5 core team members) and shows no indication of having expanded to 9+ full-time equivalent employees by January 1, 2026.” + +--- + +### Question 39615 + +**Title:** Will there be major civil unrest or martial law in the Philippines in 2025? +**URL:** https://www.metaculus.com/questions/39615 + +**Question Contents:** + +> Will there be major civil unrest or martial law in the Philippines in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- Result: No martial law declared - Despite significant civil unrest throughout 2025, there is no evidence that President Ferdinand Marcos Jr. declared martial law anywhere in the Philippines during 2025 +- Total deaths from civil unrest: 2 fatalities +- Largest arrest event: September 21, 2025 protests with 224 arrests (including 95 minors) +- Based on this comprehensive research, none of the three resolution criteria were met + +--- + +### Question 40284 + +**Title:** Will there be a political protest with at least 600,000 participants in France before 2026? +**URL:** https://www.metaculus.com/questions/40284 + +**Question Contents:** + +> Will there be a political protest with at least 600,000 participants in France before 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Key Evidence:** +- “Official Interior Ministry count: 506,000 participants nationwide (with 55,000 in Paris)” +- “This was the largest single protest during the relevant timeframe” +- “The count fell short of the required 600,000 threshold by approximately 94,000 participants” +- “Answer: The question should resolve as NO - there was no political protest with at least 600,000 participants according to French Ministry of Interior official reports before January 1, 2026.” + +--- + +### Question 39560 + +**Title:** Will the highest-grossing film of 2025 be part of a franchise? +**URL:** https://www.metaculus.com/questions/39560 + +**Question Contents:** + +> Will the highest-grossing film of 2025 be part of a franchise? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- The year 2025 has concluded, and final box office data is available. +- Highest-Grossing Film of 2025: Ne Zha 2 with approximately $2.2-2.3 billion worldwide gross (sources show slight variations but all confirm it as #1) +- It's a sequel to the original "Ne Zha" (2019) +- Final worldwide box office data is available from reliable sources including Box Office Mojo + +--- + +### Question 40020 + +**Title:** Will the US government be shut down before October 2, 2025? +**URL:** https://www.metaculus.com/questions/40020 + +**Question Contents:** + +> Will the US government be shut down before October 2, 2025? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- Resolution Status: YES - The criteria have been met +- Shutdown Occurred: The US federal government entered shutdown furlough status on **October 1, 2025 at 12:01 AM ET** +- "as announced by the US Office of Personnel Management" - Confirmed: OPM issued special instructions for agencies affected by the lapse in appropriations starting October 1, 2025 +- "before October 2, 2025" - Confirmed: The shutdown began October 1, 2025 at 12:01 AM ET, which is before the October 2, 2025 deadline + +--- + +### Question 40210 + +**Title:** Will the United States bailout Argentina before 2026? +**URL:** https://www.metaculus.com/questions/40210 + +**Question Contents:** + +> Will the United States bailout Argentina before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “On October 9-20, 2025, the US Treasury Department established a $20 billion currency swap line with Argentina's central bank (BCRA)” +- “The BCRA drew $2.5 billion from this swap line in October 2025 - this clearly meets the resolution criteria” +- “All relevant actions occurred in 2025, well before the 2026 deadline” +- “Recommendation: RESOLVE AS YES” + +--- + +### Question 40246 + +**Title:** Will the following companies be added to the S&P 500 before 2026? (ARES) +**URL:** https://www.metaculus.com/questions/40246 + +**Question Contents:** + +> Will the following companies be added to the S&P 500 before 2026? 
(ARES) + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “S&P Dow Jones Indices officially announced on December 8, 2025 that Ares Management Corporation (NYSE: ARES) would be added to the S&P 500 Index.” +- “The inclusion became effective prior to the open of trading on December 11, 2025, replacing Kellanova (NYSE: K) which was being acquired by Mars Inc.” +- “The announcement date of December 8, 2025 was before January 1, 2026, which meets the resolution criteria for this question.” +- “Multiple sources confirm this was an official announcement from S&P Dow Jones Indices, the authoritative body that maintains the S&P 500 Index.” +- “Recommendation: This question should resolve YES.” + +--- + +### Question 40252 + +**Title:** Will the following companies be added to the S&P 500 before 2026? (CVNA) +**URL:** https://www.metaculus.com/questions/40252 + +**Question Contents:** + +> Will the following companies be added to the S&P 500 before 2026? (CVNA) + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “S&P Dow Jones Indices officially announced on December 5, 2025 that Carvana (CVNA) would be added to the S&P 500 Index, effective December 22, 2025.” +- “The announcement was made on December 5, 2025, which is before the January 1, 2026 deadline specified in the resolution criteria.” +- “The announcement came directly from S&P Dow Jones Indices, which is the authoritative source mentioned in the resolution criteria.” +- “The criteria only require the announcement, not the actual inclusion (which also occurred on December 22, 2025)” + +--- + +### Question 40253 + +**Title:** Will a player outside of the top 10 seeds compete in the 2025 FIDE World Cup final? 
+**URL:** https://www.metaculus.com/questions/40253 + +**Question Contents:** + +> Will a player outside of the top 10 seeds compete in the 2025 FIDE World Cup final? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “Javokhir Sindarov (Uzbekistan) defeated Wei Yi (China) 2.5-1.5 in the final” +- “Wei Yi was the 7th seed (2754 rating) - placing him WITHIN the top 10 seeds” +- “Javokhir Sindarov was the 16th seed - placing him OUTSIDE the top 10 seeds” +- “Resolution: YES - A player outside the top 10 seeds (Javokhir Sindarov, 16th seed) did compete in the 2025 FIDE World Cup final and actually won the tournament.” + +--- + +### Question 40285 + +**Title:** Will Ukraine extend martial law beyond November 5, 2025? +**URL:** https://www.metaculus.com/questions/40285 + +**Question Contents:** + +> Will Ukraine extend martial law beyond November 5, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “Ukraine DID extend martial law beyond November 5, 2025 - this is definitively confirmed by multiple official sources.” +- “First extension: From November 5, 2025 to February 3, 2026 (approved by parliament in October 2025, signed by President Zelensky on October 30-31, 2025)” +- “Second extension: From February 3, 2026 to May 4, 2026 (approved by parliament in January 2026 based on Zelensky's decree from January 12, 2026)” +- “Current status as of February 27, 2026: - Martial law remains in effect until May 4, 2026” +- “Official confirmations: - UN depositary notifications confirm the extensions - Multiple parliamentary votes with 330+ MPs supporting each extension - Presidential decrees and parliamentary approvals are well-documented” + +--- + +### Question 40947 + +**Title:** Will Germany enact the Aktivrente before 2026? 
+**URL:** https://www.metaculus.com/questions/40947 + +**Question Contents:** + +> Will Germany enact the Aktivrente before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Key Evidence:** +- “The German Aktivrente was officially enacted in 2025 before the January 3, 2026 deadline” +- “December 5, 2025: Bundestag (Parliament) passage - the critical enactment date” +- “The Bundestag passed the bill on December 5, 2025, which constitutes official enactment” +- “This question will not wait for the law to enter into legal force, if the bill is officially enacted this question will immediately resolve as Yes” +- “RESOLUTION: YES - Germany enacted the Aktivrente before 2026.” + +--- From 09f3fd8e630902e5f3d98ffeb277791b8404e0d7 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sun, 1 Mar 2026 13:29:38 -0700 Subject: [PATCH 07/14] Commiting updates from yesterday --- .../auto_resolver/agentic/__init__.py | 224 +++++++ .../auto_resolver/agentic/__main__.py | 69 ++ .../auto_resolver/agentic/instructions.py | 279 ++++++++ .../agents_and_tools/auto_resolver/assess.py | 223 ++++--- .../auto_resolver/perplexity.py | 597 ------------------ .../auto_resolver/resolution_models.py | 34 +- 6 files changed, 726 insertions(+), 700 deletions(-) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/agentic/__main__.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py delete mode 100644 forecasting_tools/agents_and_tools/auto_resolver/perplexity.py diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py new file mode 100644 index 00000000..dbf9be40 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py @@ -0,0 +1,224 @@ +""" +Agentic question 
resolver using OpenAI Agents. + +This module implements a multi-agent architecture for resolving Metaculus +forecasting questions. It uses a pluggable researcher pattern with a default +Perplexity-based implementation. + +Architecture: + 0. Question rephraser (standalone LLM call) converts past-deadline + questions from future tense to past tense for better research + 1. Orchestrator (minimal) coordinates handoffs between agents + 2. Researcher agent performs multiple strategic searches + 3. Resolver agent analyzes research and determines resolution + 4. Structured output parsing converts to typed resolution +""" + +import logging +from typing import Optional, Callable + +from forecasting_tools.data_models.questions import ( + ResolutionType, + CanceledResolution, + BinaryResolution, +) +from forecasting_tools import MetaculusQuestion, BinaryQuestion +from forecasting_tools.agents_and_tools.auto_resolver.agentic.instructions import * +from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import BinaryResolutionResult +from forecasting_tools.agents_and_tools.auto_resolver import AutoResolver +from forecasting_tools.agents_and_tools.minor_tools import ( + perplexity_reasoning_pro_search, +) +from forecasting_tools.ai_models.agent_wrappers import ( + AgentRunner, + AgentSdkLlm, + AiAgent, +) +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.helpers.structure_output import structure_output +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + + + + +class StructuredOutputParsingError(Exception): + """Raised when structured output parsing fails. + + This exception preserves the raw unparsed output for debugging purposes. 
+ + Attributes: + raw_output: The unparsed output that failed parsing + original_error: The original exception that caused the failure + """ + + def __init__(self, raw_output: str, original_error: Exception): + self.raw_output = raw_output + self.original_error = original_error + super().__init__( + f"Failed to parse structured output: {original_error}\n" + f"Raw output available in exception.raw_output" + ) + + +class AgenticResolver(AutoResolver): + """ + Agentic resolver using agents SDK. + """ + + def __init__(self, + model_for_supervisor: str = "openrouter/anthropic/claude-sonnet-4.6", + model_for_resolver: str = "openrouter/anthropic/claude-sonnet-4.6", + model_for_output_structure: str = "openrouter/anthropic/claude-sonnet-4.6", + model_for_researcher: str = "openrouter/anthropic/claude-sonnet-4.6", + model_for_rephraser: str = "openrouter/anthropic/claude-sonnet-4.6", + timeout: int = 480 + ): + self.model_for_supervisor = model_for_supervisor + self.model_for_resolver = model_for_resolver + self.model_for_output_structure = model_for_output_structure + self.model_for_researcher = model_for_researcher + self.model_for_rephraser = model_for_rephraser + self.timeout = timeout + + + async def resolve_question( + self, question: MetaculusQuestion + ) -> Optional[ResolutionType]: + if isinstance(question, BinaryQuestion): + return await self._resolve_binary(question) + else: + return NotImplemented + + async def _resolve_binary( + self, question: BinaryQuestion + ) -> Optional[BinaryResolution]: + + # Rephrase question if its time context has passed + question = await self._rephrase_question_if_needed(question) + + # Create agents + researcher = self._create_researcher(question) + resolver = self._create_resolver_agent(question) + orchestrator = self._create_orchestrator_agent(researcher, resolver) + + # Run the workflow (non-streaming) + result = await AgentRunner.run( + orchestrator, "Please begin the resolution process.", max_turns=10 + ) + + # Parse 
structured output with error handling + try: + resolution_result = await structure_output( + result.final_output, + BinaryResolutionResult, + model=self.model_for_output_structure, + ) + logger.info( + f"Successfully parsed resolution: {resolution_result.resolution_status}" + ) + except Exception as e: + logger.error(f"Failed to parse structured output: {e}", exc_info=True) + raise StructuredOutputParsingError( + raw_output=result.final_output, original_error=e + ) from e + + # Store metadata for later retrieval + self._last_resolution_metadata = { + "reasoning": resolution_result.reasoning, + "key_evidence": resolution_result.key_evidence, + } + + # Convert to typed resolution + typed_resolution = resolution_result.convert_to_binary_resolution() + logger.info(f"Final resolution: {typed_resolution}") + + return typed_resolution + + async def _rephrase_question_if_needed( + self, question: BinaryQuestion + ) -> BinaryQuestion: + """Rephrase the question into past tense if its time context has passed. + + Uses a lightweight LLM call to determine whether the question's deadline + has already passed and, if so, rephrases it from future tense to past + tense. This makes downstream research searches more effective. + + Args: + question: The original question to potentially rephrase. + + Returns: + A copy of the question with question_text updated if rephrasing + was needed, or the original question unchanged. 
+ """ + prompt = question_rephraser_instructions(question) + llm = GeneralLlm(model=self.model_for_rephraser, temperature=0.0) + + try: + rephrased_text = await llm.invoke(prompt) + rephrased_text = rephrased_text.strip().strip('"').strip("'") + + if rephrased_text and rephrased_text != question.question_text: + logger.info( + f"Question rephrased:\n" + f" Original: {question.question_text}\n" + f" Rephrased: {rephrased_text}" + ) + question = question.model_copy(deep=True) + question.question_text = rephrased_text + else: + logger.info("Question rephraser: no rephrasing needed") + except Exception as e: + logger.warning( + f"Question rephrasing failed, proceeding with original: {e}", + exc_info=True, + ) + + return question + + def _create_researcher(self, question: BinaryQuestion) -> AiAgent: + instructions = researcher_instructions(question) + return AiAgent( + name="Resolution Researcher", + instructions=instructions, + model=AgentSdkLlm(model=self.model_for_researcher), + tools=[perplexity_reasoning_pro_search], + handoffs=[], + ) + + def _create_resolver_agent(self, question: BinaryQuestion) -> AiAgent: + instructions = binary_resolver_instructions(question) + return AiAgent( + name="resolver", + instructions=instructions, + model=AgentSdkLlm(model=self.model_for_resolver), + tools=[], # No tools - only analyzes research + handoffs=[], # Terminal agent + ) + + def _create_orchestrator_agent( + self, researcher: AiAgent, resolver: AiAgent + ) -> AiAgent: + logger.debug("Creating minimal orchestrator agent") + + instructions = clean_indents( + """ + You are coordinating a question resolution process. + + Your task is simple: + 1. Hand off to the Resolution Researcher to gather information + 2. The researcher will hand off to the resolver when ready + 3. The resolver will provide the final resolution + + Begin by handing off to the researcher. 
+ """ + ) + + return AiAgent( + name="Resolution Orchestrator", + instructions=instructions, + model=AgentSdkLlm(model=self.model_for_output_structure), + tools=[], + handoffs=[researcher, resolver], + ) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__main__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__main__.py new file mode 100644 index 00000000..f0d9a503 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__main__.py @@ -0,0 +1,69 @@ +import forecasting_tools +from forecasting_tools.agents_and_tools.auto_resolver.agentic import AgenticResolver +from forecasting_tools.agents_and_tools.auto_resolver.agentic import logger as target_logger +import asyncio +from dotenv import load_dotenv + +load_dotenv() + + +from forecasting_tools.agents_and_tools.auto_resolver.assess import ( + ResolutionAssesser, +) +from forecasting_tools import MetaculusClient, ApiFilter +from dotenv import load_dotenv +import random + +import logging + + +async def main(): + logging.getLogger(target_logger.name).setLevel(logging.INFO) + + # Fetch all resolved binary questions from AIB Fall 2025 + client = MetaculusClient() + filter = ApiFilter( + allowed_tournaments=[MetaculusClient.AIB_FALL_2025_ID], + allowed_statuses=["resolved"], + allowed_types=["binary"], + group_question_mode="exclude", + order_by="-published_time" + ) + + all_questions = await client.get_questions_matching_filter(filter) + + # Randomly sample questions + sample_size = min(20, len(all_questions)) + sampled_questions = random.sample(all_questions, sample_size) + question_ids = [q.id_of_post for q in sampled_questions if q.id_of_post is not None] + + + # Create resolver + resolver = AgenticResolver() + + # Create assessor with specific question IDs + assesser = ResolutionAssesser( + resolver, allowed_types=["binary"], questions=question_ids + ) + + + # Run assessment + report = await assesser.assess_resolver() + + # Print results + print("\n" + "=" * 60) + 
print("OpenAI Resolver Assessment Results") + print(f"Tested on {len(question_ids)} random questions from AIB Fall 2025") + print("=" * 60) + print(report) + print("=" * 60) + + # Save detailed report to reports directory + try: + report_path = report.write_to_file(directory="reports") + print(f"\nDetailed report saved to: {report_path}") + except Exception as e: + print(f"\nWarning: Could not save report to reports directory: {e}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py new file mode 100644 index 00000000..abfd4d86 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py @@ -0,0 +1,279 @@ +""" +To avoid clogging the code file (__init__.py), instruction generation functions are placed here. +""" + +import pendulum + +from forecasting_tools import clean_indents, BinaryQuestion + +def researcher_instructions(question: BinaryQuestion) -> str: + """Build detailed instructions for the researcher agent. + + Args: + question: The question being researched + + Returns: + Formatted instruction string + """ + + return clean_indents( + f""" + # Your Role + + You are a research assistant gathering information to resolve a forecasting question. + + # The Question + + {question.give_question_details_as_markdown()} + + # Your Task + + Perform multiple strategic Perplexity searches to thoroughly investigate: + + 1. **Current Status**: What is the current state of affairs related to this question? + 2. **Resolution Criteria**: Have the resolution criteria been met? + 3. **Timeline Check**: Consider the scheduled resolution date and current date + 4. **Verification**: Cross-check information from multiple sources + 5. **Edge Cases**: Look for any ambiguities, disputes, or complications + 6. 
**Validity Check**: Investigate whether the question's subject is valid/possible (for potential annulment) + + # Search Strategy Guidelines + + - Run 3-5 searches total (don't overdo it) + - Run searches in parallel when they're independent + - Use follow-up searches based on initial findings + - Focus on authoritative and recent sources + - Note any contradictions or uncertainties you find + - Pay special attention to dates and timelines + + # Decomposition Strategy for Multi-Entity Questions + + If a question involves multiple entities (companies, people, organizations, etc.): + + 1. **First attempt**: Search for comprehensive data about all entities + 2. **If comprehensive search fails**: Decompose the question and search each entity individually + 3. **Example**: For "Did all Magnificent Seven stocks decline 50%?", search: + - "Microsoft MSFT all-time high 2025" + - "Nvidia NVDA 2025 stock performance" + - "Apple AAPL 2025 pricing" + - And so on for each company + 4. **Then aggregate**: Combine individual findings to answer the comprehensive question + + # Detecting Annulled/Invalid Questions + + Some questions may be fundamentally invalid (annulled). Look for: + + - Studies/experiments that were never conducted, cancelled, or abandoned + - Questions about entities that never existed or were fundamentally misconceived + - Research projects that lost funding, had impossible criteria, or were invalid from the start + - Any indication that the question's subject is impossible or doesn't exist + + **When you cannot find evidence of an event occurring:** + - Search specifically for: "[subject] cancelled", "[subject] abandoned", "[subject] never conducted" + - Search for: "[subject] funding withdrawn", "[subject] fundamental problems", "[subject] invalid" + - If you find evidence the subject was never valid/possible, note this for potential ANNULLED resolution + + # Example Search Sequence + + 1. Broad search: "Current status of [topic] as of [current date]" + 2. 
Specific search: "Has [specific criterion] occurred?" + 3. Verification: "Latest news about [topic]" + 4. (Optional) Follow-up based on findings + 5. (If no results found) Validity check: "[topic] cancelled", "[topic] validity", "[topic] problems" + + # Important Reminders + + - Be thorough but efficient + - Document your findings clearly + - Note the sources and dates of information + - If you find conflicting information, document both sides + - Decompose multi-entity questions when comprehensive searches fail + - Actively search for evidence of annulment/invalidity when no results are found + - When ready, hand off your research to the resolver + + # Handoff + + When you've gathered sufficient information, hand off to the resolver + with a comprehensive summary of your research findings. + """ + ) + +def question_rephraser_instructions(question: BinaryQuestion) -> str: + """Build instructions for the question rephraser LLM call. + + This prompt asks the LLM to rephrase a forward-looking forecasting + question into past tense when the question's time context has already + passed, making it easier for downstream research agents to search for + information. + + Args: + question: The question to potentially rephrase + + Returns: + Formatted instruction string + """ + today_string = pendulum.now(tz="UTC").strftime("%Y-%m-%d") + scheduled_resolution = ( + question.scheduled_resolution_time.strftime("%Y-%m-%d") + if question.scheduled_resolution_time + else "Not specified" + ) + + return clean_indents( + f""" + # Your Task + + You are a question rephraser for a forecasting resolution system. Your job + is to determine whether a forecasting question's time context has already + passed and, if so, rephrase it from future tense into past tense. + + This rephrasing helps downstream research agents search more effectively, + since searching for "Did X happen?" yields better results than "Will X + happen?" when the deadline has already passed. 
+ + # The Question + + {question.question_text} + + # Additional Context + + Resolution criteria: {question.resolution_criteria} + + Fine print: {question.fine_print} + + Scheduled resolution date: {scheduled_resolution} + + Today's date (UTC): {today_string} + + # Instructions + + 1. Examine the question text, resolution criteria, fine print, and + scheduled resolution date to identify any deadlines or time-bound + conditions. + 2. Compare those deadlines to today's date. + 3. If the deadline or time context has ALREADY PASSED: + - Rephrase the question from future tense to past tense. + - Keep the meaning, scope, and specificity identical. + - Preserve all named entities, numbers, and conditions exactly. + - Use the deadline from the original question (not today's date) + in the rephrased version. + 4. If the deadline has NOT yet passed, or the question has no + time-bound element, return the question text EXACTLY as-is. + + # Examples + + Example 1 (deadline passed): + - Original: "Will a trade deal between the US and China be signed by October 2025?" + - Today: 2025-11-15 + - Rephrased: "Was a trade deal between the US and China signed by October 2025?" + + Example 2 (deadline passed): + - Original: "Will a deal be signed by the end of March 2024?" + - Today: 2024-06-01 + - Rephrased: "Was a deal signed before the end of March 2024?" + + Example 3 (deadline NOT passed): + - Original: "Will AI surpass human performance on all MMLU categories by 2030?" + - Today: 2026-02-28 + - Rephrased: "Will AI surpass human performance on all MMLU categories by 2030?" + (returned unchanged) + + Example 4 (deadline passed, complex question): + - Original: "Will the global average temperature exceed 1.5C above pre-industrial levels before January 1, 2026?" + - Today: 2026-03-01 + - Rephrased: "Did the global average temperature exceed 1.5C above pre-industrial levels before January 1, 2026?" + + # Output Format + + Return ONLY the (possibly rephrased) question text. 
Do not include any + explanation, reasoning, or additional text. Just the question. + """ + ) + + +def binary_resolver_instructions(question: BinaryQuestion) -> str: + + return clean_indents( + f""" + # Your Role + + You are a resolution analyst determining the final resolution status + of a forecasting question based on research provided to you. + + # The Question + + {question.give_question_details_as_markdown()} + + # Resolution Options + + You must determine one of the following resolutions: + + ## TRUE + - Resolution criteria have been definitively met + - The outcome is YES/positive + - There is strong evidence supporting this + + ## FALSE + - Resolution criteria have been definitively met + - The outcome is NO/negative + - There is strong evidence supporting this + + ## AMBIGUOUS + - The resolution criteria occurred + - BUT the outcome is unclear or disputed + - Multiple interpretations are reasonable + - Example: A law passed but its scope is unclear + + ## ANNULLED + - A fundamental assumption of the question is false + - The question itself is invalid or malformed + - Example: Question asks about a company that never existed + + ## NOT_YET_RESOLVABLE + - Insufficient information currently available + - OR the resolution date/event hasn't occurred yet + - OR you cannot confidently determine the resolution + - **BE CONSERVATIVE: Default to this when uncertain** + + # Analysis Guidelines + + 1. **Review the research** provided by the researcher carefully + 2. **Check the timeline**: Has the scheduled resolution date passed? + 3. **Assess the evidence**: Is it strong enough for a definitive resolution? + 4. **Consider ambiguity**: Is the outcome clear or disputed? + 5. 
**Be conservative**: If uncertain, return NOT_YET_RESOLVABLE + + # Critical Distinctions + + **AMBIGUOUS vs ANNULLED:** + - AMBIGUOUS: Question is valid, but answer is unclear + - ANNULLED: Question itself is invalid/malformed + + **FALSE vs NOT_YET_RESOLVABLE:** + - FALSE: Definitively did NOT happen + - NOT_YET_RESOLVABLE: Might still happen or unclear if it happened + + # Output Format + + Provide your analysis in the following format: + + **Resolution Status**: [Your chosen status] + + **Reasoning**: [2-4 sentences explaining your decision] + + **Key Evidence**: + - [Evidence point 1] + - [Evidence point 2] + - [Evidence point 3] + - [Evidence point 4 - optional] + - [Evidence point 5 - optional] + + # Important + + - Be thorough in your reasoning + - Cite specific information from the research + - Acknowledge uncertainties when present + - Your output will be parsed programmatically, so follow the format exactly + """ + ) + diff --git a/forecasting_tools/agents_and_tools/auto_resolver/assess.py b/forecasting_tools/agents_and_tools/auto_resolver/assess.py index c35aa983..48f22bf8 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/assess.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/assess.py @@ -27,109 +27,128 @@ class QuestionAssessmentResult: @dataclass class ResolutionAssessmentReport: """ - Table for binary assessment (y-axis is true value, x-axis is predicted value): - - | True | False | Unresolvable | Cancelled | Unmatched - ------|------|-------|--------------|-----------|------------ - True | n_tp | n_fn | n_mp | n_ct | n_um_true - False | n_fp | n_tn | n_mn | n_cf | n_um_false - Cancelled | n_tc | n_fc | n_mc | n_cc | n_um_ca + Contained variables are arrays, named according to the convention: "{predicted}{true}". 
+ Values for predicted/true are as follows: + - p: positive + - n: negative + - c: cancelled + - x: not answered """ - tp: list[int] = field(default_factory=list) - fp: list[int] = field(default_factory=list) - fn: list[int] = field(default_factory=list) - tn: list[int] = field(default_factory=list) - mp: list[int] = field(default_factory=list) - mn: list[int] = field(default_factory=list) - ct: list[int] = field(default_factory=list) # True actual, Cancelled predicted - cf: list[int] = field(default_factory=list) # False actual, Cancelled predicted - tc: list[int] = field(default_factory=list) # Cancelled actual, True predicted - fc: list[int] = field(default_factory=list) # Cancelled actual, False predicted - mc: list[int] = field(default_factory=list) # Cancelled actual, Unresolvable predicted - cc: list[int] = field(default_factory=list) # Cancelled actual, Cancelled predicted - um_true: list[int] = field(default_factory=list) # True actual, unmatched predicted (error/edge case) - um_false: list[int] = field(default_factory=list) # False actual, unmatched predicted (error/edge case) - um_ca: list[int] = field(default_factory=list) # Cancelled actual, unmatched predicted (error/edge case) + pp: list[int] = field(default_factory=list) + pn: list[int] = field(default_factory=list) + pc: list[int] = field(default_factory=list) + + np: list[int] = field(default_factory=list) + nn: list[int] = field(default_factory=list) + nc: list[int] = field(default_factory=list) + + cp: list[int] = field(default_factory=list) + cn: list[int] = field(default_factory=list) + cc: list[int] = field(default_factory=list) + + xp: list[int] = field(default_factory=list) + xn: list[int] = field(default_factory=list) + xc: list[int] = field(default_factory=list) + question_results: dict[int, QuestionAssessmentResult] = field(default_factory=dict) @property - def n_tp(self) -> int: - return len(self.tp) + def n_pp(self) -> int: + return len(self.pp) @property - def n_fp(self) -> int: - 
return len(self.fp) + def n_pn(self) -> int: + return len(self.pn) @property - def n_fn(self) -> int: - return len(self.fn) + def n_pc(self) -> int: + return len(self.pc) @property - def n_tn(self) -> int: - return len(self.tn) + def n_np(self) -> int: + return len(self.np) @property - def n_mp(self) -> int: - return len(self.mp) + def n_nn(self) -> int: + return len(self.nn) @property - def n_mn(self) -> int: - return len(self.mn) - - @property - def n_ct(self) -> int: - return len(self.ct) - - @property - def n_cf(self) -> int: - return len(self.cf) - - @property - def n_tc(self) -> int: - return len(self.tc) - + def n_nc(self) -> int: + return len(self.nc) + @property - def n_fc(self) -> int: - return len(self.fc) - + def n_cp(self) -> int: + return len(self.cp) + @property - def n_mc(self) -> int: - return len(self.mc) - + def n_cn(self) -> int: + return len(self.cn) + @property def n_cc(self) -> int: return len(self.cc) - + @property - def n_um_true(self) -> int: - return len(self.um_true) - + def n_xp(self) -> int: + return len(self.xp) + @property - def n_um_false(self) -> int: - return len(self.um_false) - + def n_xn(self) -> int: + return len(self.xn) + @property - def n_um_ca(self) -> int: - return len(self.um_ca) + def n_xc(self) -> int: + return len(self.xc) def binary_results_table(self) -> str: """ Returns a markdown table representation of the binary assessment report. - The "Unmatched" column contains cases where the resolver returned an unexpected - value (e.g., NotImplemented for unsupported question types, or other edge cases). - These are logged as warnings for debugging. + Columns represent predicted resolutions, rows represent actual resolutions. + The "Not Answered" column contains cases where the resolver returned None + or an unexpected value (logged as warnings for debugging). 
Returns: str: A markdown formatted confusion matrix table """ - return f"""\ -| Actual \\ Predicted | True | False | Unresolvable | Cancelled | Unmatched | -|--------------------|------|-------|--------------|-----------|-----------| -| True | {str(self.n_tp).rjust(3)} | {str(self.n_fn).rjust(3)} | {str(self.n_mp).rjust(3)} | {str(self.n_ct).rjust(3)} | {str(self.n_um_true).rjust(3)} | -| False | {str(self.n_fp).rjust(3)} | {str(self.n_tn).rjust(3)} | {str(self.n_mn).rjust(3)} | {str(self.n_cf).rjust(3)} | {str(self.n_um_false).rjust(3)} | -| Cancelled | {str(self.n_tc).rjust(3)} | {str(self.n_fc).rjust(3)} | {str(self.n_mc).rjust(3)} | {str(self.n_cc).rjust(3)} | {str(self.n_um_ca).rjust(3)} |""" + corner_label = "Actual \\ Predicted" + col_headers = ["Positive", "Negative", "Cancelled", "Not Answered"] + row_labels = ["Positive", "Negative", "Cancelled"] + # Rows ordered: actual Positive, actual Negative, actual Cancelled + # Columns ordered: predicted Positive, predicted Negative, predicted Cancelled, predicted Not Answered + data = [ + [str(self.n_pp), str(self.n_np), str(self.n_cp), str(self.n_xp)], + [str(self.n_pn), str(self.n_nn), str(self.n_cn), str(self.n_xn)], + [str(self.n_pc), str(self.n_nc), str(self.n_cc), str(self.n_xc)], + ] + + # Compute column widths dynamically + col_widths = [len(corner_label)] + for i, header in enumerate(col_headers): + max_data_width = max(len(data[row][i]) for row in range(len(row_labels))) + col_widths.append(max(len(header), max_data_width)) + + # Update corner label width to account for row labels + col_widths[0] = max(col_widths[0], max(len(label) for label in row_labels)) + + def fmt_row(cells: list[str]) -> str: + parts = [] + for i, cell in enumerate(cells): + parts.append(f" {cell.ljust(col_widths[i])} ") + return "|" + "|".join(parts) + "|" + + def fmt_separator() -> str: + return "|" + "|".join("-" * (w + 2) for w in col_widths) + "|" + + header_row = fmt_row([corner_label] + col_headers) + separator = 
fmt_separator() + body_rows = [ + fmt_row([row_labels[r]] + data[r]) + for r in range(len(row_labels)) + ] + + return "\n".join([header_row, separator] + body_rows) def detailed_report(self) -> str: """ @@ -148,10 +167,11 @@ def detailed_report(self) -> str: lines.append("") # Results calculation - total = self.n_tp + self.n_fp + self.n_fn + self.n_tn + self.n_mp + self.n_mn + \ - self.n_ct + self.n_cf + self.n_tc + self.n_fc + self.n_mc + self.n_cc + \ - self.n_um_true + self.n_um_false + self.n_um_ca - correct = self.n_tp + self.n_tn + self.n_cc + total = self.n_pp + self.n_pn + self.n_pc + \ + self.n_np + self.n_nn + self.n_nc + \ + self.n_cp + self.n_cn + self.n_cc + \ + self.n_xp + self.n_xn + self.n_xc + correct = self.n_pp + self.n_nn + self.n_cc accuracy = (correct / total * 100) if total > 0 else 0 lines.append(f"**Total Questions:** {total}") @@ -369,46 +389,45 @@ async def assess_resolver(self) -> ResolutionAssessmentReport: outcome_category = None match true_resolution, test_resolution: - # True actual resolution cases + # Positive actual resolution cases case True, True: - report.tp.append(question_id) + report.pp.append(question_id) outcome_category = "True Positive" case True, False: - report.fn.append(question_id) + report.np.append(question_id) outcome_category = "False Negative" case True, None: - report.mp.append(question_id) + report.xp.append(question_id) outcome_category = "Missed Positive" - # False actual resolution cases + case True, CanceledResolution(): + report.cp.append(question_id) + outcome_category = "Positive Incorrectly Predicted as Cancelled" + # Negative actual resolution cases case False, True: - report.fp.append(question_id) + report.pn.append(question_id) outcome_category = "False Positive" case False, False: - report.tn.append(question_id) + report.nn.append(question_id) outcome_category = "True Negative" case False, None: - report.mn.append(question_id) + report.xn.append(question_id) outcome_category = "Missed Negative" + 
case False, CanceledResolution(): + report.cn.append(question_id) + outcome_category = "Negative Incorrectly Predicted as Cancelled" # Cancelled actual resolution cases case CanceledResolution(), True: - report.tc.append(question_id) - outcome_category = "True Incorrectly Predicted as True" + report.pc.append(question_id) + outcome_category = "Cancelled Incorrectly Predicted as Positive" case CanceledResolution(), False: - report.fc.append(question_id) - outcome_category = "Cancelled Incorrectly Predicted as False" + report.nc.append(question_id) + outcome_category = "Cancelled Incorrectly Predicted as Negative" case CanceledResolution(), None: - report.mc.append(question_id) - outcome_category = "Cancelled Predicted as Unresolvable" + report.xc.append(question_id) + outcome_category = "Cancelled Not Answered" case CanceledResolution(), CanceledResolution(): report.cc.append(question_id) outcome_category = "Correct Cancel" - # True/False actual with Cancelled predicted - case True, CanceledResolution(): - report.ct.append(question_id) - outcome_category = "True Incorrectly Predicted as Cancelled" - case False, CanceledResolution(): - report.cf.append(question_id) - outcome_category = "False Incorrectly Predicted as Cancelled" # Catch-all for unmatched cases (edge cases, errors, NotImplemented, etc.) 
case _: logging.warning( @@ -416,13 +435,13 @@ async def assess_resolver(self) -> ResolutionAssessmentReport: f"true_resolution={true_resolution}, test_resolution={test_resolution}" ) if true_resolution == True: - report.um_true.append(question_id) - outcome_category = "Unmatched - True" + report.xp.append(question_id) + outcome_category = "Unmatched - Positive" elif true_resolution == False: - report.um_false.append(question_id) - outcome_category = "Unmatched - False" + report.xn.append(question_id) + outcome_category = "Unmatched - Negative" elif isinstance(true_resolution, CanceledResolution): - report.um_ca.append(question_id) + report.xc.append(question_id) outcome_category = "Unmatched - Cancelled" else: continue @@ -439,4 +458,4 @@ async def assess_resolver(self) -> ResolutionAssessmentReport: outcome_category=outcome_category, ) report.question_results[question_id] = question_result - return report \ No newline at end of file + return report diff --git a/forecasting_tools/agents_and_tools/auto_resolver/perplexity.py b/forecasting_tools/agents_and_tools/auto_resolver/perplexity.py deleted file mode 100644 index 81b38c9d..00000000 --- a/forecasting_tools/agents_and_tools/auto_resolver/perplexity.py +++ /dev/null @@ -1,597 +0,0 @@ -""" -Multi-agent question resolver using OpenAI Agents SDK and perplexity. - -This module implements a multi-agent architecture for resolving Metaculus -forecasting questions. It uses a pluggable researcher pattern with a default -Perplexity-based implementation. - -Architecture: - 1. Orchestrator (minimal) coordinates handoffs between agents - 2. Researcher agent performs multiple strategic searches - 3. Resolver agent analyzes research and determines resolution - 4. 
Structured output parsing converts to typed resolution -""" - -import logging -from typing import Optional, Callable - -from forecasting_tools.data_models.questions import ( - ResolutionType, - CanceledResolution, - BinaryResolution, -) -from forecasting_tools import MetaculusQuestion, BinaryQuestion -from forecasting_tools.agents_and_tools.auto_resolver import AutoResolver -from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import ( - BinaryResolutionResult, -) -from forecasting_tools.agents_and_tools.minor_tools import ( - perplexity_reasoning_pro_search, -) -from forecasting_tools.ai_models.agent_wrappers import ( - AgentRunner, - AgentSdkLlm, - AiAgent, -) -from forecasting_tools.ai_models.general_llm import GeneralLlm -from forecasting_tools.helpers.structure_output import structure_output -from forecasting_tools.util.misc import clean_indents - -logger = logging.getLogger(__name__) - - -class StructuredOutputParsingError(Exception): - """Raised when structured output parsing fails. - - This exception preserves the raw unparsed output for debugging purposes. - - Attributes: - raw_output: The unparsed output that failed parsing - original_error: The original exception that caused the failure - """ - - def __init__(self, raw_output: str, original_error: Exception): - self.raw_output = raw_output - self.original_error = original_error - super().__init__( - f"Failed to parse structured output: {original_error}\n" - f"Raw output available in exception.raw_output" - ) - - -class OpenAIResolver(AutoResolver): - """Multi-agent resolver using OpenAI Agents SDK. - - This resolver uses a multi-agent architecture to resolve Metaculus questions: - 1. A minimal orchestrator coordinates the workflow - 2. A researcher agent performs multiple Perplexity searches - 3. A resolver agent analyzes the research and determines the resolution - 4. 
Structured output parsing converts the result to a typed resolution - - The researcher agent is pluggable, allowing for custom implementations. - - Attributes: - model: LLM model identifier for the agents - researcher_agent_factory: Factory function to create researcher agents - structure_output_model: Model used for structured output parsing - timeout: Timeout for LLM calls in seconds - """ - - def __init__( - self, - model: str = "openrouter/anthropic/claude-sonnet-4", - researcher_agent_factory: Optional[Callable[[BinaryQuestion], AiAgent]] = None, - structure_output_model: Optional[GeneralLlm] = None, - timeout: int = 480, - ): - """Initialize OpenAI Resolver. - - Args: - model: LLM model for agents (default: claude-sonnet-4) - researcher_agent_factory: Optional factory function that takes a - BinaryQuestion and returns a custom researcher AiAgent. - If None, uses default Perplexity-based researcher. - structure_output_model: Model for structured output parsing. - If None, uses gpt-5.2 mini with low temperature. - timeout: Timeout for LLM calls in seconds - """ - self.model = model - self.researcher_agent_factory = ( - researcher_agent_factory or self._create_default_researcher - ) - self.structure_output_model = structure_output_model or GeneralLlm( - "openrouter/openai/gpt-5.2", - temperature=0.2, - timeout=timeout, - ) - self.timeout = timeout - self._last_resolution_metadata = None - logger.info( - f"Initialized OpenAIResolver with model={model}, timeout={timeout}s" - ) - - async def resolve_question( - self, question: MetaculusQuestion - ) -> Optional[ResolutionType]: - """Resolve a Metaculus question. 
- - Args: - question: The question to resolve - - Returns: - Typed resolution or None if not yet resolvable - - Raises: - StructuredOutputParsingError: If output parsing fails (includes raw output) - Other exceptions: Propagated from agent execution - """ - logger.info( - f"Starting resolution for question {question.id_of_post}: " - f"{question.question_text[:100]}..." - ) - - if isinstance(question, BinaryQuestion): - return await self._resolve_binary(question) - else: - logger.warning(f"Question type {type(question)} not yet supported") - return NotImplemented - - async def _resolve_binary( - self, question: BinaryQuestion - ) -> Optional[BinaryResolution]: - """Resolve a binary question using multi-agent workflow. - - Workflow: - 1. Create researcher, resolver, and orchestrator agents - 2. Run orchestrator (which coordinates handoffs) - 3. Parse final output with structured output - 4. Convert to typed resolution - 5. Store metadata (key_evidence) for later retrieval - - Args: - question: Binary question to resolve - - Returns: - BinaryResolution (True/False/AMBIGUOUS/ANNULLED) or None - - Raises: - StructuredOutputParsingError: If parsing fails - """ - logger.info(f"Creating agent workflow for binary question {question.id_of_post}") - - # Create agents - researcher = self.researcher_agent_factory(question) - resolver = self._create_resolver_agent(question) - orchestrator = self._create_orchestrator_agent(researcher, resolver) - - logger.info("Running orchestrator agent (max_turns=10)") - - # Run the workflow (non-streaming) - result = await AgentRunner.run( - orchestrator, "Please begin the resolution process.", max_turns=10 - ) - - logger.info( - f"Agent workflow completed. 
Final output length: " - f"{len(result.final_output)} chars" - ) - logger.debug(f"Final output preview: {result.final_output[:200]}...") - - # Parse structured output with error handling - try: - resolution_result = await structure_output( - result.final_output, - BinaryResolutionResult, - model=self.structure_output_model, - ) - logger.info( - f"Successfully parsed resolution: {resolution_result.resolution_status}" - ) - except Exception as e: - logger.error(f"Failed to parse structured output: {e}", exc_info=True) - raise StructuredOutputParsingError( - raw_output=result.final_output, original_error=e - ) from e - - # Store metadata for later retrieval - self._last_resolution_metadata = { - "reasoning": resolution_result.reasoning, - "key_evidence": resolution_result.key_evidence, - } - - # Convert to typed resolution - typed_resolution = self._convert_to_binary_resolution(resolution_result) - logger.info(f"Final resolution: {typed_resolution}") - - return typed_resolution - - def get_last_resolution_metadata(self) -> dict | None: - """Returns metadata from the last resolution attempt. - - Returns: - dict with 'reasoning' and 'key_evidence' keys, or None if no resolution has been attempted - """ - return self._last_resolution_metadata - - def _create_default_researcher(self, question: BinaryQuestion) -> AiAgent: - """Create default Perplexity-based researcher agent. - - This agent performs multiple strategic searches to gather - comprehensive information about the question's resolution status. 
- - Args: - question: The question to research - - Returns: - Configured researcher AiAgent - """ - logger.debug("Creating default Perplexity-based researcher agent") - - instructions = self._build_researcher_instructions(question) - - return AiAgent( - name="Resolution Researcher", - instructions=instructions, - model=AgentSdkLlm(model=self.model), - tools=[perplexity_reasoning_pro_search], - handoffs=["resolver"], - ) - - def _create_resolver_agent(self, question: BinaryQuestion) -> AiAgent: - """Create resolver agent that determines final resolution. - - This agent receives research from the researcher agent and - makes the final resolution determination. - - Args: - question: The question being resolved - - Returns: - Configured resolver AiAgent - """ - logger.debug("Creating resolver agent") - - instructions = self._build_resolver_instructions(question) - - return AiAgent( - name="resolver", - instructions=instructions, - model=AgentSdkLlm(model=self.model), - tools=[], # No tools - only analyzes research - handoffs=[], # Terminal agent - ) - - def _create_orchestrator_agent( - self, researcher: AiAgent, resolver: AiAgent - ) -> AiAgent: - """Create minimal orchestrator that enables handoffs. - - This is a simple coordinator that connects the researcher - and resolver agents. - - Args: - researcher: The researcher agent - resolver: The resolver agent - - Returns: - Minimal orchestrator AiAgent - """ - logger.debug("Creating minimal orchestrator agent") - - instructions = clean_indents( - """ - You are coordinating a question resolution process. - - Your task is simple: - 1. Hand off to the Resolution Researcher to gather information - 2. The researcher will hand off to the resolver when ready - 3. The resolver will provide the final resolution - - Begin by handing off to the researcher. 
- """ - ) - - return AiAgent( - name="Resolution Orchestrator", - instructions=instructions, - model=AgentSdkLlm(model=self.model), - tools=[], - handoffs=[researcher, resolver], - ) - - def _build_researcher_instructions(self, question: BinaryQuestion) -> str: - """Build detailed instructions for the researcher agent. - - Args: - question: The question being researched - - Returns: - Formatted instruction string - """ - logger.debug("Building researcher instructions") - - return clean_indents( - f""" - # Your Role - - You are a research assistant gathering information to resolve a forecasting question. - - # The Question - - {question.give_question_details_as_markdown()} - - # Your Task - - Perform multiple strategic Perplexity searches to thoroughly investigate: - - 1. **Current Status**: What is the current state of affairs related to this question? - 2. **Resolution Criteria**: Have the resolution criteria been met? - 3. **Timeline Check**: Consider the scheduled resolution date and current date - 4. **Verification**: Cross-check information from multiple sources - 5. **Edge Cases**: Look for any ambiguities, disputes, or complications - 6. **Validity Check**: Investigate whether the question's subject is valid/possible (for potential annulment) - - # Search Strategy Guidelines - - - Run 3-5 searches total (don't overdo it) - - Run searches in parallel when they're independent - - Use follow-up searches based on initial findings - - Focus on authoritative and recent sources - - Note any contradictions or uncertainties you find - - Pay special attention to dates and timelines - - # Decomposition Strategy for Multi-Entity Questions - - If a question involves multiple entities (companies, people, organizations, etc.): - - 1. **First attempt**: Search for comprehensive data about all entities - 2. **If comprehensive search fails**: Decompose the question and search each entity individually - 3. 
**Example**: For "Did all Magnificent Seven stocks decline 50%?", search: - - "Microsoft MSFT all-time high 2025" - - "Nvidia NVDA 2025 stock performance" - - "Apple AAPL 2025 pricing" - - And so on for each company - 4. **Then aggregate**: Combine individual findings to answer the comprehensive question - - # Detecting Annulled/Invalid Questions - - Some questions may be fundamentally invalid (annulled). Look for: - - - Studies/experiments that were never conducted, cancelled, or abandoned - - Questions about entities that never existed or were fundamentally misconceived - - Research projects that lost funding, had impossible criteria, or were invalid from the start - - Any indication that the question's subject is impossible or doesn't exist - - **When you cannot find evidence of an event occurring:** - - Search specifically for: "[subject] cancelled", "[subject] abandoned", "[subject] never conducted" - - Search for: "[subject] funding withdrawn", "[subject] fundamental problems", "[subject] invalid" - - If you find evidence the subject was never valid/possible, note this for potential ANNULLED resolution - - # Example Search Sequence - - 1. Broad search: "Current status of [topic] as of [current date]" - 2. Specific search: "Has [specific criterion] occurred?" - 3. Verification: "Latest news about [topic]" - 4. (Optional) Follow-up based on findings - 5. 
(If no results found) Validity check: "[topic] cancelled", "[topic] validity", "[topic] problems" - - # Important Reminders - - - Be thorough but efficient - - Document your findings clearly - - Note the sources and dates of information - - If you find conflicting information, document both sides - - Decompose multi-entity questions when comprehensive searches fail - - Actively search for evidence of annulment/invalidity when no results are found - - When ready, hand off your research to the resolver - - # Handoff - - When you've gathered sufficient information, hand off to the resolver - with a comprehensive summary of your research findings. - """ - ) - - def _build_resolver_instructions(self, question: BinaryQuestion) -> str: - """Build detailed instructions for the resolver agent. - - Args: - question: The question being resolved - - Returns: - Formatted instruction string - """ - logger.debug("Building resolver instructions") - - return clean_indents( - f""" - # Your Role - - You are a resolution analyst determining the final resolution status - of a forecasting question based on research provided to you. 
- - # The Question - - {question.give_question_details_as_markdown()} - - # Resolution Options - - You must determine one of the following resolutions: - - ## TRUE - - Resolution criteria have been definitively met - - The outcome is YES/positive - - There is strong evidence supporting this - - ## FALSE - - Resolution criteria have been definitively met - - The outcome is NO/negative - - There is strong evidence supporting this - - ## AMBIGUOUS - - The resolution criteria occurred - - BUT the outcome is unclear or disputed - - Multiple interpretations are reasonable - - Example: A law passed but its scope is unclear - - ## ANNULLED - - A fundamental assumption of the question is false - - The question itself is invalid or malformed - - Example: Question asks about a company that never existed - - ## NOT_YET_RESOLVABLE - - Insufficient information currently available - - OR the resolution date/event hasn't occurred yet - - OR you cannot confidently determine the resolution - - **BE CONSERVATIVE: Default to this when uncertain** - - # Analysis Guidelines - - 1. **Review the research** provided by the researcher carefully - 2. **Check the timeline**: Has the scheduled resolution date passed? - 3. **Assess the evidence**: Is it strong enough for a definitive resolution? - 4. **Consider ambiguity**: Is the outcome clear or disputed? - 5. 
**Be conservative**: If uncertain, return NOT_YET_RESOLVABLE - - # Critical Distinctions - - **AMBIGUOUS vs ANNULLED:** - - AMBIGUOUS: Question is valid, but answer is unclear - - ANNULLED: Question itself is invalid/malformed - - **FALSE vs NOT_YET_RESOLVABLE:** - - FALSE: Definitively did NOT happen - - NOT_YET_RESOLVABLE: Might still happen or unclear if it happened - - # Output Format - - Provide your analysis in the following format: - - **Resolution Status**: [Your chosen status] - - **Reasoning**: [2-4 sentences explaining your decision] - - **Key Evidence**: - - [Evidence point 1] - - [Evidence point 2] - - [Evidence point 3] - - [Evidence point 4 - optional] - - [Evidence point 5 - optional] - - # Important - - - Be thorough in your reasoning - - Cite specific information from the research - - Acknowledge uncertainties when present - - Your output will be parsed programmatically, so follow the format exactly - """ - ) - - def _convert_to_binary_resolution( - self, result: BinaryResolutionResult - ) -> Optional[BinaryResolution]: - """Convert structured result to typed binary resolution. - - Args: - result: Parsed resolution result - - Returns: - Typed BinaryResolution or None - - Raises: - ValueError: If resolution status is unexpected - """ - logger.debug(f"Converting result status: {result.resolution_status}") - - match result.resolution_status: - case "TRUE": - return True - case "FALSE": - return False - case "AMBIGUOUS": - return CanceledResolution.AMBIGUOUS - case "ANNULLED": - return CanceledResolution.ANNULLED - case "NOT_YET_RESOLVABLE": - return None - case _: - raise ValueError( - f"Unexpected resolution status: {result.resolution_status}" - ) - - -async def main(): - """Test harness for OpenAI Resolver. - - Runs the resolver against 10 random binary questions from the AIB Fall 2025 tournament - and outputs a confusion matrix comparing predicted vs actual resolutions. 
- """ - from forecasting_tools.agents_and_tools.auto_resolver.assess import ( - ResolutionAssesser, - ) - from forecasting_tools import MetaculusClient, ApiFilter - from dotenv import load_dotenv - import random - - load_dotenv() - - logger.info("Starting OpenAI Resolver assessment") - - # Fetch all resolved binary questions from AIB Fall 2025 - client = MetaculusClient() - filter = ApiFilter( - allowed_tournaments=[MetaculusClient.AIB_FALL_2025_ID], - allowed_statuses=["resolved"], - allowed_types=["binary"], - group_question_mode="exclude", - order_by="-published_time" - ) - - logger.info("Fetching resolved binary questions from AIB Fall 2025...") - all_questions = await client.get_questions_matching_filter(filter) - - # Randomly sample 10 questions - sample_size = min(20, len(all_questions)) - sampled_questions = random.sample(all_questions, sample_size) - question_ids = [q.id_of_post for q in sampled_questions if q.id_of_post is not None] - - logger.info(f"Selected {len(question_ids)} random questions for assessment") - - # Create resolver - resolver = OpenAIResolver() - - # Create assessor with specific question IDs - assesser = ResolutionAssesser( - resolver, allowed_types=["binary"], questions=question_ids - ) - - logger.info(f"Running assessment on {len(question_ids)} questions") - - # Run assessment - report = await assesser.assess_resolver() - - # Print results - print("\n" + "=" * 60) - print("OpenAI Resolver Assessment Results") - print(f"Tested on {len(question_ids)} random questions from AIB Fall 2025") - print("=" * 60) - print(report) - print("=" * 60) - - # Save detailed report to reports directory - try: - report_path = report.write_to_file(directory="reports") - print(f"\nDetailed report saved to: {report_path}") - except Exception as e: - logger.error(f"Failed to save report to reports directory: {e}", exc_info=True) - print(f"\nWarning: Could not save report to reports directory: {e}") - - logger.info("Assessment complete") - - -if __name__ == 
"__main__": - import asyncio - from dotenv import load_dotenv - - load_dotenv() - asyncio.run(main()) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py index cf85dbef..2a00f70d 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py @@ -3,9 +3,10 @@ This module defines the data models used to parse the output from the resolver agent into typed resolution decisions. """ +from forecasting_tools.data_models.questions import BinaryResolution, CanceledResolution from pydantic import BaseModel, Field -from typing import Literal +from typing import Literal, Optional class BinaryResolutionResult(BaseModel): @@ -34,3 +35,34 @@ class BinaryResolutionResult(BaseModel): min_length=3, max_length=5, ) + + + def convert_to_binary_resolution(self) -> Optional[BinaryResolution]: + """Convert structured result to typed binary resolution. 
+ + Args: + result: Parsed resolution result + + Returns: + Typed BinaryResolution or None + + Raises: + ValueError: If resolution status is unexpected + """ + + match self.resolution_status: + case "TRUE": + return True + case "FALSE": + return False + case "AMBIGUOUS": + return CanceledResolution.AMBIGUOUS + case "ANNULLED": + return CanceledResolution.ANNULLED + case "NOT_YET_RESOLVABLE": + return None + case _: + raise ValueError( + f"Unexpected resolution status: {result.resolution_status}" + ) + From c5605b16650b5a8f677d68a3b50eb2c772bc2741 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sun, 1 Mar 2026 15:24:00 -0700 Subject: [PATCH 08/14] Added a tui --- .../auto_resolver/agentic/__init__.py | 102 ++++- .../auto_resolver/tui/__init__.py | 6 + .../auto_resolver/tui/__main__.py | 76 ++++ .../agents_and_tools/auto_resolver/tui/app.py | 414 ++++++++++++++++++ .../auto_resolver/tui/models.py | 78 ++++ .../auto_resolver/tui/report.py | 231 ++++++++++ .../auto_resolver/tui/resolver_worker.py | 128 ++++++ .../auto_resolver/tui/widgets/__init__.py | 1 + .../auto_resolver/tui/widgets/feed_panel.py | 188 ++++++++ .../auto_resolver/tui/widgets/home_panel.py | 159 +++++++ .../auto_resolver/tui/widgets/input_modal.py | 266 +++++++++++ .../auto_resolver/tui/widgets/sidebar.py | 134 ++++++ poetry.lock | 140 +++++- pyproject.toml | 1 + 14 files changed, 1920 insertions(+), 4 deletions(-) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/app.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/models.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/report.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/resolver_worker.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/__init__.py 
create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/input_modal.py create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/sidebar.py diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py index dbf9be40..d1e308f8 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py @@ -15,7 +15,9 @@ """ import logging -from typing import Optional, Callable +from typing import AsyncGenerator, Optional, Callable + +from openai.types.responses import ResponseTextDeltaEvent from forecasting_tools.data_models.questions import ( ResolutionType, @@ -33,6 +35,7 @@ AgentRunner, AgentSdkLlm, AiAgent, + event_to_tool_message, ) from forecasting_tools.ai_models.general_llm import GeneralLlm from forecasting_tools.helpers.structure_output import structure_output @@ -222,3 +225,100 @@ def _create_orchestrator_agent( tools=[], handoffs=[researcher, resolver], ) + + async def resolve_question_streamed( + self, question: MetaculusQuestion + ) -> AsyncGenerator[tuple[str, str], None]: + """Resolve a question with streaming, yielding (event_type, message) tuples. + + This is the streaming counterpart to resolve_question(). It yields + intermediate events as they occur during the agent resolution process, + allowing a TUI or other consumer to display live progress. + + Event types: + "status" -- lifecycle status updates (rephrasing, agent creation, etc.) 
+ "text" -- streamed text delta from the agent's response + "tool" -- tool call / handoff / reasoning events + "result" -- final resolution result (yielded once at the end) + "error" -- an error occurred during resolution + + After all events are yielded, the resolver's internal metadata is updated + (same as resolve_question), so get_last_resolution_metadata() will work. + + Args: + question: The Metaculus question to resolve. + + Yields: + Tuples of (event_type, message_text). + """ + if not isinstance(question, BinaryQuestion): + yield ("error", f"Unsupported question type: {type(question).__name__}") + return + + # Step 1: Rephrase if needed + yield ("status", "Checking if question needs rephrasing...") + question = await self._rephrase_question_if_needed(question) + yield ("status", f"Question text: {question.question_text}") + + # Step 2: Create agents + yield ("status", "Creating resolution agents...") + researcher = self._create_researcher(question) + resolver = self._create_resolver_agent(question) + orchestrator = self._create_orchestrator_agent(researcher, resolver) + + # Step 3: Run streamed workflow + yield ("status", "Starting resolution process...") + streamed_text = "" + + result = AgentRunner.run_streamed( + orchestrator, "Please begin the resolution process.", max_turns=10 + ) + + async for event in result.stream_events(): + # Capture text deltas + if event.type == "raw_response_event" and isinstance( + event.data, ResponseTextDeltaEvent + ): + streamed_text += event.data.delta + yield ("text", event.data.delta) + + # Capture tool/handoff/reasoning events + tool_msg = event_to_tool_message(event) + if tool_msg: + yield ("tool", tool_msg) + + # Step 4: Parse structured output + yield ("status", "Parsing resolution output...") + final_output = result.final_output + + try: + resolution_result = await structure_output( + final_output, + BinaryResolutionResult, + model=self.model_for_output_structure, + ) + logger.info( + f"Successfully parsed 
resolution: {resolution_result.resolution_status}" + ) + except Exception as e: + logger.error(f"Failed to parse structured output: {e}", exc_info=True) + yield ("error", f"Failed to parse structured output: {e}") + self._last_resolution_metadata = None + return + + # Step 5: Store metadata + self._last_resolution_metadata = { + "reasoning": resolution_result.reasoning, + "key_evidence": resolution_result.key_evidence, + } + + typed_resolution = resolution_result.convert_to_binary_resolution() + logger.info(f"Final resolution: {typed_resolution}") + + yield ( + "result", + f"Resolution: {resolution_result.resolution_status}\n" + f"Reasoning: {resolution_result.reasoning}\n" + f"Key Evidence:\n" + + "\n".join(f" - {e}" for e in resolution_result.key_evidence), + ) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py new file mode 100644 index 00000000..41e88b56 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py @@ -0,0 +1,6 @@ +""" +TUI (Terminal User Interface) for the Agentic Auto Resolver. + +Run with: + python -m forecasting_tools.agents_and_tools.auto_resolver.tui +""" diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py new file mode 100644 index 00000000..811109f6 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py @@ -0,0 +1,76 @@ +"""Entry point for the Auto Resolver TUI. 
+ +Usage: + python -m forecasting_tools.agents_and_tools.auto_resolver.tui + python -m forecasting_tools.agents_and_tools.auto_resolver.tui --tournament 32813 + python -m forecasting_tools.agents_and_tools.auto_resolver.tui --tournament fall-aib-2025 + python -m forecasting_tools.agents_and_tools.auto_resolver.tui --question 12345 --question 67890 + python -m forecasting_tools.agents_and_tools.auto_resolver.tui --concurrency 5 +""" + +import argparse +import logging + +from dotenv import load_dotenv + +load_dotenv() + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Auto Resolver TUI -- interactive agentic question resolution", + ) + def _parse_tournament_id(value: str) -> int | str: + """Parse a tournament ID as int if numeric, otherwise keep as string slug.""" + try: + return int(value) + except ValueError: + return value + + parser.add_argument( + "--tournament", + type=_parse_tournament_id, + action="append", + default=[], + help="Tournament ID or slug to load on startup (can be repeated)", + ) + parser.add_argument( + "--question", + type=int, + action="append", + default=[], + help="Question post ID to load on startup (can be repeated)", + ) + parser.add_argument( + "--concurrency", + type=int, + default=3, + help="Max concurrent resolutions (default: 3)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging to stderr", + ) + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + # Suppress noisy loggers but keep warnings+ + logging.basicConfig(level=logging.WARNING) + + from forecasting_tools.agents_and_tools.auto_resolver.tui.app import ( + AutoResolverApp, + ) + + app = AutoResolverApp( + max_concurrency=args.concurrency, + initial_tournaments=args.tournament, + initial_questions=args.question, + ) + app.run() + + +if __name__ == "__main__": + main() diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py 
b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py new file mode 100644 index 00000000..20ee8b1c --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py @@ -0,0 +1,414 @@ +"""Main Textual application for the Auto Resolver TUI.""" + +from __future__ import annotations + +import asyncio +import logging +import random +from datetime import datetime +from pathlib import Path +from typing import Optional, cast + +from textual.app import App, ComposeResult +from textual.binding import Binding +from textual.containers import Horizontal +from textual.widgets import Button, Footer, Header + +from forecasting_tools import MetaculusClient, ApiFilter +from forecasting_tools.data_models.questions import MetaculusQuestion, QuestionBasicType +from forecasting_tools.helpers.metaculus_client import QuestionStateAsString +from forecasting_tools.agents_and_tools.auto_resolver.agentic import AgenticResolver +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import ( + HOME_SENTINEL_ID, + QuestionItem, + QuestionSelected, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.resolver_worker import ( + FeedEvent, + ResolutionComplete, + run_resolution, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.sidebar import ( + Sidebar, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.feed_panel import ( + FeedPanel, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.home_panel import ( + HomePanel, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.report import ( + generate_markdown_report, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.input_modal import ( + AddIdRequested, + InputModal, +) + +logger = logging.getLogger(__name__) + + +class AutoResolverApp(App): + """TUI application for interactive agentic question resolution. 
+ + Layout: + [Sidebar (question list)] | [Home Panel / Feed Panel] + + Keybindings: + a -- Add a question by post ID + t -- Add questions from a tournament + r -- Re-run resolution on the selected question + e -- Export report to markdown + q -- Quit + """ + + TITLE = "Auto Resolver TUI" + CSS = """ + Screen { + layout: horizontal; + } + #main-area { + width: 1fr; + height: 1fr; + } + """ + + BINDINGS = [ + Binding("a", "add_question", "Add Question", show=True), + Binding("t", "add_tournament", "Add Tournament", show=True), + Binding("r", "rerun", "Re-run Selected", show=True), + Binding("e", "export_report", "Export Report", show=True), + Binding("q", "quit", "Quit", show=True), + ] + + def __init__( + self, + max_concurrency: int = 3, + initial_tournaments: list[int | str] | None = None, + initial_questions: list[int] | None = None, + ) -> None: + super().__init__() + self._resolver = AgenticResolver() + self._client = MetaculusClient() + self._items: dict[int, QuestionItem] = {} + self._selected_post_id: Optional[int] = HOME_SENTINEL_ID + self._concurrency_sem = asyncio.Semaphore(max_concurrency) + self._initial_tournaments = initial_tournaments or [] + self._initial_questions = initial_questions or [] + + # ------------------------------------------------------------------ + # Layout + # ------------------------------------------------------------------ + + def compose(self) -> ComposeResult: + yield Header() + with Horizontal(id="main-area"): + yield Sidebar(id="sidebar") + yield HomePanel(id="home-panel") + yield FeedPanel(id="feed-panel") + yield Footer() + + @property + def sidebar(self) -> Sidebar: + return self.query_one("#sidebar", Sidebar) + + @property + def feed_panel(self) -> FeedPanel: + return self.query_one("#feed-panel", FeedPanel) + + @property + def home_panel(self) -> HomePanel: + return self.query_one("#home-panel", HomePanel) + + # ------------------------------------------------------------------ + # Lifecycle + # 
------------------------------------------------------------------ + + async def on_mount(self) -> None: + """Load any questions / tournaments passed via CLI.""" + # Start with home panel visible, feed panel hidden + self.feed_panel.display = False + self.home_panel.display = True + + for tid in self._initial_tournaments: + self._schedule_load_tournament(tid) + for qid in self._initial_questions: + self._schedule_load_question(qid) + + # ------------------------------------------------------------------ + # View switching + # ------------------------------------------------------------------ + + def _show_home(self) -> None: + """Switch to the home overview panel.""" + self.home_panel.display = True + self.feed_panel.display = False + self.home_panel.refresh_table(self._items) + + def _show_feed(self, item: QuestionItem | None) -> None: + """Switch to the feed panel for a specific question.""" + self.home_panel.display = False + self.feed_panel.display = True + self.feed_panel.show_question(item) + + # ------------------------------------------------------------------ + # Actions (keybindings) + # ------------------------------------------------------------------ + + def action_add_question(self) -> None: + self.push_screen(InputModal(default_type="question")) + + def action_add_tournament(self) -> None: + self.push_screen(InputModal(default_type="tournament")) + + def action_rerun(self) -> None: + pid = self.sidebar.get_selected_post_id() + if pid and pid in self._items: + item = self._items[pid] + if item.status not in ("running",): + self._start_resolution(item) + + def action_export_report(self) -> None: + """Export the current results to a markdown file.""" + if not self._items: + self.notify("No questions to export.", severity="warning") + return + + report = self._generate_markdown_report() + + reports_dir = Path("reports") + reports_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filepath = reports_dir / 
f"tui_report_{timestamp}.md" + + filepath.write_text(report, encoding="utf-8") + self.notify(f"Report exported to {filepath}") + + # ------------------------------------------------------------------ + # Message handlers + # ------------------------------------------------------------------ + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle sidebar button clicks.""" + if event.button.id == "btn-add-tournament": + self.action_add_tournament() + elif event.button.id == "btn-add-question": + self.action_add_question() + + def on_add_id_requested(self, message: AddIdRequested) -> None: + if message.id_type == "tournament": + self._schedule_load_tournament( + message.id_value, + allowed_types=message.allowed_types, + max_questions=message.max_questions, + exclude_unresolved=message.exclude_unresolved, + ) + else: + self._schedule_load_question(int(message.id_value)) + + def on_question_selected(self, message: QuestionSelected) -> None: + self._selected_post_id = message.post_id + + if message.post_id == HOME_SENTINEL_ID: + self._show_home() + else: + item = self._items.get(message.post_id) + self._show_feed(item) + + def on_feed_event(self, message: FeedEvent) -> None: + """Route a live feed event to the feed panel if it's for the selected question.""" + if message.post_id == self._selected_post_id: + self.feed_panel.feed_log.append_event(message.event_type, message.text) + # Also update the sidebar entry + item = self._items.get(message.post_id) + if item: + self._refresh_sidebar_entry(item) + + def on_resolution_complete(self, message: ResolutionComplete) -> None: + """Handle completion: refresh sidebar + header if selected, refresh home table.""" + item = self._items.get(message.post_id) + if item: + self._refresh_sidebar_entry(item) + # Flush any remaining text in the feed log + if message.post_id == self._selected_post_id: + self.feed_panel.feed_log.flush_text() + self.feed_panel.header.render_item(item) + + # Always refresh home table so 
it stays up to date + if self.home_panel.display: + self.home_panel.refresh_table(self._items) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _schedule_load_question(self, post_id: int) -> None: + """Kick off an async worker to load a single question from Metaculus.""" + self.run_worker(self._load_question(post_id), exclusive=False) + + def _schedule_load_tournament( + self, + tournament_id: int | str, + allowed_types: list[QuestionBasicType] | None = None, + max_questions: int | None = None, + exclude_unresolved: bool = False, + ) -> None: + """Kick off an async worker to load all questions from a tournament.""" + self.run_worker( + self._load_tournament( + tournament_id, + allowed_types=allowed_types, + max_questions=max_questions, + exclude_unresolved=exclude_unresolved, + ), + exclusive=False, + ) + + async def _load_question(self, post_id: int) -> None: + """Fetch a single question by post ID and add it to the list.""" + if post_id in self._items: + self.notify(f"Question {post_id} already loaded.", severity="warning") + return + + self.notify(f"Loading question {post_id}...") + try: + question = await asyncio.to_thread( + self._client.get_question_by_post_id, post_id + ) + except Exception as e: + self.notify(f"Failed to load question {post_id}: {e}", severity="error") + return + + if isinstance(question, list): + self.notify( + f"Question {post_id} is a group question (not supported). Skipping.", + severity="warning", + ) + return + + self._add_question(question) + + async def _load_tournament( + self, + tournament_id: int | str, + allowed_types: list[QuestionBasicType] | None = None, + max_questions: int | None = None, + exclude_unresolved: bool = False, + ) -> None: + """Fetch questions from a tournament and add them. 
+ + Pages are fetched one at a time on a background thread so the + event loop stays free and the UI remains responsive. When no + random sampling is needed (``max_questions is None``), questions + are added to the sidebar as each page arrives. When random + sampling *is* requested, all pages are collected first, then a + random subset is sampled and added. + """ + self.notify(f"Loading tournament {tournament_id}...") + try: + allowed_statuses: list[QuestionStateAsString] | None = None + if exclude_unresolved: + allowed_statuses = cast( + list[QuestionStateAsString], ["resolved"] + ) + + api_filter = ApiFilter( + allowed_tournaments=[tournament_id], + allowed_types=allowed_types or ["binary"], + allowed_statuses=allowed_statuses, + group_question_mode="exclude", + order_by="-published_time", + ) + + # Fetch page-by-page on a thread so we don't block the + # event loop (the underlying client uses time.sleep + + # requests.get which are synchronous). + page_size = self._client.MAX_QUESTIONS_FROM_QUESTION_API_PER_REQUEST + all_questions: list[MetaculusQuestion] = [] + added = 0 + page_num = 0 + more_available = True + needs_sampling = max_questions is not None + + while more_available: + offset = page_num * page_size + new_questions, continue_searching = await asyncio.to_thread( + self._client._grab_filtered_questions_with_offset, + api_filter, + offset, + ) + + if needs_sampling: + # Collect for later random sampling + all_questions.extend(new_questions) + else: + # Add to sidebar immediately — no sampling needed + for q in new_questions: + if q.id_of_post and q.id_of_post not in self._items: + self._add_question(q) + added += 1 + + if not continue_searching: + more_available = False + page_num += 1 + + # Apply random sampling if max_questions was requested + if needs_sampling: + if max_questions is not None and len(all_questions) > max_questions: + all_questions = random.sample(all_questions, max_questions) + for q in all_questions: + if q.id_of_post and 
q.id_of_post not in self._items: + self._add_question(q) + added += 1 + + except Exception as e: + self.notify( + f"Failed to load tournament {tournament_id}: {e}", severity="error" + ) + return + + self.notify(f"Loaded {added} questions from tournament {tournament_id}.") + + # Refresh home table after batch load + if self.home_panel.display: + self.home_panel.refresh_table(self._items) + + def _add_question(self, question: MetaculusQuestion) -> None: + """Register a question and add it to the sidebar, then start resolution.""" + post_id = question.id_of_post + if post_id is None: + return + item = QuestionItem(question=question) + self._items[post_id] = item + + display = f"{item.status_icon} [{post_id}] {item.title}" + self.sidebar.add_question_entry(post_id, display) + + # Refresh home table if visible + if self.home_panel.display: + self.home_panel.refresh_table(self._items) + + # Start resolution immediately + self._start_resolution(item) + + def _start_resolution(self, item: QuestionItem) -> None: + """Launch a bounded async worker for resolution.""" + + async def _bounded_resolve() -> None: + async with self._concurrency_sem: + await run_resolution(item, self._resolver, self.sidebar) + + self.run_worker(_bounded_resolve(), exclusive=False) + + def _refresh_sidebar_entry(self, item: QuestionItem) -> None: + """Update the sidebar text for a question.""" + display = f"{item.status_icon} [{item.post_id}] {item.title}" + self.sidebar.update_question_entry(item.post_id, display) + + # ------------------------------------------------------------------ + # Markdown report generation + # ------------------------------------------------------------------ + + def _generate_markdown_report(self) -> str: + """Build a markdown report matching the assess.py format.""" + return generate_markdown_report(self._items) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/models.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/models.py new file mode 100644 
index 00000000..1ce9b30f --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/models.py @@ -0,0 +1,78 @@ +"""Data models for TUI question state tracking.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional + +from textual.message import Message + +from forecasting_tools.data_models.questions import MetaculusQuestion, ResolutionType + + +# Sentinel post_id used by the "Home" sidebar entry. +HOME_SENTINEL_ID: int = -1 + + +class QuestionSelected(Message): + """Posted when the user selects a question in the sidebar list or home table.""" + + def __init__(self, post_id: int) -> None: + super().__init__() + self.post_id = post_id + + +QuestionStatus = Literal["pending", "running", "completed", "error"] + + +@dataclass +class QuestionItem: + """Tracks the state of a single question through the resolution pipeline. + + Attributes: + question: The Metaculus question being resolved. + status: Current resolution status. + resolution: The typed resolution result, or None if not yet resolved. + resolution_status_str: Raw status string from the resolver + (e.g. "TRUE", "FALSE", "NOT_YET_RESOLVABLE"). + resolution_metadata: Reasoning and key evidence from the resolver. + feed_lines: Accumulated agent feed messages for the live log. + error_message: Error details if status is "error". + """ + + question: MetaculusQuestion + status: QuestionStatus = "pending" + resolution: Optional[ResolutionType] = None + resolution_status_str: Optional[str] = None + resolution_metadata: Optional[dict] = None + feed_lines: list[str] = field(default_factory=list) + error_message: Optional[str] = None + cost: float = 0.0 + + @property + def post_id(self) -> int: + return self.question.id_of_post # type: ignore[return-value] + + @property + def title(self) -> str: + text = self.question.question_text or "Untitled" + return text[:80] + ("..." 
if len(text) > 80 else "") + + @property + def status_icon(self) -> str: + return { + "pending": " -- ", + "running": " >> ", + "completed": " OK ", + "error": " !! ", + }[self.status] + + @property + def resolution_display(self) -> str: + if self.status == "error": + return f"Error: {self.error_message or 'unknown'}" + if self.resolution_status_str is not None: + return self.resolution_status_str + if self.status == "running": + return "Resolving..." + return "Pending" diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py new file mode 100644 index 00000000..cdfc3f93 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py @@ -0,0 +1,231 @@ +"""Shared report / confusion-matrix logic for the TUI. + +Builds a ``ResolutionAssessmentReport`` from a dict of ``QuestionItem`` +objects so both the home panel and the markdown export can reuse the same +data and formatting that ``assess.py`` already provides. +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Optional + +from forecasting_tools.agents_and_tools.auto_resolver.assess import ( + QuestionAssessmentResult, + ResolutionAssessmentReport, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import QuestionItem +from forecasting_tools.data_models.questions import ( + BinaryResolution, + CanceledResolution, + ResolutionType, +) + + +# ------------------------------------------------------------------ +# Mapping resolver status strings to typed resolutions +# ------------------------------------------------------------------ + +def _status_str_to_resolution(status: str | None) -> Optional[ResolutionType]: + """Convert a resolution_status_str (e.g. 
'TRUE') to a typed resolution.""" + if status is None: + return None + s = status.strip().upper() + if s == "TRUE": + return True + elif s == "FALSE": + return False + elif s == "AMBIGUOUS": + return CanceledResolution.AMBIGUOUS + elif s == "ANNULLED": + return CanceledResolution.ANNULLED + elif s == "NOT_YET_RESOLVABLE": + return None + else: + return None + + +# ------------------------------------------------------------------ +# Build a ResolutionAssessmentReport from TUI items +# ------------------------------------------------------------------ + +def build_report_from_items( + items: dict[int, QuestionItem], +) -> ResolutionAssessmentReport: + """Populate a ``ResolutionAssessmentReport`` from the TUI question items. + + Only completed items whose questions have a ground-truth resolution are + included in the confusion matrix. Items that are still pending / running / + errored are silently skipped. + """ + report = ResolutionAssessmentReport() + + for post_id, item in items.items(): + if item.status != "completed": + continue + + true_resolution = item.question.typed_resolution + predicted_resolution = _status_str_to_resolution(item.resolution_status_str) + + key_evidence: list[str] | None = None + if item.resolution_metadata: + key_evidence = item.resolution_metadata.get("key_evidence") + + outcome_category: str | None = None + + # Classify into the confusion matrix — mirrors assess.py logic + if isinstance(true_resolution, bool): + match true_resolution, predicted_resolution: + # Positive actual + case True, True: + report.pp.append(post_id) + outcome_category = "True Positive" + case True, False: + report.np.append(post_id) + outcome_category = "False Negative" + case True, None: + report.xp.append(post_id) + outcome_category = "Missed Positive" + case True, CanceledResolution(): + report.cp.append(post_id) + outcome_category = "Positive Incorrectly Predicted as Cancelled" + # Negative actual + case False, True: + report.pn.append(post_id) + 
outcome_category = "False Positive" + case False, False: + report.nn.append(post_id) + outcome_category = "True Negative" + case False, None: + report.xn.append(post_id) + outcome_category = "Missed Negative" + case False, CanceledResolution(): + report.cn.append(post_id) + outcome_category = "Negative Incorrectly Predicted as Cancelled" + case _: + if true_resolution is True: + report.xp.append(post_id) + outcome_category = "Unmatched - Positive" + else: + report.xn.append(post_id) + outcome_category = "Unmatched - Negative" + + elif isinstance(true_resolution, CanceledResolution): + match predicted_resolution: + case True: + report.pc.append(post_id) + outcome_category = "Cancelled Incorrectly Predicted as Positive" + case False: + report.nc.append(post_id) + outcome_category = "Cancelled Incorrectly Predicted as Negative" + case CanceledResolution(): + report.cc.append(post_id) + outcome_category = "Correct Cancel" + case None: + report.xc.append(post_id) + outcome_category = "Cancelled Not Answered" + case _: + report.xc.append(post_id) + outcome_category = "Unmatched - Cancelled" + else: + # No ground truth (unresolved question) — skip from matrix + continue + + question_result = QuestionAssessmentResult( + question_id=post_id, + question_title=(item.question.question_text or "No title")[:100], + question_text=item.question.question_text or "No text available", + question_url=item.question.page_url or f"https://metaculus.com/{post_id}", + actual_resolution=true_resolution, + predicted_resolution=predicted_resolution, + key_evidence=key_evidence, + outcome_category=outcome_category, + ) + report.question_results[post_id] = question_result + + return report + + +# ------------------------------------------------------------------ +# Full markdown report (mirrors assess.py detailed_report + cost) +# ------------------------------------------------------------------ + +def generate_markdown_report(items: dict[int, QuestionItem]) -> str: + """Generate a markdown 
report matching the ``assess.py`` format, with cost info.""" + report = build_report_from_items(items) + + total_cost = sum(item.cost for item in items.values()) + + lines: list[str] = [] + lines.append("# Auto Resolver Assessment Report\n") + lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + + # Summary section + lines.append("## Summary\n") + lines.append(report.binary_results_table()) + lines.append("") + + # Totals + total = ( + report.n_pp + report.n_pn + report.n_pc + + report.n_np + report.n_nn + report.n_nc + + report.n_cp + report.n_cn + report.n_cc + + report.n_xp + report.n_xn + report.n_xc + ) + correct = report.n_pp + report.n_nn + report.n_cc + accuracy = (correct / total * 100) if total > 0 else 0 + + lines.append(f"**Total Questions:** {total}") + lines.append(f"**Correct Predictions:** {correct} ({accuracy:.1f}%)") + lines.append(f"**Total Cost:** ${total_cost:.4f}") + lines.append("") + + # Detailed results section + lines.append("## Detailed Results\n") + lines.append("") + + # Sort by category then question ID (same as assess.py) + sorted_results = sorted( + report.question_results.values(), + key=lambda x: (x.outcome_category or "", x.question_id), + ) + + for result in sorted_results: + lines.append(f"### Question {result.question_id}\n") + lines.append(f"**Title:** {result.question_title}") + lines.append(f"**URL:** {result.question_url}") + lines.append("") + + # Question contents + lines.append("**Question Contents:**\n") + lines.append("> " + result.question_text.replace("\n", "\n> ")) + lines.append("") + + # Resolution comparison table + lines.append("| Output Resolution | Correct Resolution |") + lines.append("|-------------------|--------------------|") + actual_str = ResolutionAssessmentReport._resolution_to_str( + result.actual_resolution + ) + predicted_str = ResolutionAssessmentReport._resolution_to_str( + result.predicted_resolution + ) + lines.append(f"| {predicted_str} | {actual_str} |") + 
lines.append("") + + # Cost per question + item = items.get(result.question_id) + if item and item.cost > 0: + lines.append(f"**Cost:** ${item.cost:.4f}") + lines.append("") + + # Key evidence + if result.key_evidence: + lines.append("**Key Evidence:**") + for evidence in result.key_evidence: + lines.append(f"- {evidence}") + lines.append("") + + lines.append("---\n") + + return "\n".join(lines) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/resolver_worker.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/resolver_worker.py new file mode 100644 index 00000000..531673dc --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/resolver_worker.py @@ -0,0 +1,128 @@ +"""Async worker that runs the streaming AgenticResolver and posts events to the TUI.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from textual.message import Message +from textual.widget import Widget + +from forecasting_tools.agents_and_tools.auto_resolver.agentic import AgenticResolver +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import QuestionItem +from forecasting_tools.ai_models.resource_managers.monetary_cost_manager import ( + MonetaryCostManager, +) + +if TYPE_CHECKING: + from forecasting_tools.data_models.questions import MetaculusQuestion + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Textual Messages posted by the worker so the App can react +# --------------------------------------------------------------------------- + +class FeedEvent(Message): + """A single line of agent feed output for a question.""" + + def __init__(self, post_id: int, event_type: str, text: str) -> None: + super().__init__() + self.post_id = post_id + self.event_type = event_type + self.text = text + + +class ResolutionComplete(Message): + """Emitted when a question's resolution finishes (success or error).""" + + def 
__init__(self, post_id: int) -> None: + super().__init__() + self.post_id = post_id + + +# --------------------------------------------------------------------------- +# Worker coroutine +# --------------------------------------------------------------------------- + +async def run_resolution( + item: QuestionItem, + resolver: AgenticResolver, + poster: Widget, +) -> None: + """Run streaming resolution on a single question, posting messages to the TUI. + + This is intended to be called via Textual's Worker system + (e.g. ``self.run_worker(run_resolution(...))``) so it executes + on a background asyncio task while the UI remains responsive. + + Args: + item: The QuestionItem to resolve -- mutated in place. + resolver: The AgenticResolver instance (shared, but each call is independent). + poster: A Textual Widget whose ``.post_message()`` method will be used + to deliver events back to the app's message loop. + """ + item.status = "running" + item.feed_lines.clear() + item.cost = 0.0 + post_id = item.post_id + + cost_manager = MonetaryCostManager(hard_limit=0) + + try: + with cost_manager: + async for event_type, text in resolver.resolve_question_streamed(item.question): + # Accumulate non-delta lines (deltas are single tokens, too noisy individually) + if event_type == "text": + # For text deltas we still post them but accumulate separately + poster.post_message(FeedEvent(post_id, event_type, text)) + elif event_type == "error": + item.feed_lines.append(f"[ERROR] {text}") + item.status = "error" + item.error_message = text + item.cost = cost_manager.current_usage + poster.post_message(FeedEvent(post_id, event_type, text)) + poster.post_message(ResolutionComplete(post_id)) + return + elif event_type == "result": + item.feed_lines.append(text) + # Extract resolution info directly from the result text + # to avoid race conditions with shared resolver state. 
+ lines = text.splitlines() + for line in lines: + if line.startswith("Resolution: "): + item.resolution_status_str = line.removeprefix("Resolution: ").strip() + elif line.startswith("Reasoning: "): + reasoning = line.removeprefix("Reasoning: ").strip() + item.resolution_metadata = item.resolution_metadata or {} + item.resolution_metadata["reasoning"] = reasoning + # Gather key evidence lines + evidence: list[str] = [] + in_evidence = False + for line in lines: + if line.strip() == "Key Evidence:": + in_evidence = True + continue + if in_evidence and line.strip().startswith("- "): + evidence.append(line.strip().removeprefix("- ")) + if evidence: + item.resolution_metadata = item.resolution_metadata or {} + item.resolution_metadata["key_evidence"] = evidence + + item.status = "completed" + poster.post_message(FeedEvent(post_id, event_type, text)) + else: + # "status" or "tool" events + item.feed_lines.append(text) + poster.post_message(FeedEvent(post_id, event_type, text)) + + except Exception as exc: + logger.exception("Resolution failed for question %s", post_id) + item.status = "error" + item.error_message = str(exc) + item.feed_lines.append(f"[EXCEPTION] {exc}") + poster.post_message(FeedEvent(post_id, "error", str(exc))) + + item.cost = cost_manager.current_usage + poster.post_message(ResolutionComplete(post_id)) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/__init__.py new file mode 100644 index 00000000..7bbdce10 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/__init__.py @@ -0,0 +1 @@ +"""TUI widget components for the auto resolver interface.""" diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py new file mode 100644 index 00000000..3a0656cf --- /dev/null +++ 
b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py @@ -0,0 +1,188 @@ +"""Main content panel showing resolution status and live agent feed.""" + +from __future__ import annotations + +from typing import Optional + +from textual.app import ComposeResult +from textual.containers import Vertical +from textual.widgets import RichLog, Static + +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import QuestionItem + + +# --------------------------------------------------------------------------- +# Helpers for normalising and colour-coding resolution values +# --------------------------------------------------------------------------- + +def _resolution_color(value: str) -> str: + """Return a Rich colour name for a resolution string.""" + v = value.strip().upper() + if v == "TRUE": + return "green" + elif v == "FALSE": + return "red" + elif v in ("NOT_YET_RESOLVABLE", "RESOLVING...", "PENDING"): + return "yellow" + else: + return "cyan" # AMBIGUOUS, ANNULLED, errors, etc. + + +def _normalize_ground_truth(raw: str) -> str: + """Convert Metaculus ground-truth strings to True/False style.""" + lowered = raw.strip().lower() + if lowered == "yes": + return "True" + elif lowered == "no": + return "False" + elif lowered == "ambiguous": + return "Ambiguous" + elif lowered == "annulled": + return "Annulled" + else: + return raw + + +class ResolutionHeader(Static): + """Displays the resolution status and metadata for the selected question.""" + + DEFAULT_CSS = """ + ResolutionHeader { + dock: top; + height: auto; + max-height: 14; + padding: 1 2; + background: $surface; + border-bottom: solid $primary; + color: $text; + } + """ + + def render_item(self, item: Optional[QuestionItem]) -> None: + if item is None: + self.update("No question selected. 
Press [bold]a[/bold] to add questions.") + return + + parts: list[str] = [] + parts.append(f"[bold]{item.question.question_text or 'Untitled'}[/bold]") + + if item.question.page_url: + parts.append(f"[dim]{item.question.page_url}[/dim]") + + # Predicted resolution line (colour-coded) + parts.append("") + resolution_str = item.resolution_display + res_color = _resolution_color(resolution_str) + parts.append( + f"Predicted: [{res_color}]{resolution_str}[/{res_color}]" + ) + + # Ground truth (normalised and colour-coded) + if item.question.resolution_string: + gt_normalized = _normalize_ground_truth(item.question.resolution_string) + gt_color = _resolution_color(gt_normalized) + parts.append( + f"Ground Truth: [{gt_color}]{gt_normalized}[/{gt_color}]" + ) + + # Cost + if item.cost > 0: + parts.append(f"Cost: [bold]${item.cost:.4f}[/bold]") + + # Metadata (reasoning + evidence) once available + if item.resolution_metadata: + reasoning = item.resolution_metadata.get("reasoning", "") + if reasoning: + parts.append("") + parts.append(f"[italic]Reasoning:[/italic] {reasoning}") + evidence = item.resolution_metadata.get("key_evidence", []) + if evidence: + parts.append("[italic]Evidence:[/italic]") + for ev in evidence: + parts.append(f" - {ev}") + + self.update("\n".join(parts)) + + +class AgentFeedLog(RichLog): + """Scrollable live log of agent events.""" + + DEFAULT_CSS = """ + AgentFeedLog { + height: 1fr; + border-top: solid $accent; + padding: 0 1; + } + """ + + def __init__(self, **kwargs) -> None: + super().__init__(highlight=True, markup=True, wrap=True, **kwargs) + self._current_text_block: str = "" + + def append_event(self, event_type: str, text: str) -> None: + """Append a single event to the log with type-appropriate styling.""" + if event_type == "text": + # Accumulate text deltas into a running block rather than one-per-token + self._current_text_block += text + return + else: + # Flush any accumulated text block first + self._flush_text_block() + + if 
event_type == "status": + self.write(f"[bold blue]>>> {text}[/bold blue]") + elif event_type == "tool": + self.write(f"[yellow]{text}[/yellow]") + elif event_type == "result": + self.write(f"[bold green]--- RESULT ---[/bold green]") + self.write(f"[green]{text}[/green]") + elif event_type == "error": + self.write(f"[bold red]ERROR: {text}[/bold red]") + else: + self.write(text) + + def flush_text(self) -> None: + """Flush any buffered text delta block to the log.""" + self._flush_text_block() + + def _flush_text_block(self) -> None: + if self._current_text_block: + self.write(self._current_text_block) + self._current_text_block = "" + + def clear_feed(self) -> None: + """Clear the log and reset state.""" + self._current_text_block = "" + self.clear() + + +class FeedPanel(Vertical): + """Composite widget: resolution header + scrollable agent feed log.""" + + DEFAULT_CSS = """ + FeedPanel { + width: 1fr; + height: 1fr; + } + """ + + def compose(self) -> ComposeResult: + yield ResolutionHeader(id="resolution-header") + yield AgentFeedLog(id="agent-feed-log") + + @property + def header(self) -> ResolutionHeader: + return self.query_one("#resolution-header", ResolutionHeader) + + @property + def feed_log(self) -> AgentFeedLog: + return self.query_one("#agent-feed-log", AgentFeedLog) + + def show_question(self, item: Optional[QuestionItem]) -> None: + """Switch the panel to display a specific question's data.""" + self.header.render_item(item) + self.feed_log.clear_feed() + if item is not None: + # Replay existing feed lines + for line in item.feed_lines: + self.feed_log.write(line) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py new file mode 100644 index 00000000..a9d54f59 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py @@ -0,0 +1,159 @@ +"""Home panel showing a live-updating confusion matrix and question 
table.""" + +from __future__ import annotations + +from textual.app import ComposeResult +from textual.containers import Vertical +from textual.widgets import DataTable, Static + +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import ( + HOME_SENTINEL_ID, + QuestionItem, + QuestionSelected, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.report import ( + build_report_from_items, +) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.feed_panel import ( + _normalize_ground_truth, +) + + +def _match_str(predicted: str | None, ground_truth: str | None) -> str: + """Return a match indicator comparing predicted and ground truth.""" + if predicted is None or ground_truth is None: + return "-" + p = predicted.strip().upper() + g = ground_truth.strip().upper() + if p in ("PENDING", "RESOLVING...", "NOT_YET_RESOLVABLE"): + return "-" + if g in ("", "NONE", "-"): + return "-" + return "Y" if p == g else "N" + + +class HomePanel(Vertical): + """Overview panel with confusion matrix, question table, and summary.""" + + DEFAULT_CSS = """ + HomePanel { + width: 1fr; + height: 1fr; + } + HomePanel > #home-title { + dock: top; + text-align: center; + text-style: bold; + padding: 1 2; + background: $surface; + border-bottom: solid $primary; + } + HomePanel > #home-matrix { + dock: top; + height: auto; + padding: 1 2; + background: $surface; + border-bottom: solid $accent; + } + HomePanel > DataTable { + height: 1fr; + } + HomePanel > #home-summary { + dock: bottom; + height: auto; + padding: 1 2; + background: $surface; + border-top: solid $primary; + } + """ + + def compose(self) -> ComposeResult: + yield Static("Auto Resolver - Overview", id="home-title") + yield Static("", id="home-matrix") + table: DataTable = DataTable(id="home-table") + table.cursor_type = "row" + table.zebra_stripes = True + yield table + yield Static("No questions loaded.", id="home-summary") + + def on_mount(self) -> None: + table = 
self.query_one("#home-table", DataTable) + table.add_columns( + "ID", "Question", "Status", "Predicted", "Ground Truth", "Match", "Cost" + ) + + def on_data_table_row_selected(self, event: DataTable.RowSelected) -> None: + """Navigate to the selected question's feed view.""" + try: + post_id = int(str(event.row_key.value)) + except (ValueError, TypeError): + return + self.post_message(QuestionSelected(post_id)) + + def refresh_table(self, items: dict[int, QuestionItem]) -> None: + """Rebuild the confusion matrix, table rows, and summary.""" + table = self.query_one("#home-table", DataTable) + table.clear() + + total = 0 + completed = 0 + correct = 0 + total_cost = 0.0 + + for post_id, item in items.items(): + total += 1 + total_cost += item.cost + + predicted = item.resolution_display + gt_raw = item.question.resolution_string + gt = _normalize_ground_truth(gt_raw) if gt_raw else "-" + match = _match_str(predicted, gt) + + if item.status == "completed": + completed += 1 + if match == "Y": + correct += 1 + + cost_str = f"${item.cost:.4f}" if item.cost > 0 else "-" + + table.add_row( + str(post_id), + item.title, + item.status.title(), + predicted, + gt, + match, + cost_str, + key=str(post_id), + ) + + # --- Confusion matrix --- + report = build_report_from_items(items) + matrix_text = report.binary_results_table() + + matrix_total = ( + report.n_pp + report.n_pn + report.n_pc + + report.n_np + report.n_nn + report.n_nc + + report.n_cp + report.n_cn + report.n_cc + + report.n_xp + report.n_xn + report.n_xc + ) + matrix_correct = report.n_pp + report.n_nn + report.n_cc + matrix_accuracy = (matrix_correct / matrix_total * 100) if matrix_total > 0 else 0 + + matrix_display = ( + f"{matrix_text}\n\n" + f"Correct: {matrix_correct}/{matrix_total} " + f"Accuracy: {matrix_accuracy:.1f}%" + ) + self.query_one("#home-matrix", Static).update(matrix_display) + + # --- Summary footer --- + accuracy = (correct / completed * 100) if completed > 0 else 0 + summary_parts = [ + 
f"Total: {total}", + f"Completed: {completed}", + f"Correct: {correct}", + f"Accuracy: {accuracy:.1f}%", + f"Total Cost: ${total_cost:.4f}", + ] + self.query_one("#home-summary", Static).update(" | ".join(summary_parts)) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/input_modal.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/input_modal.py new file mode 100644 index 00000000..f2cd2fe1 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/input_modal.py @@ -0,0 +1,266 @@ +"""Modal dialog for adding tournament or question IDs.""" + +from __future__ import annotations + +from typing import Literal + +from textual.app import ComposeResult +from textual.containers import Vertical, Horizontal +from textual.message import Message +from textual.reactive import reactive +from textual.screen import ModalScreen +from textual.widgets import ( + Button, + Checkbox, + Input, + Label, + RadioButton, + RadioSet, + Static, +) + +from forecasting_tools.data_models.questions import QuestionBasicType + + +QUESTION_TYPES: list[QuestionBasicType] = [ + "binary", + "numeric", + "multiple_choice", + "date", + "discrete", + "conditional", +] + + +class AddIdRequested(Message): + """Posted when the user submits an ID from the modal.""" + + def __init__( + self, + id_type: Literal["tournament", "question"], + id_value: int | str, + allowed_types: list[QuestionBasicType] | None = None, + max_questions: int | None = None, + exclude_unresolved: bool = False, + ) -> None: + super().__init__() + self.id_type = id_type + self.id_value = id_value + self.allowed_types = allowed_types or ["binary"] + self.max_questions = max_questions + self.exclude_unresolved = exclude_unresolved + + +class InputModal(ModalScreen[None]): + """A modal screen for entering a tournament or question ID.""" + + DEFAULT_CSS = """ + InputModal { + align: center middle; + } + InputModal > #modal-container { + width: 60; + height: auto; + max-height: 40; 
+ overflow-y: auto; + background: $surface; + border: thick $primary; + padding: 2 3; + } + InputModal > #modal-container > #modal-title { + text-align: center; + text-style: bold; + margin-bottom: 1; + } + InputModal > #modal-container > RadioSet { + height: auto; + margin-bottom: 1; + } + InputModal > #modal-container > Input { + margin-bottom: 1; + } + InputModal > #modal-container > #tournament-options { + height: auto; + margin-bottom: 1; + padding: 1; + background: $boost; + } + InputModal > #modal-container > #tournament-options > .option-label { + text-style: bold; + margin-bottom: 1; + } + InputModal > #modal-container > #tournament-options > #type-checkboxes { + height: auto; + margin-bottom: 1; + } + InputModal > #modal-container > #tournament-options > #type-checkboxes Checkbox { + height: auto; + padding: 0; + margin: 0; + } + InputModal > #modal-container > #tournament-options > #filters-section { + height: auto; + margin-bottom: 1; + } + InputModal > #modal-container > #tournament-options > #filters-section Checkbox { + height: auto; + padding: 0; + margin: 0; + } + InputModal > #modal-container > #modal-buttons { + height: auto; + align: center middle; + } + InputModal > #modal-container > #modal-buttons > Button { + margin: 0 2; + } + InputModal > #modal-container > #modal-error { + color: $error; + height: auto; + margin-bottom: 1; + } + """ + + BINDINGS = [ + ("escape", "cancel", "Cancel"), + ] + + _is_tournament: reactive[bool] = reactive(False) + + def __init__(self, default_type: Literal["tournament", "question"] = "question") -> None: + super().__init__() + self._default_type = default_type + + def compose(self) -> ComposeResult: + with Vertical(id="modal-container"): + yield Label("Add Questions", id="modal-title") + yield RadioSet( + RadioButton("Tournament ID", value=self._default_type == "tournament"), + RadioButton("Question ID", value=self._default_type == "question"), + id="id-type-radio", + ) + yield Input( + placeholder="Enter ID 
(numeric or slug for tournaments)...", + id="id-input", + ) + with Vertical(id="tournament-options"): + yield Static("Question Types:", classes="option-label") + with Vertical(id="type-checkboxes"): + for qtype in QUESTION_TYPES: + yield Checkbox( + qtype.replace("_", " ").title(), + value=(qtype == "binary"), + id=f"cb-{qtype}", + ) + with Vertical(id="filters-section"): + yield Static("Filters:", classes="option-label") + yield Checkbox( + "Exclude unresolved questions", + value=False, + id="cb-exclude-unresolved", + ) + yield Static("Random Sample (optional):", classes="option-label") + yield Input( + placeholder="Max number of questions to load (leave empty for all)", + id="max-questions-input", + ) + yield Label("", id="modal-error") + with Horizontal(id="modal-buttons"): + yield Button("Submit", id="btn-submit", variant="primary") + yield Button("Cancel", id="btn-cancel", variant="default") + + def on_mount(self) -> None: + self.query_one("#id-input", Input).focus() + radio_set = self.query_one("#id-type-radio", RadioSet) + self._is_tournament = radio_set.pressed_index == 0 + self._update_tournament_options_visibility() + + def on_radio_set_changed(self, event: RadioSet.Changed) -> None: + if event.radio_set.id == "id-type-radio": + self._is_tournament = event.radio_set.pressed_index == 0 + self._update_tournament_options_visibility() + + def _update_tournament_options_visibility(self) -> None: + try: + tournament_options = self.query_one("#tournament-options", Vertical) + tournament_options.display = self._is_tournament + except Exception: + pass + + def action_cancel(self) -> None: + self.dismiss(None) + + def on_button_pressed(self, event: Button.Pressed) -> None: + if event.button.id == "btn-cancel": + self.dismiss(None) + elif event.button.id == "btn-submit": + self._submit() + + def on_input_submitted(self, event: Input.Submitted) -> None: + self._submit() + + def _submit(self) -> None: + error_label = self.query_one("#modal-error", Label) + raw_value = 
self.query_one("#id-input", Input).value.strip() + + if not raw_value: + error_label.update("Please enter an ID.") + return + + radio_set = self.query_one("#id-type-radio", RadioSet) + id_type: Literal["tournament", "question"] = ( + "tournament" if radio_set.pressed_index == 0 else "question" + ) + + id_value: int | str + try: + id_value = int(raw_value) + except ValueError: + if id_type == "question": + error_label.update("Question ID must be a numeric integer.") + return + id_value = raw_value + + allowed_types: list[QuestionBasicType] = [] + max_questions: int | None = None + exclude_unresolved = False + + if id_type == "tournament": + for qtype in QUESTION_TYPES: + try: + cb = self.query_one(f"#cb-{qtype}", Checkbox) + if cb.value: + allowed_types.append(qtype) + except Exception: + pass + + if not allowed_types: + error_label.update("Please select at least one question type.") + return + + max_input = self.query_one("#max-questions-input", Input).value.strip() + if max_input: + try: + max_questions = int(max_input) + if max_questions <= 0: + error_label.update("Max questions must be a positive number.") + return + except ValueError: + error_label.update("Max questions must be a number.") + return + + try: + exclude_unresolved = self.query_one("#cb-exclude-unresolved", Checkbox).value + except Exception: + pass + + self.app.post_message( + AddIdRequested( + id_type=id_type, + id_value=id_value, + allowed_types=allowed_types if id_type == "tournament" else None, + max_questions=max_questions, + exclude_unresolved=exclude_unresolved, + ) + ) + self.dismiss(None) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/sidebar.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/sidebar.py new file mode 100644 index 00000000..0eab8637 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/sidebar.py @@ -0,0 +1,134 @@ +"""Sidebar widget: action buttons and navigable question list.""" + +from __future__ import 
annotations + +from typing import Optional + +from textual.app import ComposeResult +from textual.containers import Vertical +from textual.widgets import Button, ListItem, ListView, Static + +from forecasting_tools.agents_and_tools.auto_resolver.tui.models import ( + HOME_SENTINEL_ID, + QuestionSelected, +) + + +class SidebarListItem(ListItem): + """A single entry in the sidebar question list.""" + + DEFAULT_CSS = """ + SidebarListItem { + height: auto; + padding: 0 1; + } + SidebarListItem:hover { + background: $boost; + } + SidebarListItem.-highlight { + background: $accent 30%; + } + """ + + def __init__(self, post_id: int, display_text: str, **kwargs) -> None: + super().__init__(**kwargs) + self.post_id = post_id + self._label = Static(display_text) + + def compose(self) -> ComposeResult: + yield self._label + + def update_text(self, text: str) -> None: + self._label.update(text) + + +class QuestionListView(ListView): + """Navigable list of questions with keyboard support.""" + + DEFAULT_CSS = """ + QuestionListView { + height: 1fr; + border-top: solid $primary; + } + """ + + def on_list_view_selected(self, event: ListView.Selected) -> None: + item = event.item + if isinstance(item, SidebarListItem): + self.post_message(QuestionSelected(item.post_id)) + + def on_list_view_highlighted(self, event: ListView.Highlighted) -> None: + item = event.item + if isinstance(item, SidebarListItem): + self.post_message(QuestionSelected(item.post_id)) + + +class Sidebar(Vertical): + """Left sidebar with action buttons and the question list.""" + + DEFAULT_CSS = """ + Sidebar { + width: 40; + min-width: 30; + max-width: 60; + height: 1fr; + border-right: solid $primary; + background: $surface; + } + Sidebar > .sidebar-header { + dock: top; + height: auto; + padding: 1 1; + text-align: center; + text-style: bold; + color: $text; + background: $primary 20%; + } + Sidebar > .sidebar-buttons { + dock: top; + height: auto; + padding: 1 1; + } + Sidebar > .sidebar-buttons Button { + 
width: 100%; + margin-bottom: 1; + } + """ + + def compose(self) -> ComposeResult: + yield Static("Auto Resolver", classes="sidebar-header") + with Vertical(classes="sidebar-buttons"): + yield Button("Add Tournament [t]", id="btn-add-tournament", variant="primary") + yield Button("Add Question [a]", id="btn-add-question", variant="default") + yield QuestionListView(id="question-list") + + def on_mount(self) -> None: + """Insert the Home entry as the first (and initially highlighted) item.""" + home_item = SidebarListItem( + HOME_SENTINEL_ID, "[bold]Home[/bold] Overview", id="q-home" + ) + self.question_list.append(home_item) + + @property + def question_list(self) -> QuestionListView: + return self.query_one("#question-list", QuestionListView) + + def add_question_entry(self, post_id: int, display_text: str) -> None: + """Append a question to the list.""" + list_item = SidebarListItem(post_id, display_text, id=f"q-{post_id}") + self.question_list.append(list_item) + + def update_question_entry(self, post_id: int, display_text: str) -> None: + """Update an existing question entry's display text.""" + try: + item = self.question_list.query_one(f"#q-{post_id}", SidebarListItem) + item.update_text(display_text) + except Exception: + pass # Item may not exist yet + + def get_selected_post_id(self) -> Optional[int]: + """Return the post_id of the currently highlighted item, or None.""" + highlighted = self.question_list.highlighted_child + if isinstance(highlighted, SidebarListItem): + return highlighted.post_id + return None diff --git a/poetry.lock b/poetry.lock index c0e13dff..d64c107f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2606,6 +2606,27 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"] nearley = ["js2py"] regex = ["regex"] +[[package]] +name = "linkify-it-py" +version = "2.1.0" +description = "Links recognition library with FULL unicode support." 
+optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "linkify_it_py-2.1.0-py3-none-any.whl", hash = "sha256:0d252c1594ecba2ecedc444053db5d3a9b7ec1b0dd929c8f1d74dce89f86c05e"}, + {file = "linkify_it_py-2.1.0.tar.gz", hash = "sha256:43360231720999c10e9328dc3691160e27a718e280673d444c38d7d3aaa3b98b"}, +] + +[package.dependencies] +uc-micro-py = "*" + +[package.extras] +benchmark = ["pytest", "pytest-benchmark"] +dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] +doc = ["myst-parser", "sphinx", "sphinx_book_theme"] +test = ["coverage", "pytest", "pytest-cov"] + [[package]] name = "litellm" version = "1.80.7" @@ -2641,6 +2662,32 @@ proxy = ["PyJWT (>=2.10.1,<3.0.0) ; python_version >= \"3.9\"", "apscheduler (>= semantic-router = ["semantic-router (>=0.1.12) ; python_version >= \"3.9\" and python_version < \"3.14\""] utils = ["numpydoc"] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" 
+optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"}, + {file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"}, +] + +[package.dependencies] +linkify-it-py = {version = ">=1,<3", optional = true, markers = "extra == \"linkify\""} +mdit-py-plugins = {version = ">=0.5.0", optional = true, markers = "extra == \"plugins\""} +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "markdown-it-pyrs", "mistletoe (>=1.0,<2.0)", "mistune (>=3.0,<4.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins (>=0.5.0)"] +profiling = ["gprof2dot"] +rtd = ["ipykernel", "jupyter_sphinx", "mdit-py-plugins (>=0.5.0)", "myst-parser", "pyyaml", "sphinx", "sphinx-book-theme (>=1.0,<2.0)", "sphinx-copybutton", "sphinx-design"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions", "requests"] + [[package]] name = "markupsafe" version = "3.0.3" @@ -2888,6 +2935,38 @@ cli = ["python-dotenv (>=1.0.0)", "typer (>=0.16.0)"] rich = ["rich (>=13.9.4)"] ws = ["websockets (>=15.0.1)"] +[[package]] +name = "mdit-py-plugins" +version = "0.5.0" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "mdit_py_plugins-0.5.0-py3-none-any.whl", hash = "sha256:07a08422fc1936a5d26d146759e9155ea466e842f5ab2f7d2266dd084c8dab1f"}, + {file = "mdit_py_plugins-0.5.0.tar.gz", hash = "sha256:f4918cb50119f50446560513a8e311d574ff6aaed72606ddae6d35716fe809c6"}, +] + +[package.dependencies] +markdown-it-py = ">=2.0.0,<5.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", 
"pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mistune" version = "3.1.4" @@ -3867,7 +3946,7 @@ version = "4.5.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.10" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31"}, {file = "platformdirs-4.5.1.tar.gz", hash = "sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda"}, @@ -4478,7 +4557,7 @@ version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, @@ -5222,6 +5301,25 @@ lark = ">=1.2.2" [package.extras] testing = ["pytest (>=8.3.5)"] +[[package]] +name = "rich" +version = "14.3.3" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["main"] +files = [ + {file = "rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d"}, + {file = "rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "rpds-py" version = "0.30.0" @@ -5781,6 +5879,27 @@ files = [ {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, ] +[[package]] +name = "textual" +version = "3.7.1" +description = "Modern Text User Interface framework" +optional = false +python-versions = "<4.0.0,>=3.8.1" +groups = ["main"] +files = [ + {file = "textual-3.7.1-py3-none-any.whl", hash = "sha256:ab5d153f4f65e77017977fa150d0376409e0acf5f1d2e25e2e4ab9de6c0d61ff"}, + {file = "textual-3.7.1.tar.gz", hash = "sha256:a76ba0c8a6c194ef24fd5c3681ebfddca55e7127c064a014128c84fbd7f5d271"}, +] + +[package.dependencies] +markdown-it-py = {version = ">=2.1.0", extras = ["linkify", "plugins"]} +platformdirs = ">=3.6.0,<5" +rich = ">=13.3.3" +typing-extensions = ">=4.4.0,<5.0.0" + +[package.extras] +syntax = ["tree-sitter (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-bash 
(>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-css (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-go (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-html (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-java (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-javascript (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-json (>=0.24.0) ; python_version >= \"3.9\"", "tree-sitter-markdown (>=0.3.0) ; python_version >= \"3.9\"", "tree-sitter-python (>=0.23.0) ; python_version >= \"3.9\"", "tree-sitter-regex (>=0.24.0) ; python_version >= \"3.9\"", "tree-sitter-rust (>=0.23.0,<=0.23.2) ; python_version >= \"3.9\"", "tree-sitter-sql (>=0.3.0,<0.3.8) ; python_version >= \"3.9\"", "tree-sitter-toml (>=0.6.0) ; python_version >= \"3.9\"", "tree-sitter-xml (>=0.7.0) ; python_version >= \"3.9\"", "tree-sitter-yaml (>=0.6.0) ; python_version >= \"3.9\""] + [[package]] name = "threadpoolctl" version = "3.6.0" @@ -6216,6 +6335,21 @@ files = [ {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, ] +[[package]] +name = "uc-micro-py" +version = "2.0.0" +description = "Micro subset of unicode data files for linkify-it-py projects." 
+optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "uc_micro_py-2.0.0-py3-none-any.whl", hash = "sha256:3603a3859af53e5a39bc7677713c78ea6589ff188d70f4fee165db88e22b242c"}, + {file = "uc_micro_py-2.0.0.tar.gz", hash = "sha256:c53691e495c8db60e16ffc4861a35469b0ba0821fe409a8a7a0a71864d33a811"}, +] + +[package.extras] +test = ["coverage", "pytest", "pytest-cov"] + [[package]] name = "unidecode" version = "1.4.0" @@ -6567,4 +6701,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "2a47fec44b5f12100f2aaee440bfe0bdbd1fb8b73b6b4e28d167f92f6bdf7457" +content-hash = "66b8e6b61f3f4bf5b578c943b8919314820ef5d5b6885a107d9ae88c972030ee" diff --git a/pyproject.toml b/pyproject.toml index 0b9d8129..54ea559f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ unidecode = "^1.4.0" hyperbrowser = ">=0.53.0,<0.76.0" pendulum = "^3.1.0" openai-agents = {extras = ["litellm"], version = ">=0.2.0,<0.7.0"} +textual = ">=3.0.0,<4.0.0" [tool.poetry.group.dev.dependencies] time-machine = ">=2.19.0,<4.0.0" From a9abc1ba48c83e660e85b4fb75e02075133a0f14 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sun, 1 Mar 2026 15:49:28 -0700 Subject: [PATCH 09/14] Added a comment to __init__.py in the tui to indicate that it should not be treated as dependable software. --- .../agents_and_tools/auto_resolver/tui/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py index 41e88b56..654a8190 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/__init__.py @@ -3,4 +3,7 @@ Run with: python -m forecasting_tools.agents_and_tools.auto_resolver.tui + +This is just a vibe coded tool for interfacing with the agentic auto resolver. 
This should be +presently be treated as functional technical debt. """ From 77e77f3614ef32df1c24450741e31b2da2f4e012 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sun, 1 Mar 2026 20:23:33 -0700 Subject: [PATCH 10/14] Updates for the weekend, integrated asknews --- .../auto_resolver/agentic/__init__.py | 4 +- .../auto_resolver/agentic/instructions.py | 28 ++- .../auto_resolver/tui/__main__.py | 13 +- .../agents_and_tools/auto_resolver/tui/app.py | 166 +++++++++++++++++- .../auto_resolver/tui/widgets/log_panel.py | 61 +++++++ .../agents_and_tools/minor_tools.py | 56 ++++++ forecasting_tools/helpers/asknews_searcher.py | 79 +++++++++ 7 files changed, 395 insertions(+), 12 deletions(-) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py index d1e308f8..1021f614 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py @@ -29,6 +29,7 @@ from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import BinaryResolutionResult from forecasting_tools.agents_and_tools.auto_resolver import AutoResolver from forecasting_tools.agents_and_tools.minor_tools import ( + create_date_filtered_asknews_tool, perplexity_reasoning_pro_search, ) from forecasting_tools.ai_models.agent_wrappers import ( @@ -182,11 +183,12 @@ async def _rephrase_question_if_needed( def _create_researcher(self, question: BinaryQuestion) -> AiAgent: instructions = researcher_instructions(question) + asknews_tool = create_date_filtered_asknews_tool(question.close_time) return AiAgent( name="Resolution Researcher", instructions=instructions, model=AgentSdkLlm(model=self.model_for_researcher), - tools=[perplexity_reasoning_pro_search], + tools=[perplexity_reasoning_pro_search, asknews_tool], handoffs=[], ) 
diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py index abfd4d86..265040eb 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py @@ -28,7 +28,7 @@ def researcher_instructions(question: BinaryQuestion) -> str: # Your Task - Perform multiple strategic Perplexity searches to thoroughly investigate: + Perform multiple strategic searches to thoroughly investigate: 1. **Current Status**: What is the current state of affairs related to this question? 2. **Resolution Criteria**: Have the resolution criteria been met? @@ -37,10 +37,30 @@ def researcher_instructions(question: BinaryQuestion) -> str: 5. **Edge Cases**: Look for any ambiguities, disputes, or complications 6. **Validity Check**: Investigate whether the question's subject is valid/possible (for potential annulment) + # Available Tools + + You have two search tools: + + 1. **Perplexity** (`perplexity_reasoning_pro_search`): Best for analytical queries, + reference lookups, and questions requiring reasoning over multiple sources. + Returns an LLM-synthesized answer with citations. + + 2. **AskNews** (`query_asknews_date_filtered`): Best for current-events and + news-driven questions. Searches international news across many languages. + **Important**: Results are automatically date-filtered to only include + articles published before the question's close date, so you will not see + news about events that happened after the question's context window closed. + This prevents false positives. Use this tool when the question is about + a specific event, policy action, election, conflict, or other newsworthy + topic. 
+ # Search Strategy Guidelines - Run 3-5 searches total (don't overdo it) - Run searches in parallel when they're independent + - Use AskNews for event-driven / news-driven queries + - Use Perplexity for analytical / reference queries + - Using both tools for the same topic provides valuable cross-checking - Use follow-up searches based on initial findings - Focus on authoritative and recent sources - Note any contradictions or uncertainties you find @@ -75,9 +95,9 @@ def researcher_instructions(question: BinaryQuestion) -> str: # Example Search Sequence - 1. Broad search: "Current status of [topic] as of [current date]" - 2. Specific search: "Has [specific criterion] occurred?" - 3. Verification: "Latest news about [topic]" + 1. AskNews search: "[topic]" — get date-filtered news coverage + 2. Perplexity broad search: "Current status of [topic] as of [current date]" + 3. Specific search: "Has [specific criterion] occurred?" 4. (Optional) Follow-up based on findings 5. (If no results found) Validity check: "[topic] cancelled", "[topic] validity", "[topic] problems" diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py index 811109f6..0d32f108 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py @@ -54,11 +54,13 @@ def _parse_tournament_id(value: str) -> int | str: ) args = parser.parse_args() - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - # Suppress noisy loggers but keep warnings+ - logging.basicConfig(level=logging.WARNING) + log_level = logging.DEBUG if args.verbose else logging.WARNING + + # Do NOT call logging.basicConfig() here -- it would install a + # StreamHandler on sys.stderr which corrupts the Textual TUI display. + # Instead, set the root logger level and let the App install a custom + # handler that routes records into a TUI log panel. 
+ logging.getLogger().setLevel(log_level) from forecasting_tools.agents_and_tools.auto_resolver.tui.app import ( AutoResolverApp, @@ -68,6 +70,7 @@ def _parse_tournament_id(value: str) -> int | str: max_concurrency=args.concurrency, initial_tournaments=args.tournament, initial_questions=args.question, + log_level=log_level, ) app.run() diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py index 20ee8b1c..1439e313 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py @@ -5,9 +5,10 @@ import asyncio import logging import random +import sys from datetime import datetime from pathlib import Path -from typing import Optional, cast +from typing import Literal, Optional, cast from textual.app import App, ComposeResult from textual.binding import Binding @@ -37,6 +38,9 @@ from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.home_panel import ( HomePanel, ) +from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.log_panel import ( + LogPanel, +) from forecasting_tools.agents_and_tools.auto_resolver.tui.report import ( generate_markdown_report, ) @@ -48,6 +52,77 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Logging / stderr capture helpers +# --------------------------------------------------------------------------- + +_LOG_LEVEL_STYLES: dict[int, tuple[str, str]] = { + logging.DEBUG: ("dim", "dim"), + logging.INFO: ("", ""), + logging.WARNING: ("yellow", "yellow"), + logging.ERROR: ("bold red", "bold red"), + logging.CRITICAL: ("bold white on red", "bold white on red"), +} + + +class TuiLoggingHandler(logging.Handler): + """Routes Python log records into the TUI :class:`LogPanel`. 
+ + Uses :meth:`App.call_from_thread` so that records emitted from + background threads (litellm cost callbacks, ``asyncio.to_thread`` + workers, etc.) are safely delivered to the Textual event loop. + """ + + def __init__(self, app: "AutoResolverApp") -> None: + super().__init__() + self._app = app + + def emit(self, record: logging.LogRecord) -> None: + try: + msg = self.format(record) + self._app.call_from_thread( + self._app._append_log, msg, record.levelno + ) + except Exception: + self.handleError(record) + + +class _StderrCapture: + """File-like replacement for ``sys.stderr`` that routes writes into the + TUI :class:`LogPanel` instead of the real terminal stderr. + + Any write that fails (e.g. the app is not yet mounted or is shutting + down) falls back to the real stderr so output is never silently lost. + """ + + def __init__(self, app: "AutoResolverApp") -> None: + self._app = app + self._real_stderr = sys.__stderr__ + + def write(self, text: str) -> int: + if not text or not text.strip(): + return len(text) if text else 0 + try: + self._app.call_from_thread( + self._app._append_log, text.rstrip(), logging.WARNING + ) + except Exception: + if self._real_stderr: + self._real_stderr.write(text) + return len(text) + + def flush(self) -> None: + pass + + # Some libraries check for these attributes on file-like objects. + @property + def encoding(self) -> str: + return "utf-8" + + def isatty(self) -> bool: + return False + + class AutoResolverApp(App): """TUI application for interactive agentic question resolution. 
@@ -59,6 +134,7 @@ class AutoResolverApp(App): t -- Add questions from a tournament r -- Re-run resolution on the selected question e -- Export report to markdown + l -- Toggle log panel (captured logging + stderr) q -- Quit """ @@ -78,6 +154,7 @@ class AutoResolverApp(App): Binding("t", "add_tournament", "Add Tournament", show=True), Binding("r", "rerun", "Re-run Selected", show=True), Binding("e", "export_report", "Export Report", show=True), + Binding("l", "toggle_logs", "Logs", show=True), Binding("q", "quit", "Quit", show=True), ] @@ -86,6 +163,7 @@ def __init__( max_concurrency: int = 3, initial_tournaments: list[int | str] | None = None, initial_questions: list[int] | None = None, + log_level: int = logging.WARNING, ) -> None: super().__init__() self._resolver = AgenticResolver() @@ -95,6 +173,10 @@ def __init__( self._concurrency_sem = asyncio.Semaphore(max_concurrency) self._initial_tournaments = initial_tournaments or [] self._initial_questions = initial_questions or [] + self._log_level = log_level + self._tui_log_handler: TuiLoggingHandler | None = None + self._original_stderr = sys.stderr + self._view_before_logs: Literal["home", "feed"] = "home" # ------------------------------------------------------------------ # Layout @@ -106,6 +188,7 @@ def compose(self) -> ComposeResult: yield Sidebar(id="sidebar") yield HomePanel(id="home-panel") yield FeedPanel(id="feed-panel") + yield LogPanel(id="log-panel") yield Footer() @property @@ -120,16 +203,43 @@ def feed_panel(self) -> FeedPanel: def home_panel(self) -> HomePanel: return self.query_one("#home-panel", HomePanel) + @property + def log_panel(self) -> LogPanel: + return self.query_one("#log-panel", LogPanel) + # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ async def on_mount(self) -> None: """Load any questions / tournaments passed via CLI.""" - # Start with home panel visible, feed panel hidden + # 
Start with home panel visible, feed/log panels hidden self.feed_panel.display = False + self.log_panel.display = False self.home_panel.display = True + # Install the TUI logging handler on the root logger so ALL + # Python logging records (ours + third-party) are captured. + self._tui_log_handler = TuiLoggingHandler(self) + self._tui_log_handler.setLevel(self._log_level) + self._tui_log_handler.setFormatter( + logging.Formatter("%(asctime)s %(levelname)-8s %(name)s %(message)s", + datefmt="%H:%M:%S") + ) + root = logging.getLogger() + # Remove any pre-existing stderr handlers that basicConfig may + # have installed before us (defensive). + for h in list(root.handlers): + if isinstance(h, logging.StreamHandler) and getattr(h, "stream", None) is sys.stderr: + root.removeHandler(h) + root.addHandler(self._tui_log_handler) + + # Redirect sys.stderr so that non-logging stderr writes (e.g. + # from C extensions, Python warnings, litellm debug output) are + # also captured instead of corrupting the TUI display. 
+ self._original_stderr = sys.stderr + sys.stderr = _StderrCapture(self) # type: ignore[assignment] + for tid in self._initial_tournaments: self._schedule_load_tournament(tid) for qid in self._initial_questions: @@ -143,12 +253,14 @@ def _show_home(self) -> None: """Switch to the home overview panel.""" self.home_panel.display = True self.feed_panel.display = False + self.log_panel.display = False self.home_panel.refresh_table(self._items) def _show_feed(self, item: QuestionItem | None) -> None: """Switch to the feed panel for a specific question.""" self.home_panel.display = False self.feed_panel.display = True + self.log_panel.display = False self.feed_panel.show_question(item) # ------------------------------------------------------------------ @@ -184,6 +296,56 @@ def action_export_report(self) -> None: filepath.write_text(report, encoding="utf-8") self.notify(f"Report exported to {filepath}") + def action_toggle_logs(self) -> None: + """Toggle between the log panel and the previous view.""" + if self.log_panel.display: + # Return to whichever view was active before + if self._view_before_logs == "feed": + item = self._items.get(self._selected_post_id) # type: ignore[arg-type] + self._show_feed(item) + else: + self._show_home() + else: + # Remember which view is currently active, then show logs + if self.feed_panel.display: + self._view_before_logs = "feed" + else: + self._view_before_logs = "home" + self.home_panel.display = False + self.feed_panel.display = False + self.log_panel.display = True + + # ------------------------------------------------------------------ + # Log panel helpers + # ------------------------------------------------------------------ + + def _append_log(self, text: str, level: int = logging.INFO) -> None: + """Append a formatted log line to the :class:`LogPanel`. + + Called from :class:`TuiLoggingHandler` and :class:`_StderrCapture` + via ``call_from_thread``. Must only be called on the Textual + event-loop thread. 
+ """ + open_tag, close_tag = "", "" + for threshold in sorted(_LOG_LEVEL_STYLES, reverse=True): + if level >= threshold: + style_open, _ = _LOG_LEVEL_STYLES[threshold] + if style_open: + open_tag = f"[{style_open}]" + close_tag = f"[/{style_open}]" + break + self.log_panel.append_log(f"{open_tag}{text}{close_tag}") + + def _teardown_logging(self) -> None: + """Remove our logging handler and restore stderr.""" + if self._tui_log_handler is not None: + logging.getLogger().removeHandler(self._tui_log_handler) + self._tui_log_handler = None + sys.stderr = self._original_stderr + + async def on_unmount(self) -> None: + self._teardown_logging() + # ------------------------------------------------------------------ # Message handlers # ------------------------------------------------------------------ diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py new file mode 100644 index 00000000..1bff8330 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py @@ -0,0 +1,61 @@ +"""Global log panel for displaying captured logging and stderr output.""" + +from __future__ import annotations + +from textual.app import ComposeResult +from textual.containers import Vertical +from textual.widgets import RichLog, Static + + +class LogPanel(Vertical): + """Full-screen panel showing captured Python logging and stderr output. + + Toggled via the ``l`` keybinding. All log records from every logger + in the process (including third-party libraries like litellm, httpx, + and the OpenAI SDK) are routed here instead of to stderr so they do + not corrupt the TUI display. 
+ """ + + DEFAULT_CSS = """ + LogPanel { + width: 1fr; + height: 1fr; + } + LogPanel > .log-panel-header { + dock: top; + height: auto; + padding: 0 2; + background: $surface; + border-bottom: solid $primary; + text-align: center; + text-style: bold; + } + LogPanel > #log-output { + height: 1fr; + padding: 0 1; + } + """ + + def compose(self) -> ComposeResult: + yield Static( + "Logs [dim](press [bold]l[/bold] to return)[/dim]", + classes="log-panel-header", + ) + yield RichLog( + id="log-output", + highlight=True, + markup=True, + wrap=True, + ) + + @property + def log_output(self) -> RichLog: + return self.query_one("#log-output", RichLog) + + def append_log(self, text: str) -> None: + """Append a pre-formatted log line to the panel.""" + self.log_output.write(text) + + def clear_logs(self) -> None: + """Clear all log output.""" + self.log_output.clear() diff --git a/forecasting_tools/agents_and_tools/minor_tools.py b/forecasting_tools/agents_and_tools/minor_tools.py index fee3b477..b6d15726 100644 --- a/forecasting_tools/agents_and_tools/minor_tools.py +++ b/forecasting_tools/agents_and_tools/minor_tools.py @@ -1,6 +1,7 @@ import asyncio import logging import random +from datetime import datetime from forecasting_tools.agents_and_tools.question_generators.simple_question import ( SimpleQuestion, @@ -31,6 +32,61 @@ async def query_asknews(topic: str) -> str: return await AskNewsSearcher().get_formatted_news_async(topic) +def create_date_filtered_asknews_tool( + cutoff_date: datetime | None = None, +) -> AgentTool: + """Factory that returns an AskNews search tool filtered to a date cutoff. + + The returned tool only surfaces articles published **before** + ``cutoff_date``, preventing false positives from events that occurred + after a question's context window closed. + + If ``cutoff_date`` is ``None`` the tool falls back to the standard + unfiltered AskNews search. + + Args: + cutoff_date: Only return articles published before this datetime. 
+ Typically set to the question's ``close_time``. + + Returns: + An ``@agent_tool``-decorated async function suitable for inclusion + in an agent's ``tools`` list. + """ + if cutoff_date is not None: + date_str = cutoff_date.strftime("%Y-%m-%d") + description = ( + f"Search international news articles using AskNews. " + f"Results are automatically filtered to only include articles " + f"published BEFORE {date_str} (the question's close date) to " + f"avoid false positives from events that occurred after the " + f"question's context window. Returns ~16 article summaries with " + f"title, summary, URL, and publication date." + ) + else: + description = ( + "Search international news articles using AskNews. " + "Returns ~16 article summaries with title, summary, URL, and " + "publication date. Can search international news from other " + "languages." + ) + + @agent_tool(description_override=description) + async def query_asknews_date_filtered(topic: str) -> str: + logger.info( + f"TOOL: Querying AskNews for topic: {topic}" + + (f" (before {date_str})" if cutoff_date else "") + ) + searcher = AskNewsSearcher() + if cutoff_date is not None: + return await searcher.get_formatted_news_before_date_async( + topic, cutoff_date + ) + else: + return await searcher.get_formatted_news_async(topic) + + return query_asknews_date_filtered + + @agent_tool async def perplexity_reasoning_pro_search(query: str) -> str: """ diff --git a/forecasting_tools/helpers/asknews_searcher.py b/forecasting_tools/helpers/asknews_searcher.py index d780b9b5..331e9a52 100644 --- a/forecasting_tools/helpers/asknews_searcher.py +++ b/forecasting_tools/helpers/asknews_searcher.py @@ -3,6 +3,7 @@ import asyncio import logging import os +from datetime import datetime, timedelta, timezone from typing import Literal from asknews_sdk import AsyncAskNewsSDK @@ -132,6 +133,84 @@ async def get_formatted_news_async(self, query: str) -> str: self.cache.set(query, formatted_articles) return formatted_articles + 
async def get_formatted_news_before_date_async( + self, + query: str, + before_date: datetime, + ) -> str: + """Search AskNews for articles published before a specific date. + + This is designed for resolution contexts where news published after + a question's close date should not influence the resolution. It uses + the AskNews ``end_timestamp`` parameter filtered on ``pub_date`` to + ensure only articles published before the cutoff are returned. + + Args: + query: Natural language search query. + before_date: Only return articles published before this datetime. + + Returns: + Formatted string of matching news articles. + """ + cache_key = f"{query}__before__{before_date.isoformat()}" + cached_result = self.cache.get(cache_key) + if cached_result is not None: + logger.info(f"Found cached result for date-filtered query: {query}") + return cached_result + + end_ts = int(before_date.timestamp()) + now_utc = datetime.now(timezone.utc) + cutoff_is_recent = (now_utc - before_date) < timedelta(hours=48) + + async with AsyncAskNewsSDK( + client_id=self.client_id, + client_secret=self.client_secret, + api_key=self.api_key, + scopes=set(["news"]), + ) as ask: + # If cutoff is within the last 48h, also search the hot database + hot_articles = [] + if cutoff_is_recent: + hot_response = await ask.news.search_news( + query=query, + n_articles=6, + return_type="both", + strategy="default", + end_timestamp=end_ts, + time_filter="pub_date", + historical=False, + ) + hot_articles = hot_response.as_dicts or [] + await asyncio.sleep(self._default_rate_limit) + + # Search the historical archive (back to 2023) + historical_response = await ask.news.search_news( + query=query, + n_articles=10, + return_type="both", + strategy="default", + end_timestamp=end_ts, + time_filter="pub_date", + historical=True, + ) + historical_articles = historical_response.as_dicts or [] + + formatted_articles = ( + f"Here are the relevant news articles " + f"(filtered to articles published before " + 
f"{before_date.strftime('%Y-%m-%d')}):\n\n" + ) + + if hot_articles: + formatted_articles += self._format_articles(hot_articles) + if historical_articles: + formatted_articles += self._format_articles(historical_articles) + if not hot_articles and not historical_articles: + formatted_articles += "No articles were found.\n\n" + + self.cache.set(cache_key, formatted_articles) + return formatted_articles + def _format_articles(self, articles: list[SearchResponseDictItem]) -> str: formatted_articles = "" sorted_articles = sorted(articles, key=lambda x: x.pub_date, reverse=True) From 588340bc65e923cca0083bda824f8fa2595e539b Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Sun, 1 Mar 2026 20:28:33 -0700 Subject: [PATCH 11/14] Addeding latest report --- .../reports/tui_report_20260301_202819.md | 1192 +++++++++++++++++ 1 file changed, 1192 insertions(+) create mode 100644 forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md diff --git a/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md b/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md new file mode 100644 index 00000000..91b8f615 --- /dev/null +++ b/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md @@ -0,0 +1,1192 @@ +# Auto Resolver Assessment Report + +**Generated:** 2026-03-01 20:28:19 + +## Summary + +| Actual \ Predicted | Positive | Negative | Cancelled | Not Answered | +|--------------------|----------|----------|-----------|--------------| +| Positive | 13 | 0 | 1 | 0 | +| Negative | 0 | 29 | 0 | 0 | +| Cancelled | 3 | 2 | 1 | 0 | + +**Total Questions:** 49 +**Correct Predictions:** 43 (87.8%) +**Total Cost:** $7.9477 + +## Detailed Results + + +### Question 39056 + +**Title:** [PRACTICE] Will Shigeru Ishiba cease to be Prime Minister of Japan before September 2025? 
+**URL:** https://www.metaculus.com/questions/39056 + +**Question Contents:** + +> [PRACTICE] Will Shigeru Ishiba cease to be Prime Minister of Japan before September 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | CanceledResolution.ANNULLED | + +**Cost:** $0.1234 + +**Key Evidence:** +- Ishiba announced his resignation on September 7, 2025, which is after the resolution cutoff of September 1, 2025. +- During July and August 2025, Ishiba actively denied resignation reports, calling them 'completely unfounded,' and stated he would focus on tariff negotiations with the U.S. +- The LDP performed poorly in the House of Councillors election on July 20, 2025, generating calls for resignation, but Ishiba resisted throughout July and August 2025. +- After completing U.S.-Japan trade deal negotiations in early September 2025, internal LDP pressure intensified, leading to his resignation announcement on September 7, 2025. +- Sanae Takaichi succeeded Ishiba and took office on approximately October 21, 2025, confirming Ishiba remained PM through at least September 7, 2025. + +--- + +### Question 39621 + +**Title:** Will Sam Altman Use Microsoft Teams before 2026? +**URL:** https://www.metaculus.com/questions/39621 + +**Question Contents:** + +> Will Sam Altman Use Microsoft Teams before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | CanceledResolution.ANNULLED | + +**Cost:** $0.1739 + +**Key Evidence:** +- No news articles, official statements, or credible reports document Altman using Microsoft Teams across multiple searches. +- Sam Altman was fired from OpenAI on November 17, 2023, via a Google Meet call — not Microsoft Teams — which generated social media jokes about Teams being snubbed. 
+- Altman was hired by Microsoft on November 20, 2023, but returned to OpenAI as CEO just days later (November 21, 2023), before ever meaningfully starting his Microsoft role. +- Searches for 'Sam Altman' + 'Microsoft Teams' return only social media jokes and irony commentary about OpenAI using Google Meet instead, with zero credible reports of him actually using Teams. +- The original question appears to be asking a humorous/ironic question prompted by the Google Meet firing, and no resolution evidence (YES) has been documented. + +--- + +### Question 39387 + +**Title:** Will diplomatic engagement between the U.S. and China on nuclear risk reduction increase in 2025? +**URL:** https://www.metaculus.com/questions/39387 + +**Question Contents:** + +> Will diplomatic engagement between the U.S. and China on nuclear risk reduction increase in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | CanceledResolution.ANNULLED | + +**Cost:** $0.1398 + +**Key Evidence:** +- A Track II dialogue was held in Shanghai, July 7–11, 2025, organized by Harvard Kennedy School's Project on Managing the Atom, the American Academy of Arts and Sciences, and China's Shanghai Academy of Social Sciences, explicitly addressing nuclear risk reduction. +- The American Academy of Arts and Sciences officially acknowledged the engagement, publishing a winter 2026 bulletin article titled 'Opportunities and Challenges: U.S.-China Nuclear Arms Control and Risk Reduction.' +- The July 2025 Shanghai dialogue explicitly addressed nuclear risk reduction, strategic stability, nuclear forces and policies, no-first-use commitments, nonproliferation, and risk-reduction mechanisms. +- The Trump administration pursued a multilateral approach to nuclear arms control throughout 2025, engaging both Russia and China, with Trump discussing nuclear weapons reduction talks with China. 
+- China did not rule out bilateral nuclear risk reduction talks with the U.S., and a Trump-Xi meeting occurred in November 2025 in South Korea, which included broader diplomatic engagement. + +--- + +### Question 39460 + +**Title:** In 2025, will IRA's Advanced Manufacturing Production Tax Credits (45Χ) start requiring the use of d +**URL:** https://www.metaculus.com/questions/39460 + +**Question Contents:** + +> In 2025, will IRA's Advanced Manufacturing Production Tax Credits (45Χ) start requiring the use of domestic materials or subcomponents? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | CanceledResolution.ANNULLED | + +**Cost:** $0.1806 + +**Key Evidence:** +- The One Big Beautiful Bill Act (H.R. 1, Public Law 119-21) was signed into law on July 4, 2025 — before January 1, 2026 +- The OBBBA introduced restrictions prohibiting 'material assistance from any prohibited foreign entity' for eligible components, effective for tax years beginning after July 4, 2025 +- The OBBBA established a Material Assistance Cost Ratio (MACR) calculation framework that restricts foreign-sourced materials +- For tax years beginning after December 31, 2026, at least 65% of the direct material cost of an integrated component must come from primary components manufactured in the U.S. +- The resolution criteria fine print states 'Any requirement for domestic materials or subcomponents will resolve this question as Yes' + +--- + +### Question 39600 + +**Title:** Will Maersk resume shipping in the Red Sea in 2025? +**URL:** https://www.metaculus.com/questions/39600 + +**Question Contents:** + +> Will Maersk resume shipping in the Red Sea in 2025? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | CanceledResolution.AMBIGUOUS | + +**Cost:** $0.0966 + +**Key Evidence:** +- On December 18–19, 2025, Maersk's vessel Maersk Sebarok successfully completed a transit through the Bab el-Mandeb Strait and the Red Sea, marking the company's first container ship passage through the Red Sea since early 2024. +- Maersk confirmed the transit on December 22, 2025, via an official announcement on maersk.com, calling it 'a significant step forward.' +- This was part of Maersk's MECL service (Middle East Container Line) and represented a formal (if cautious) change to its routing policy. +- In early 2025, Maersk and Hapag-Lloyd through their Gemini Cooperation planned to route one shared IMX service through the Red Sea and Suez Canal, though by late February 2025 some sailings had to be rerouted back around the Cape of Good Hope. +- After the successful December transit, a subsequent transit by the U.S.-flagged Maersk Denver occurred in January 2026, and Maersk announced a 'structural return' of MECL to the trans-Suez route on January 15, 2026. + +--- + +### Question 39385 + +**Title:** Will real-money betting on Polymarket or Manifold become legal in the United States in 2025? +**URL:** https://www.metaculus.com/questions/39385 + +**Question Contents:** + +> Will real-money betting on Polymarket or Manifold become legal in the United States in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| CanceledResolution.AMBIGUOUS | CanceledResolution.ANNULLED | + +**Cost:** $0.2094 + +**Key Evidence:** +- The synced Metaculus question (post_id 31340, question_id 30969) resolved as Ambiguous on September 24, 2025, before Polymarket's December 2025 CFTC approval and US launch. +- The question explicitly states: 'This question will resolve to the same value as the synced question.' 
+- Polymarket's US beta launch occurred in December 2025 — after the synced Metaculus question had already closed/resolved. +- Manifold discontinued its real money 'sweepcash' feature on March 28, 2025, and did not pursue legalization in the US. +- A related but separate Metaculus question (question_id 39385, opened October 16, 2025) resolved as Annulled, further supporting ambiguity around this topic. + +--- + +### Question 39580 + +**Title:** Will global average wealth per adult increase in 2024 compared to 2023? +**URL:** https://www.metaculus.com/questions/39580 + +**Question Contents:** + +> Will global average wealth per adult increase in 2024 compared to 2023? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| CanceledResolution.AMBIGUOUS | TRUE | + +**Cost:** $0.3324 + +**Key Evidence:** +- UBS Global Wealth Report (2025): Global wealth grew 4.6% in 2024, following a 4.2% increase in 2023, with growth heavily concentrated in North America (+11%). +- Allianz Global Wealth Report 2025 confirms global net financial assets grew in 2024. +- WID/World Inequality Report data shows global wealth-income ratios rose to over 625% of world net domestic product by 2025, continuing upward trends. +- No direct confirmation was found of the specific WID metric `anweal_p0p100_z` values for 2023 and 2024 in search results. +- More than half of the 56 markets sampled by UBS did not participate in global wealth growth in 2024 and actually saw their average wealth per adult decline in USD real terms, which could affect the median differently from the mean. 
+ +--- + +### Question 39355 + +**Title:** Will a federal court rule that the Trump administration violated the Impoundment Control Act in atte +**URL:** https://www.metaculus.com/questions/39355 + +**Question Contents:** + +> Will a federal court rule that the Trump administration violated the Impoundment Control Act in attempting to permanently withhold or cancel congressionally allocated funds by September 27, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2257 + +**Key Evidence:** +- All federal court rulings related to the Trump administration's impoundment of funds before the deadline were preliminary in nature (TROs or preliminary injunctions), which the resolution criteria explicitly excludes. +- On August 13, 2025, the D.C. Circuit Court of Appeals ruled that humanitarian groups lacked standing (only the U.S. Comptroller General could challenge an impoundment under the ICA) — a procedural ruling, not a violation finding. +- On September 26, 2025, the Supreme Court stayed Judge Ali's August order, allowing the Trump administration to withhold ~$4 billion in foreign aid, and explicitly stated this was not a final determination on the merits. +- The GAO issued findings on July 23, 2025 that the Trump administration violated the ICA by withholding Head Start program funds and Renew America's Schools Program funds, but the GAO is a legislative branch agency — not a federal court — so these findings do not satisfy the resolution criteria. +- Judge Ali's August 2025 expanded preliminary injunction found the freeze 'likely violated both federal law and the Constitution,' but this was still a preliminary ruling and does not qualify as a formal decision on the merits. + +--- + +### Question 39359 + +**Title:** Will 2025 be the warmest year on record globally, according to the ECMWF? 
+**URL:** https://www.metaculus.com/questions/39359 + +**Question Contents:** + +> Will 2025 be the warmest year on record globally, according to the ECMWF? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0527 + +**Key Evidence:** +- 2025 was the third-warmest year on record according to ECMWF, NOT the warmest. +- 2025 global surface air temperatures were 1.47°C above pre-industrial levels (1850–1900). +- 2025 was 0.01°C cooler than 2023 and 0.13°C cooler than 2024, with 2024 remaining the hottest year on record. +- ECMWF published this finding at: https://www.ecmwf.int/en/about/media-centre/news/2025/2025-third-warmest-year +- NOAA also confirmed 2025 as the third-warmest year globally. + +--- + +### Question 39381 + +**Title:** In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Investment +**URL:** https://www.metaculus.com/questions/39381 + +**Question Contents:** + +> In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Investment Tax Credits (48E)? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2685 + +**Key Evidence:** +- The One Big Beautiful Bill Act (OBBBA) was signed into law on July 4, 2025, which is before January 1, 2026, but did NOT make domestic content mandatory for all 48E ITC claimants. +- For standard taxpayers claiming the ITC, domestic content compliance remains voluntary/optional, and meeting the requirements simply unlocks a bonus credit (10 or 2 percentage points). +- For direct pay ('applicable') entities (tax-exempt orgs, state/local governments, tribal governments, rural cooperatives), there are mandatory domestic content requirements or 'haircut' provisions, but this was already the case under the original IRA for projects ≥1 MW. 
+- The resolution criteria's fine print states: 'The domestic requirements becoming mandatory for ITC only for specific categories of projects will not resolve this question.' +- No legislation enacted before January 1, 2026 converted domestic content requirements into a universal eligibility prerequisite for the 48E ITC for all taxpayers. + +--- + +### Question 39384 + +**Title:** Will Russia have significantly expanded its controlled territory in Ukraine on January 1, 2026? +**URL:** https://www.metaculus.com/questions/39384 + +**Question Contents:** + +> Will Russia have significantly expanded its controlled territory in Ukraine on January 1, 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1845 + +**Key Evidence:** +- All five 'must control all' locations remain under Russian de facto control as of January 1, 2026 (Simferopol, Nova Kakhovka, Svatove, Donetsk, Tokmak). +- Kharkiv: Russian forces were operating in northern Kharkiv Oblast but did not control the city as of January 2026. +- Zaporizhzhia: Russian forces advanced ~18 km south of the city but did not control the city. +- Sumy: Russian forces captured small border villages near Sumy Oblast but did not control the city. +- Odesa, Mykolaiv, Lozova, and Chernihiv showed no evidence of Russian control. + +--- + +### Question 39389 + +**Title:** Will at least 500,000 Palestinians be displaced from Gaza before December 31st, 2025? +**URL:** https://www.metaculus.com/questions/39389 + +**Question Contents:** + +> Will at least 500,000 Palestinians be displaced from Gaza before December 31st, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2153 + +**Key Evidence:** +- According to the Palestinian Central Bureau of Statistics, approximately 100,000 Palestinians were forced to leave Gaza (cross the border entirely) since October 2023. 
+- Estimates of ~110,000 Palestinians fled to Egypt as refugees/asylum seekers. +- The Rafah crossing was largely closed after Israel occupied the Palestinian side in May 2024, severely limiting departures after that point. +- ~1.9 million Palestinians were internally displaced within Gaza, but this is movement within Gaza and does not count toward the threshold. +- The Gaza population declined by ~254,000 people (10.6%) by end of 2025, reflecting deaths, emigration, and other factors combined — far short of indicating 500,000 external departures. + +--- + +### Question 39406 + +**Title:** At the end of 2025, will any of GiveWell's top charities perform mass deworming as their primary int +**URL:** https://www.metaculus.com/questions/39406 + +**Question Contents:** + +> At the end of 2025, will any of GiveWell's top charities perform mass deworming as their primary intervention? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.3005 + +**Key Evidence:** +- GiveWell's four top charities as of January 1, 2026 are: Against Malaria Foundation (insecticide-treated nets), Malaria Consortium (seasonal malaria chemoprevention), Helen Keller International (vitamin A supplementation), and New Incentives (conditional cash transfers for vaccinations in Nigeria). +- Evidence Action's Deworm the World Initiative was previously a GiveWell top charity, but is no longer; GiveWell does not accept donations designated for this charity. +- Sightsavers receives GiveWell exit funding through ~March 2027 for deworming, but is not a top charity. +- The alternative resolution path (10% of regrants going to deworming) does not apply because GiveWell still maintains a top charities list; even if it did apply, deworming grants appear to be well under 10% of GiveWell's total 2025 grantmaking of $418M. +- None of GiveWell's four top charities perform mass deworming as their primary intervention. 
+ +--- + +### Question 39421 + +**Title:** Will the number of UFO sightings reported by the National UFO Reporting Center exceed 350 for Septem +**URL:** https://www.metaculus.com/questions/39421 + +**Question Contents:** + +> Will the number of UFO sightings reported by the National UFO Reporting Center exceed 350 for September 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1118 + +**Key Evidence:** +- September 2025 had 337 UFO sightings reported to NUFORC — well below the 350 threshold required for a 'Yes' resolution. +- The resolution criteria requires greater than 350 UFO/UAP sighting reports for September 2025. +- For comparison, August 2025 just barely exceeded 350 with 351 reports, while September fell short. +- The data is sourced directly from the NUFORC national event index (nuforc.org/ndx/?id=event), which is the exact source specified in the resolution criteria. +- October 2025 had 415 reports and July 2025 had 381, showing September 2025's 337 was notably lower than surrounding months. + +--- + +### Question 39423 + +**Title:** Will Boeing file for bankruptcy before January 1, 2026? +**URL:** https://www.metaculus.com/questions/39423 + +**Question Contents:** + +> Will Boeing file for bankruptcy before January 1, 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0528 + +**Key Evidence:** +- Boeing has not filed for bankruptcy at any point in 2024 or 2025 and continues to operate as a going concern. +- Boeing faced significant headwinds including the 2024 machinists' strike (which cost ~$5 billion), a 737 MAX door plug blowout incident in January 2024, production delays, FAA restrictions, and mounting debt, but no filing occurred. +- Boeing continued reporting financial results through Q4 2025 and completed acquisitions (Spirit AeroSystems) and strategic divestitures in 2025. 
+- CEO Kelly Ortberg stated the company 'made significant progress on our recovery in 2025.' +- Boeing's consolidated debt stands at ~$54.1 billion as of end of 2025 — heavy, but being managed. + +--- + +### Question 39427 + +**Title:** Will Bitcoin close 2025 higher than it started? +**URL:** https://www.metaculus.com/questions/39427 + +**Question Contents:** + +> Will Bitcoin close 2025 higher than it started? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1168 + +**Key Evidence:** +- Resolution Threshold: $93,429.20 (Bitcoin's price at the start of 2025) +- December 31, 2025 Closing Price: Approximately $87,500–$88,430 — below the threshold +- CoinMarketCap data indicates a closing price of ~$88,430 on December 31, 2025 +- StatMuse data corroborates a closing price of ~$87,502 on December 31, 2025 +- Bitcoin fell approximately 19.1% from September 1 to December 31, 2025, resulting in a year-end price well below the starting price + +--- + +### Question 39430 + +**Title:** Will ARC find that GPT-5 has autonomous replication capabilities? +**URL:** https://www.metaculus.com/questions/39430 + +**Question Contents:** + +> Will ARC find that GPT-5 has autonomous replication capabilities? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2907 + +**Key Evidence:** +- METR (the successor to ARC Evals) evaluated GPT-5 and concluded GPT-5 is NOT capable of autonomous replication and does not pose catastrophic risks via rogue replication threat models. +- GPT-5's 50%-time horizon was measured at 2 hours 17 minutes, far below the concerning threshold of ~40 hours, with even upper-bound uncertainty estimates (~4.5 hours) well below what would trigger concern. +- GPT-5 was released on August 7, 2025, satisfying the timeline requirement for the question. 
+- There is no evidence that ARC (the original organization) published any evaluation of GPT-5 specifically; the evaluation work has entirely migrated to METR. +- No evaluation from either ARC or METR found GPT-5 to be capable of autonomous replication, making a Yes/TRUE resolution unsupported by current evidence. + +--- + +### Question 39441 + +**Title:** Will SpaceX launch the Starship three or more times in the second half of 2025? +**URL:** https://www.metaculus.com/questions/39441 + +**Question Contents:** + +> Will SpaceX launch the Starship three or more times in the second half of 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1607 + +**Key Evidence:** +- Flight 10 launched on August 26, 2025 at 23:30 UTC — clearly within H2 2025 +- Flight 11 launched on October 13, 2025 at 23:23 UTC — clearly within H2 2025 +- Flight 12 did NOT launch in H2 2025 — it is targeted for March 2026 +- No other Starship launches occurred between July 1 and December 31, 2025 +- Only 2 Starship launches occurred in the second half of 2025; the threshold of 3 or more launches was NOT met + +--- + +### Question 39453 + +**Title:** Will the number of deaths from antibiotic-resistant infections per year double by December 31, 2025? +**URL:** https://www.metaculus.com/questions/39453 + +**Question Contents:** + +> Will the number of deaths from antibiotic-resistant infections per year double by December 31, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1773 + +**Key Evidence:** +- 2019 CDC Threats Report: ~35,000 deaths/year (the most comprehensive and most cited figure); ~48,000 when C. 
difficile is included +- 2013 CDC Report: ~23,000 deaths/year from antibiotic-resistant infections in the US (later revised upward to ~44,000 using improved methodology) +- July 2024 CDC update (2021-2022 data): Documented a 20% rise in hospital-onset infections, but did not provide a new comprehensive national death toll +- 2025 CDC report on NDM-CRE: Documented a 460% surge in one specific pathogen between 2019-2023, but this is not a comprehensive mortality estimate +- No report found — from the CDC or any other credible source — citing 70,000 or more annual US deaths from antibiotic-resistant infections, at any point through 2025 + +--- + +### Question 39463 + +**Title:** Will 1 Euro be worth less than 1 US Dollar before 2026? +**URL:** https://www.metaculus.com/questions/39463 + +**Question Contents:** + +> Will 1 Euro be worth less than 1 US Dollar before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1372 + +**Key Evidence:** +- The lowest EUR/USD rate in 2024 was 1.0350, recorded on December 31, 2024. The euro never fell to or below parity (1.0000) during 2024. +- The lowest EUR/USD rate in January 2025 was 1.0257, recorded on January 10, 2025. This was the weakest point of 2025 and still remained above parity. +- After the January 2025 low, the EUR/USD rate recovered significantly throughout 2025, reaching approximately 1.13–1.17 range by late 2025, ending the year around 1.1714. +- While analysts in early 2025 warned parity was possible, it never materialized. The floor was approximately 1.0257. +- At no point during the January 1, 2024 – January 1, 2026 window did the EUR/USD exchange rate fall below 1.0000. + +--- + +### Question 39466 + +**Title:** Will Walmart stores in the United States return to 24-hour service before 2026? 
+**URL:** https://www.metaculus.com/questions/39466 + +**Question Contents:** + +> Will Walmart stores in the United States return to 24-hour service before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1970 + +**Key Evidence:** +- Most Walmart locations in the U.S. continued to operate on limited hours (typically 6:00 a.m. to midnight or similar schedules), rather than around the clock, as of January 2026. +- In August 2022, Walmart stated it had no plans to return to 24/7 operations, citing labor shortages, and no subsequent announcements reversed this policy through the resolution date. +- On December 24, 2025, Walmart stores operated only from 6:00 a.m. to 6:00 p.m., and all 4,600+ U.S. Walmart stores were completely closed on December 25, 2025. +- No official announcement or credible reporting confirmed a return to majority 24/7 operations as of January 2026. +- Store hours remained limited (not 24/7) throughout 2024 and 2025, with staffing shortages and higher operational costs cited as reasons for maintaining limited hours. + +--- + +### Question 39473 + +**Title:** Will Ukraine launch a nuclear weapons program before 2026? +**URL:** https://www.metaculus.com/questions/39473 + +**Question Contents:** + +> Will Ukraine launch a nuclear weapons program before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1384 + +**Key Evidence:** +- Ukraine's Foreign Ministry repeatedly reaffirmed its commitment to the Nuclear Non-Proliferation Treaty (NPT) and stated it does not intend to acquire nuclear weapons. +- President Zelensky stated in February 2026: 'Unfortunately, there are no nuclear weapons in Ukraine.' +- The IAEA maintained continuous monitoring at Ukraine's nuclear sites throughout 2025, with no findings of weapons-related activities. 
+- All highly enriched uranium was removed from Ukraine by 2012, and IAEA safeguards remain in place at all Ukrainian nuclear sites. +- Statements about Ukraine's nuclear capability (e.g., a Ukrainian think tank in November 2024 and a 2024 Ministry of Defence report) referred to technical capability assessments, not an active program. + +--- + +### Question 39489 + +**Title:** In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Production +**URL:** https://www.metaculus.com/questions/39489 + +**Question Contents:** + +> In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Production Tax Credits (45Y)? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2259 + +**Key Evidence:** +- The domestic content bonus under Section 45Y is optional, not mandatory — meeting the domestic content thresholds entitles taxpayers to a bonus credit of up to 10% on top of the base credit, but taxpayers can still claim the base 45Y credit without meeting domestic content requirements. +- The One Big Beautiful Bill Act (OBBBA) was signed into law on July 4, 2025, which is before January 1, 2026, but it did NOT make domestic content requirements mandatory for claiming the base 45Y credit. +- The OBBBA introduced separate FEOC (Foreign Entity of Concern) restrictions, which are mandatory exclusions, but these are distinct from the existing IRA domestic content bonus credit requirements. +- The OBBBA also terminated the 45Y credit for wind and solar projects placed in service after December 31, 2027, but kept domestic content as optional bonus credits. +- No other legislation enacted before January 1, 2026 made domestic content requirements mandatory for 45Y credits. + +--- + +### Question 39499 + +**Title:** Will China operate at least one military base in a BRICS country before 2026? 
+**URL:** https://www.metaculus.com/questions/39499 + +**Question Contents:** + +> Will China operate at least one military base in a BRICS country before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2188 + +**Key Evidence:** +- China's only confirmed overseas military base is in Djibouti (operational since 2017), and Djibouti is not a BRICS member. +- Pentagon's 2024 Annual China Report (December 2024) states China 'probably has considered' various countries for PLA military logistics facilities — language of planning/consideration, not confirmation. +- Pentagon's 2025 Annual China Report (December 2025) lists Pakistan, Bangladesh, Cuba, Angola, Indonesia, and others as countries where China has 'likely also considered basing' — again not confirmation of operational bases. +- UAE was assessed by US intelligence as a suspected location (~2020-2022), but no confirmed operational base was established, and construction was reportedly halted. +- No BRICS member countries (Brazil, Russia, India, China, South Africa, Egypt, Ethiopia, Iran, UAE, Saudi Arabia) have a confirmed Chinese military base per any official public announcement. + +--- + +### Question 39520 + +**Title:** Will Argentina's year-over-year inflation be below 30% for December 2025? +**URL:** https://www.metaculus.com/questions/39520 + +**Question Contents:** + +> Will Argentina's year-over-year inflation be below 30% for December 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0542 + +**Key Evidence:** +- Argentina's year-over-year inflation for December 2025 was 31.5%, according to INDEC data cited by Investing.com. +- Itaú BBA Analysis confirmed Argentina's 2025 annual inflation fell to 31.5%, from 117.8% in 2024. +- BBVA Research and the Buenos Aires Herald both reported Argentina's 2025 inflation was 31.5%, the lowest in 8 years. 
+- The question's resolution threshold requires YoY inflation to be strictly less than 30.0%; 31.5% exceeds this threshold. +- Monthly (MoM) inflation for December 2025 was 2.8%, with core inflation at 3.0% MoM. + +--- + +### Question 39539 + +**Title:** Will Israel establish direct governmental control of the Gaza Strip in 2025? +**URL:** https://www.metaculus.com/questions/39539 + +**Question Contents:** + +> Will Israel establish direct governmental control of the Gaza Strip in 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1359 + +**Key Evidence:** +- Israeli military control expanded to approximately 75% of Gaza by mid-2025, but this was de facto military control — which explicitly does not count per the resolution criteria. +- A Gaza peace plan signed on October 9, 2025 established a National Committee for the Administration of Gaza (NCAG) — 15 Palestinian technocrats, not Israelis — headed by Ali Shaath, a former Palestinian Authority deputy minister. +- Israel did not formally annex any part of the Gaza Strip in 2025; annexation debates focused on the West Bank, not Gaza. +- While Defense Minister Israel Katz proposed military settlements in Gaza for 'security needs' in December 2024, this was never formally enacted as law or policy during 2025. +- No formal civilian government headed by Israelis was established over 50% or more of the Gaza Strip during 2025. + +--- + +### Question 39545 + +**Title:** Will an H5 virus receive an "emergence" risk rating categorized as "high" by the US CDC Influenza Ri +**URL:** https://www.metaculus.com/questions/39545 + +**Question Contents:** + +> Will an H5 virus receive an "emergence" risk rating categorized as "high" by the US CDC Influenza Risk Assessment Tool before January 1, 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0963 + +**Key Evidence:** +- The most recent H5N1 IRAT assessment before the deadline was for A/Texas/37/2024, assessed in June 2024, which received an emergence score of 5.79 — well within the moderate risk category (4.0–7.9). +- In March 2025, two new clade 2.3.4.4b viruses were assessed: A/California/147/2024 (emergence score of 5.59) and A/Washington/239/2024 (emergence score of 5.21) — lower than the Texas virus and still in the moderate risk category. +- The highest emergence score ever recorded by the IRAT is 7.50 for an H1N1 swine flu strain (A/swine/Shandong/1207/2016). No influenza virus of any type has ever received a score ≥ 8.0 on IRAT. +- No H5 virus has ever approached the 8.0 threshold, let alone exceeded it. + +--- + +### Question 39559 + +**Title:** Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? +**URL:** https://www.metaculus.com/questions/39559 + +**Question Contents:** + +> Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0898 + +**Key Evidence:** +- As recently as February 24, 2026, Moraes was presiding over the trial of suspects in the Marielle Franco killing, confirming he remains a sitting justice. +- U.S. Magnitsky Act sanctions were imposed in late July 2025, but Moraes publicly declared he would ignore the sanctions and remain in his position — which he did. +- While political groups called for his removal and Elon Musk publicly criticized him, no formal impeachment proceeded to completion. +- Supreme Federal Court justices in Brazil are appointed for life with mandatory retirement at 75; Moraes is 55, far from mandatory retirement. 
+- There is no evidence whatsoever that Alexandre de Moraes ceased to be a minister of Brazil's Supreme Federal Court before the resolution cutoff date of January 1, 2026. + +--- + +### Question 39574 + +**Title:** Will Brazil ban the 6x1 work schedule before 1 January 2026? +**URL:** https://www.metaculus.com/questions/39574 + +**Question Contents:** + +> Will Brazil ban the 6x1 work schedule before 1 January 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0570 + +**Key Evidence:** +- A proposed constitutional amendment (PEC 8/2025) to eliminate the 6x1 work schedule has advanced through Brazil's legislature, but has not been enacted into law. +- By late 2025, the Senate's Constitution and Justice Committee had approved the amendment proposal — but committee-level approval is not the same as the full legislative enactment required for it to become law. +- The Federal Government publicly declared support for ending the 6x1 model, but verbal/executive support does not constitute an official ban. +- As of early 2026, the legislative timeline projects admissibility analysis completion at end of March 2026, with a potential Chamber of Deputies vote in May 2026. +- Some Brazilian companies have voluntarily transitioned away from the 6x1 schedule, but this is not a legal ban. + +--- + +### Question 39578 + +**Title:** Will the bubble in the Magnificent Seven pop before 2026? +**URL:** https://www.metaculus.com/questions/39578 + +**Question Contents:** + +> Will the bubble in the Magnificent Seven pop before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2524 + +**Key Evidence:** +- None of the Magnificent Seven stocks came close to falling 50% below their all-time high during the resolution window (before January 1, 2026). 
+- The largest declines from ATH were in Microsoft (~29%) and Tesla (~19.5%), still well short of the 50% threshold. +- Most all-time highs for these stocks were set during 2025 — meaning prices were generally rising, not collapsing. +- Some all-time highs were set in late 2025 or early 2026 (e.g., Alphabet's ATH on February 3, 2026), confirming prices were still near historic highs rather than crashing. +- The question's resolution window (before January 1, 2026) has already passed, and there is no evidence that any of these stocks fell 50% from their all-time highs during that period. + +--- + +### Question 39587 + +**Title:** Will Iran possess a nuclear weapon before 2026? +**URL:** https://www.metaculus.com/questions/39587 + +**Question Contents:** + +> Will Iran possess a nuclear weapon before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1783 + +**Key Evidence:** +- U.S. Intelligence Assessment (May 2025): 'Iran almost certainly is not producing nuclear weapons, but Iran has undertaken activities in recent years that better position it to produce them, if it chooses to do so.' +- Arms Control Association fact sheet confirms Iran is NOT a nuclear-weapons state, noting it can produce weapons-grade uranium for multiple bombs within weeks, but has not done so. +- Iran possessed ~972 lbs of 60%-enriched uranium as of mid-June 2025, but never enriched to weapons grade. +- Iran continues to officially deny any intention to develop nuclear weapons, insisting its program is entirely peaceful. +- There is no evidence that Iran announced possession of, or conducted a test of, a nuclear weapon at any point before 2026. + +--- + +### Question 39589 + +**Title:** Will US offer India a nuclear submarine before 2026? +**URL:** https://www.metaculus.com/questions/39589 + +**Question Contents:** + +> Will US offer India a nuclear submarine before 2026? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1760 + +**Key Evidence:** +- All searches confirm there was no formal US offer, agreement, or deal with India regarding nuclear-powered submarines during the resolution period (January 1, 2022 to January 1, 2026). +- The US has consistently maintained that American domestic laws prevent sharing nuclear submarine propulsion technology with any nation outside formal treaty relationships — including India, despite the 2008 US-India Civil Nuclear Agreement. +- The AUKUS nuclear submarine agreement (US, UK, Australia) was not extended to include India. India is not part of AUKUS and no comparable arrangement was offered to India. +- India sealed a deal to lease a nuclear-powered Akula-class submarine (Chakra-III) from Russia in December 2025, not from the US. +- The Modi-Trump joint statement (February 13, 2025) focused exclusively on civil nuclear energy cooperation (reactors), not military/submarine nuclear technology. + +--- + +### Question 39609 + +**Title:** Will evidence of alien technology be recovered near the IM1 crash site before 2026? +**URL:** https://www.metaculus.com/questions/39609 + +**Question Contents:** + +> Will evidence of alien technology be recovered near the IM1 crash site before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.2222 + +**Key Evidence:** +- The Chemical Geology paper (September 2024) by Avi Loeb and the Galileo Project involved four independent laboratories (Harvard, UC Berkeley, Bruker Corporation, and University of Technology in Papua New Guinea), satisfying the ≥3 labs criterion, but did not conclude with 95% confidence that the material was non-terrestrial. +- An October 2024 paper in Research Notes of the AAS (Patricio Gallardo) concluded the spherules match coal ash contaminants, with meteoritic origin 'disfavored.' 
+- A November 2024 preprint (Desch & Jackson) showed iron isotope ratios align with terrestrial fractionation lines, concluding that 'an interstellar origin can be effectively ruled out.' +- Research published in 2024 found the seismic signal used to locate the crash site was likely caused by a truck on a nearby road, not the meteor, meaning the expedition may have searched 100+ miles from the actual impact zone. +- No peer-reviewed study concluded with 95% confidence that the recovered material reflects a technological origin — Loeb's speculations about technological origin were not formal conclusions of any peer-reviewed paper. + +--- + +### Question 39623 + +**Title:** Will 3M file for bankruptcy before 2026? +**URL:** https://www.metaculus.com/questions/39623 + +**Question Contents:** + +> Will 3M file for bankruptcy before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0958 + +**Key Evidence:** +- 3M Company (CIK 66740) did NOT file for bankruptcy under Chapter 11 or Chapter 7 at any point before January 1, 2026. +- Aearo Technologies (subsidiary) filed Chapter 11 on July 26, 2022, but this was a 3M subsidiary, not 3M itself; the bankruptcy court dismissed this filing in June 2023. +- 3M explicitly disclaimed bankruptcy intent in a formal SEC 8-K filing (August 29, 2023): '3M has no present intention to file a petition for relief under the Bankruptcy Code.' +- 3M settled the Combat Arms Earplug lawsuits for $6 billion (announced August 29, 2023), with ~$5.8 billion already paid out to veterans by May 2025. +- 3M remained financially healthy throughout this period: Full-year 2024 sales of $24.6 billion with $4.9 billion in adjusted free cash flow; full-year 2025 sales of $24.9 billion with adjusted EPS of $8.06. + +--- + +### Question 40604 + +**Title:** Will Iran allow the IAEA to inspect its stockpile of enriched uranium before 2026? 
+**URL:** https://www.metaculus.com/questions/40604 + +**Question Contents:** + +> Will Iran allow the IAEA to inspect its stockpile of enriched uranium before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.1421 + +**Key Evidence:** +- The September 9, 2025 'Cairo agreement' to resume nuclear inspections at the bombed sites was suspended by Iran on September 28, 2025, following UN Security Council reimposition of sanctions via the 'snapback' mechanism. +- By October 20, 2025, Reuters reported that Iran declared the IAEA cooperation deal void/scrapped. +- In November 2025, Iran formally canceled the Cairo agreement after the IAEA Board of Governors demanded full inspector access to all nuclear facilities. +- The IAEA conducted 12 inspection visits in Iran during this period, but none at the bombed sites (Fordow, Natanz, Isfahan). +- As of January 2026, the IAEA stated it could no longer verify Iran's nuclear activities, and satellite imagery showed construction activity at damaged sites with no IAEA inspector access. + +--- + +### Question 40867 + +**Title:** Will the People's Party win an absolute majority in the next regional elections in Extremadura, Spai +**URL:** https://www.metaculus.com/questions/40867 + +**Question Contents:** + +> Will the People's Party win an absolute majority in the next regional elections in Extremadura, Spain? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| FALSE | FALSE | + +**Cost:** $0.0897 + +**Key Evidence:** +- Official results (99.89% counted) show PP won 29 seats with 43.12–43.18% of the vote +- The absolute majority threshold in the 65-seat Assembly of Extremadura is 33 seats +- PP fell 4 seats short of an absolute majority +- Multiple authoritative Spanish sources (RTVE, El País, El Periódico, El Diario) and Wikipedia confirm the 29-seat result +- PP's María Guardiola will need support from Vox (11 seats) or other parties to form a government + +--- + +### Question 39353 + +**Title:** Will Elon Musk be the world's richest person on December 31, 2025? +**URL:** https://www.metaculus.com/questions/39353 + +**Question Contents:** + +> Will Elon Musk be the world's richest person on December 31, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0941 + +**Key Evidence:** +- Bloomberg estimated Elon Musk's net worth at approximately $622–627 billion as of December 29–30, 2025, ranking him #1 on the Bloomberg Billionaires Index. +- SpaceX's private share sale in mid-December 2025 valued the company at $800 billion, adding over $100 billion to Musk's fortune. +- On December 20, 2025, the Delaware Supreme Court reinstated his 2018 Tesla stock options, adding ~$138–139 billion and pushing him past $700 billion. +- Larry Ellison briefly surpassed Musk in September 2025 but finished at #5 by year-end, with no other individual close to Musk's net worth. +- There is no indication of any other individual being co-ranked at #1 alongside Musk, and no indication of Bloomberg Index unavailability on December 31, 2025. + +--- + +### Question 39449 + +**Title:** Will Spain's economy grow more than the Eurozone's between Q3 2024 and Q3 2025? 
+**URL:** https://www.metaculus.com/questions/39449 + +**Question Contents:** + +> Will Spain's economy grow more than the Eurozone's between Q3 2024 and Q3 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.1287 + +**Key Evidence:** +- Spain's year-on-year GDP growth in Q3 2025 was 2.8%, per INE and consistent with Eurostat data. +- The Eurozone's year-on-year GDP growth in Q3 2025 was 1.4%, per official Eurostat figures (published December 5, 2025 release: 2-05122025-ap). +- Spain's 2.8% exceeds the Eurozone's 1.4% by a margin of +1.4 percentage points. +- On a quarterly basis, Spain (0.6%) also outpaced the Eurozone (0.3%), confirming the result across both measurement methods. +- Eurostat published Q3 2025 data in December 2025, before the January 7, 2026 deadline, so the question is not void. + +--- + +### Question 39450 + +**Title:** Will Microsoft stock close 2025 higher than it started? +**URL:** https://www.metaculus.com/questions/39450 + +**Question Contents:** + +> Will Microsoft stock close 2025 higher than it started? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0824 + +**Key Evidence:** +- MSFT closing price on December 31, 2025 (last business day of 2025) was $483.62 (Source: StatMuse) +- MSFT opening price on January 2, 2025 (first trading day) was $422.41 (Source: StatMuse) +- The resolution threshold is $421.50, and $483.62 > $421.50, representing a margin of ~$62.12 (~14.7%) +- No stock splits occurred in 2025 that would require price adjustment +- December 31, 2025 was confirmed to be the last business day of 2025 + +--- + +### Question 39511 + +**Title:** Will transformer derived architectures still be state of the art for language modeling in December 2 +**URL:** https://www.metaculus.com/questions/39511 + +**Question Contents:** + +> Will transformer derived architectures still be state of the art for language modeling in December 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.3846 + +**Key Evidence:** +- OpenAI GPT-4.5, GPT-4o, o1, o3-mini, DeepSeek R1, Qwen2.5, Llama 3, and Gemini 1.5 — all transformer-derived — are the top-cited 2025 models dominating benchmarks. +- The 'de facto standard stack' in 2025 LLMs is pre-norm (RMSNorm) + RoPE + SwiGLU + KV-sharing — all transformer refinements, including MoE extensions. +- Mamba/SSM-based models lag significantly on associative recall, in-context learning, and most standard benchmarks, and have not claimed SOTA on major NLP benchmarks. +- A Metaculus forecast explicitly states 95% probability that transformers would maintain SOTA through late 2025. +- The nlpprogress.com WikiText-2 top entry in December 2025 describes a transformer-derived language model, as evidenced by direct reference in the Metaculus question discussion. + +--- + +### Question 39523 + +**Title:** Will China be able to mass produce humanoid robots by the end of 2025? 
+**URL:** https://www.metaculus.com/questions/39523 + +**Question Contents:** + +> Will China be able to mass produce humanoid robots by the end of 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.1829 + +**Key Evidence:** +- UBTECH reached its 1,000th Walker S2 robot milestone by late 2025, with Omdia confirming 1,000 shipments — the Walker S2 is a single model, meeting the 'identical model' requirement. +- Unitree Robotics shipped over 5,500 humanoid robots in 2025 with total production exceeding 6,500 units; given a small number of primary models (G1 and H1/H1-2), the G1 alone very likely exceeded 1,000 units. +- AgiBot announced the rollout of its 5,000th mass-produced humanoid robot in December 2025, with the A-Series alone reaching 1,742 units. +- China accounted for over 80% of global humanoid robot installations in 2025, with approximately 10,000–13,000 units installed in China out of 12,800–16,000 globally. +- China received over 30,000 orders for humanoid robots in 2025, with over 140 domestic manufacturers releasing more than 330 different models. + +--- + +### Question 39529 + +**Title:** Will the S&P 500 Index increase over the year 2025? +**URL:** https://www.metaculus.com/questions/39529 + +**Question Contents:** + +> Will the S&P 500 Index increase over the year 2025? 
+ +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.1211 + +**Key Evidence:** +- S&P 500 closed at 5,881.63 on the last trading day of 2024 (Dec 31, 2024) +- S&P 500 closed at 6,845.50 on the last trading day of 2025 (Dec 31, 2025) +- Change was +963.87 points (+16.4% price return), with total return including dividends of +17.9% +- The S&P 500 posted its third consecutive year of double-digit gains in 2025 +- Multiple sources (StatMuse, RBC Wealth Management, FT Portfolios) confirm the year-end close above 2024's level + +--- + +### Question 39543 + +**Title:** Will Nvidia stock close 2025 higher than it started? +**URL:** https://www.metaculus.com/questions/39543 + +**Question Contents:** + +> Will Nvidia stock close 2025 higher than it started? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.1206 + +**Key Evidence:** +- Nvidia's closing price on December 31, 2025 was $186.50 (sourced from Digrin / StatMuse) +- The resolution threshold is $134.29, described as Nvidia's starting price for 2025 (the close on the last business day of 2024) +- Dec 31, 2024 closing price was $134.25 (sourced from StatMuse) +- $186.50 exceeds $134.29 by approximately 39% +- No stock splits appear to have occurred in 2025 that would complicate this comparison + +--- + +### Question 39560 + +**Title:** Will the highest-grossing film of 2025 be part of a franchise? +**URL:** https://www.metaculus.com/questions/39560 + +**Question Contents:** + +> Will the highest-grossing film of 2025 be part of a franchise? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.1629 + +**Key Evidence:** +- Ne Zha 2 is the highest-grossing film of 2025 worldwide with approximately $2.0–2.26 billion in total worldwide gross, confirmed by Box Office Mojo and The Numbers. 
+- Ne Zha 2 is the direct sequel to Ne Zha (2019), which earned over $742 million worldwide, forming a two-part animated franchise based on the Chinese mythological character Nezha. +- Both films are directed by the same director (Jiaozi/Yu Yang) and Ne Zha 2 picks up immediately where the original ended, making it unambiguously a franchise film. +- The data is confirmed by Box Office Mojo, satisfying the specified reliable source requirement in the resolution criteria. + +--- + +### Question 39569 + +**Title:** Will the IBEX 35 close 2025 higher than it started? +**URL:** https://www.metaculus.com/questions/39569 + +**Question Contents:** + +> Will the IBEX 35 close 2025 higher than it started? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0449 + +**Key Evidence:** +- IBEX 35 opening value (start of 2025): 11,595.00 +- IBEX 35 closing value (December 31, 2025): ~17,315 points according to Trading Economics +- Year-over-year gain of ~49.3% (nearly 50%) +- The closing value of ~17,315 is dramatically higher than the resolution threshold of 11,595.00 +- A gain of nearly 50% for the full year 2025 is a milestone not seen since 1993 + +--- + +### Question 40210 + +**Title:** Will the United States bailout Argentina before 2026? +**URL:** https://www.metaculus.com/questions/40210 + +**Question Contents:** + +> Will the United States bailout Argentina before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0933 + +**Key Evidence:** +- On October 9, 2025, the Trump administration announced a $20 billion Exchange Stabilization Agreement (ESA) between the U.S. Treasury's Exchange Stabilization Fund (ESF) and Argentina's Central Bank (BCRA). +- In October 2025, the BCRA drew $2.5 billion from this swap line, exchanging Argentine pesos for U.S. dollars. +- The U.S. 
also provided an additional $872 million in dollar liquidity via SDR transactions to help Argentina make IMF payments. +- By December 2025, Argentina fully repaid the $2.5 billion draw, and the swap transaction was closed. +- Treasury Secretary Scott Bessent confirmed in January 2026 that Argentina had 'quickly and fully repaid its limited draw.' + +--- + +### Question 40252 + +**Title:** Will the following companies be added to the S&P 500 before 2026? (CVNA) +**URL:** https://www.metaculus.com/questions/40252 + +**Question Contents:** + +> Will the following companies be added to the S&P 500 before 2026? (CVNA) + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0562 + +**Key Evidence:** +- Official announcement from S&P Dow Jones Indices confirmed on December 5, 2025 that Carvana (CVNA) would be added to the S&P 500 +- The addition took effect on December 22, 2025 (before market open) +- Both the announcement AND the actual addition occurred before January 1, 2026, which is the resolution deadline +- Carvana joined alongside CRH and Comfort Systems USA (FIX) as part of the index's quarterly rebalancing +- The announcement came directly from S&P Dow Jones Indices' press release, which is the authoritative source per the resolution criteria + +--- + +### Question 40283 + +**Title:** Will Hamas no longer have any hostages before 2026? +**URL:** https://www.metaculus.com/questions/40283 + +**Question Contents:** + +> Will Hamas no longer have any hostages before 2026? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0917 + +**Key Evidence:** +- All 20 living hostages released on October 13, 2025 — well before the January 1, 2026 deadline. +- A peace deal was signed by Israel and Hamas on October 9, 2025, in Sharm el-Sheikh, Egypt. 
+- On October 13, 2025, all 20 surviving living hostages were released in two batches (7 in Gaza City at 8 a.m., and 13 in southern Gaza at ~11 a.m.), transferred via the Red Cross to Israeli custody. +- No living hostages remained in Hamas custody after October 13, 2025 — prior to the January 1, 2026 deadline. +- Israel released ~2,000 Palestinian prisoners in exchange, including 250 life-sentence prisoners and ~1,700 detainees. + +--- + +### Question 40285 + +**Title:** Will Ukraine extend martial law beyond November 5, 2025? +**URL:** https://www.metaculus.com/questions/40285 + +**Question Contents:** + +> Will Ukraine extend martial law beyond November 5, 2025? + +| Output Resolution | Correct Resolution | +|-------------------|--------------------| +| TRUE | TRUE | + +**Cost:** $0.0489 + +**Key Evidence:** +- Ukraine's parliament voted to extend martial law for 90 days starting November 5, 2025, with 317 votes in favor. +- Parliament subsequently approved another 90-day extension of both martial law and general mobilization, now set to expire May 4, 2026. +- Martial law was originally declared on February 24, 2022, in response to the Russian invasion and has been extended systematically every 90 days. +- Sources include Ukrinform, Anadolu Agency, and NV Ukraine (English) confirming the extensions. +- Ukraine's constitution prohibits national elections while martial law is in effect, providing additional context for the ongoing extensions. 
+ +--- From 6154eb2306291fa80af69c534c02c973c7f906ac Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Mon, 9 Mar 2026 19:16:49 -0600 Subject: [PATCH 12/14] Added a resolver for annulled/ambiguous specificity, also fixed some error logging --- .../auto_resolver/agentic/__init__.py | 18 +- .../auto_resolver/agentic/instructions.py | 181 ++++++++++++++++++ .../auto_resolver/tui/__main__.py | 11 +- .../agents_and_tools/auto_resolver/tui/app.py | 131 +------------ 4 files changed, 199 insertions(+), 142 deletions(-) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py index 1021f614..f504607f 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py @@ -104,7 +104,8 @@ async def _resolve_binary( # Create agents researcher = self._create_researcher(question) - resolver = self._create_resolver_agent(question) + annulled_ambiguous_resolver = self._create_annulled_ambiguous_resolver_agent(question) + resolver = self._create_resolver_agent(question, annulled_ambiguous_resolver) orchestrator = self._create_orchestrator_agent(researcher, resolver) # Run the workflow (non-streaming) @@ -192,13 +193,23 @@ def _create_researcher(self, question: BinaryQuestion) -> AiAgent: handoffs=[], ) - def _create_resolver_agent(self, question: BinaryQuestion) -> AiAgent: + def _create_resolver_agent(self, question: BinaryQuestion, annulled_ambiguous_resolver: AiAgent) -> AiAgent: instructions = binary_resolver_instructions(question) return AiAgent( name="resolver", instructions=instructions, model=AgentSdkLlm(model=self.model_for_resolver), tools=[], # No tools - only analyzes research + handoffs=[annulled_ambiguous_resolver], # Can hand off to specialized agent + ) + + def _create_annulled_ambiguous_resolver_agent(self, question: BinaryQuestion) -> AiAgent: + instructions = 
annulled_ambiguous_resolver_instructions(question) + return AiAgent( + name="annulled_ambiguous_resolver", + instructions=instructions, + model=AgentSdkLlm(model=self.model_for_resolver), + tools=[], # No tools - only analyzes research handoffs=[], # Terminal agent ) @@ -265,7 +276,8 @@ async def resolve_question_streamed( # Step 2: Create agents yield ("status", "Creating resolution agents...") researcher = self._create_researcher(question) - resolver = self._create_resolver_agent(question) + annulled_ambiguous_resolver = self._create_annulled_ambiguous_resolver_agent(question) + resolver = self._create_resolver_agent(question, annulled_ambiguous_resolver) orchestrator = self._create_orchestrator_agent(researcher, resolver) # Step 3: Run streamed workflow diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py index 265040eb..fe700384 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py @@ -273,6 +273,13 @@ def binary_resolver_instructions(question: BinaryQuestion) -> str: - FALSE: Definitively did NOT happen - NOT_YET_RESOLVABLE: Might still happen or unclear if it happened + # Special Handling for AMBIGUOUS and ANNULLED + + If your initial assessment is AMBIGUOUS or ANNULLED, you MUST hand off + to the annulled_ambiguous_resolver agent for specialized review. This + agent has detailed instructions for distinguishing between these two + complex resolution types. 
+ # Output Format Provide your analysis in the following format: @@ -294,6 +301,180 @@ def binary_resolver_instructions(question: BinaryQuestion) -> str: - Cite specific information from the research - Acknowledge uncertainties when present - Your output will be parsed programmatically, so follow the format exactly + - If you determine AMBIGUOUS or ANNULLED, hand off to the specialized agent + """ + ) + + +def annulled_ambiguous_resolver_instructions(question: BinaryQuestion) -> str: + """Build detailed instructions for distinguishing between ANNULLED and AMBIGUOUS resolutions. + + This agent receives the question and research from the main resolver and makes + a final determination on whether to resolve as ANNULLED or AMBIGUOUS (or potentially + another resolution type if the initial assessment was incorrect). + + Args: + question: The question being resolved + + Returns: + Formatted instruction string + """ + + return clean_indents( + f""" + # Your Role + + You are a specialized resolution analyst focusing exclusively on distinguishing + between ANNULLED and AMBIGUOUS resolutions. You have received this case because + the main resolver determined this question may need to be cancelled. + + # The Question + + {question.give_question_details_as_markdown()} + + # Your Task + + Review the research and determine whether this question should be resolved as: + - ANNULLED (question itself is invalid) + - AMBIGUOUS (question is valid but reality is unclear) + - Or if the initial assessment was incorrect, provide the correct resolution + + # Detailed Resolution Criteria + + ## AMBIGUOUS Resolution + + **Definition**: Reserved for questions where **reality is not clear**. The question + is valid and well-formed, but we cannot determine what actually happened. + + **When to use AMBIGUOUS**: + + ### 1. 
No Clear Consensus + - Conflicting or unclear media reports about what happened + - A data source that was expected to provide resolution information is no longer available + - Multiple reasonable interpretations of the same event + - Insufficient information to arrive at an appropriate resolution + + **Examples of AMBIGUOUS**: + + *Example 1*: "Will Russian troops enter Kyiv, Ukraine before December 31, 2022?" + - It was clear that some Russian troops entered Kyiv, and probable there were more than 100 + - However, there was no clear evidence that could be used to resolve definitively + - Resolution: AMBIGUOUS (reality unclear despite question being valid) + + *Example 2*: "What will the average cost of a ransomware kit be in 2022?" + - Question relied on data from a Microsoft report + - Microsoft's report no longer contained the relevant data + - No alternate sources could be found + - Resolution: AMBIGUOUS (expected data source unavailable) + + ## ANNULLED Resolution + + **Definition**: Reserved for questions where **reality is clear but the question is not**. + The question failed to adequately capture a method for clear resolution. + + **When to use ANNULLED**: + + ### 1. Underspecified Questions + - The question did not clearly describe an appropriate method to resolve + - An outcome occurred that doesn't correspond to the resolution criteria + - No clear direction for how the question resolves in the actual outcome + + **Examples of Underspecified**: + + *Example 1*: "What will Substack's Google Trends index be at end of 2022?" + - Did not clearly specify how Google Trends would be used + - Index value depends on the date range specified in Google Trends + - Resolution: ANNULLED (underspecified methodology) + + *Example 2*: "When will a fusion reactor reach ignition?" + - Did not clearly define what "ignition" means + - Definition varies by researchers and fusion method + - Resolution: ANNULLED (underspecified criteria) + + ### 2. 
Subverted Assumptions + - The question made assumptions about the present or future that were violated + - Expected data sources changed fundamentally (not just unavailable) + - The premise of the question was false + + **Examples of Subverted Assumptions**: + + *Example 1*: "Will a technical problem be identified as the cause of the crash of China Eastern Airlines Flight 5735?" + - Question relied on a future NTSB report + - This was a Chinese incident, unlikely NTSB would publish + - No date specified for when the report must be published + - Resolution: ANNULLED (assumption of NTSB report violated) + + *Example 2*: "What will the Federal Reserve's Industrial Production Index be for November 2021, for semiconductors?" + - Question anticipated possibility of base period changing + - However, the entire methodology changed (not just base period) + - Unwritten assumption of consistent methodology was violated + - Resolution: ANNULLED (methodology assumption violated) + + ### 3. Imbalanced Outcomes and Consistent Incentives + - Binary question lacks clear mechanism for BOTH Yes and No resolution + - Burden of proof tips scales toward one outcome + - Would require unreasonable research to prove negative + - Creates bad incentives for forecasters + + **Examples of Imbalanced Outcomes**: + + *Example 1*: "Will any prediction market cause users to lose at least $1M before 2023?" + - Clear mechanism for Yes: prominent report of incident + - No clear mechanism for No: would require extensive research + - To resolve No would need absurd amount of research + - Creates bad incentives (savvy forecasters know it's Yes or Annulled) + - Resolution: ANNULLED (imbalanced resolution mechanisms) + + *Example 2*: "Will any remaining FTX depositor withdraw any amount of tradeable assets from FTX before 2023?" 
+ - Required knowledge of FTX withdrawal details unavailable to admins + - No real mechanism to resolve as No + - Question could only truly resolve as Yes or be Annulled + - Resolution: ANNULLED (imbalanced outcomes) + + # Decision Framework + + Use this framework to make your determination: + + **Step 1: Is the question valid?** + - Are the resolution criteria clear and complete? + - Are there reasonable methods to resolve as both Yes and No (for binary)? + - Are the assumptions reasonable and not violated? + - If NO to any → Likely ANNULLED + + **Step 2: Is reality clear?** + - Can we determine what actually happened? + - Is there sufficient information available? + - Are there conflicting reports or interpretations? + - If NO to any → Likely AMBIGUOUS + + **Step 3: What went wrong?** + - If the question's design is the problem → ANNULLED + - If reality/information is the problem → AMBIGUOUS + + # Output Format + + Provide your final determination in the following format: + + **Final Resolution**: [ANNULLED or AMBIGUOUS or other resolution type] + + **Primary Reason**: [UNDERSPECIFIED / SUBVERTED_ASSUMPTIONS / IMBALANCED_OUTCOMES / NO_CLEAR_CONSENSUS] + + **Detailed Reasoning**: [3-5 sentences explaining your decision, citing specific evidence] + + **Key Evidence**: + - [Evidence point 1 - specific to this determination] + - [Evidence point 2] + - [Evidence point 3] + - [Evidence point 4 - optional] + - [Evidence point 5 - optional] + + # Important Reminders + + - Focus specifically on the ANNULLED vs AMBIGUOUS distinction + - Cite specific evidence from the research + - Consider the examples above as guidance + - Your decision will be the final resolution + - Be thorough and precise in your reasoning """ ) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py index 0d32f108..ea8d60c2 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py +++ 
b/forecasting_tools/agents_and_tools/auto_resolver/tui/__main__.py @@ -9,7 +9,6 @@ """ import argparse -import logging from dotenv import load_dotenv @@ -54,14 +53,7 @@ def _parse_tournament_id(value: str) -> int | str: ) args = parser.parse_args() - log_level = logging.DEBUG if args.verbose else logging.WARNING - - # Do NOT call logging.basicConfig() here -- it would install a - # StreamHandler on sys.stderr which corrupts the Textual TUI display. - # Instead, set the root logger level and let the App install a custom - # handler that routes records into a TUI log panel. - logging.getLogger().setLevel(log_level) - + from forecasting_tools.agents_and_tools.auto_resolver.tui.app import ( AutoResolverApp, ) @@ -70,7 +62,6 @@ def _parse_tournament_id(value: str) -> int | str: max_concurrency=args.concurrency, initial_tournaments=args.tournament, initial_questions=args.question, - log_level=log_level, ) app.run() diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py index 1439e313..90ab7241 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py @@ -3,7 +3,6 @@ from __future__ import annotations import asyncio -import logging import random import sys from datetime import datetime @@ -48,81 +47,6 @@ AddIdRequested, InputModal, ) - -logger = logging.getLogger(__name__) - - -# --------------------------------------------------------------------------- -# Logging / stderr capture helpers -# --------------------------------------------------------------------------- - -_LOG_LEVEL_STYLES: dict[int, tuple[str, str]] = { - logging.DEBUG: ("dim", "dim"), - logging.INFO: ("", ""), - logging.WARNING: ("yellow", "yellow"), - logging.ERROR: ("bold red", "bold red"), - logging.CRITICAL: ("bold white on red", "bold white on red"), -} - - -class TuiLoggingHandler(logging.Handler): - """Routes Python log records into the TUI 
:class:`LogPanel`. - - Uses :meth:`App.call_from_thread` so that records emitted from - background threads (litellm cost callbacks, ``asyncio.to_thread`` - workers, etc.) are safely delivered to the Textual event loop. - """ - - def __init__(self, app: "AutoResolverApp") -> None: - super().__init__() - self._app = app - - def emit(self, record: logging.LogRecord) -> None: - try: - msg = self.format(record) - self._app.call_from_thread( - self._app._append_log, msg, record.levelno - ) - except Exception: - self.handleError(record) - - -class _StderrCapture: - """File-like replacement for ``sys.stderr`` that routes writes into the - TUI :class:`LogPanel` instead of the real terminal stderr. - - Any write that fails (e.g. the app is not yet mounted or is shutting - down) falls back to the real stderr so output is never silently lost. - """ - - def __init__(self, app: "AutoResolverApp") -> None: - self._app = app - self._real_stderr = sys.__stderr__ - - def write(self, text: str) -> int: - if not text or not text.strip(): - return len(text) if text else 0 - try: - self._app.call_from_thread( - self._app._append_log, text.rstrip(), logging.WARNING - ) - except Exception: - if self._real_stderr: - self._real_stderr.write(text) - return len(text) - - def flush(self) -> None: - pass - - # Some libraries check for these attributes on file-like objects. - @property - def encoding(self) -> str: - return "utf-8" - - def isatty(self) -> bool: - return False - - class AutoResolverApp(App): """TUI application for interactive agentic question resolution. 
@@ -163,8 +87,7 @@ def __init__( max_concurrency: int = 3, initial_tournaments: list[int | str] | None = None, initial_questions: list[int] | None = None, - log_level: int = logging.WARNING, - ) -> None: + ) -> None: super().__init__() self._resolver = AgenticResolver() self._client = MetaculusClient() @@ -173,8 +96,6 @@ def __init__( self._concurrency_sem = asyncio.Semaphore(max_concurrency) self._initial_tournaments = initial_tournaments or [] self._initial_questions = initial_questions or [] - self._log_level = log_level - self._tui_log_handler: TuiLoggingHandler | None = None self._original_stderr = sys.stderr self._view_before_logs: Literal["home", "feed"] = "home" @@ -218,28 +139,11 @@ async def on_mount(self) -> None: self.log_panel.display = False self.home_panel.display = True - # Install the TUI logging handler on the root logger so ALL - # Python logging records (ours + third-party) are captured. - self._tui_log_handler = TuiLoggingHandler(self) - self._tui_log_handler.setLevel(self._log_level) - self._tui_log_handler.setFormatter( - logging.Formatter("%(asctime)s %(levelname)-8s %(name)s %(message)s", - datefmt="%H:%M:%S") - ) - root = logging.getLogger() - # Remove any pre-existing stderr handlers that basicConfig may - # have installed before us (defensive). - for h in list(root.handlers): - if isinstance(h, logging.StreamHandler) and getattr(h, "stream", None) is sys.stderr: - root.removeHandler(h) - root.addHandler(self._tui_log_handler) - # Redirect sys.stderr so that non-logging stderr writes (e.g. # from C extensions, Python warnings, litellm debug output) are # also captured instead of corrupting the TUI display. 
self._original_stderr = sys.stderr - sys.stderr = _StderrCapture(self) # type: ignore[assignment] - + for tid in self._initial_tournaments: self._schedule_load_tournament(tid) for qid in self._initial_questions: @@ -315,37 +219,6 @@ def action_toggle_logs(self) -> None: self.feed_panel.display = False self.log_panel.display = True - # ------------------------------------------------------------------ - # Log panel helpers - # ------------------------------------------------------------------ - - def _append_log(self, text: str, level: int = logging.INFO) -> None: - """Append a formatted log line to the :class:`LogPanel`. - - Called from :class:`TuiLoggingHandler` and :class:`_StderrCapture` - via ``call_from_thread``. Must only be called on the Textual - event-loop thread. - """ - open_tag, close_tag = "", "" - for threshold in sorted(_LOG_LEVEL_STYLES, reverse=True): - if level >= threshold: - style_open, _ = _LOG_LEVEL_STYLES[threshold] - if style_open: - open_tag = f"[{style_open}]" - close_tag = f"[/{style_open}]" - break - self.log_panel.append_log(f"{open_tag}{text}{close_tag}") - - def _teardown_logging(self) -> None: - """Remove our logging handler and restore stderr.""" - if self._tui_log_handler is not None: - logging.getLogger().removeHandler(self._tui_log_handler) - self._tui_log_handler = None - sys.stderr = self._original_stderr - - async def on_unmount(self) -> None: - self._teardown_logging() - # ------------------------------------------------------------------ # Message handlers # ------------------------------------------------------------------ From e9dbf008a6c2874f929837b6eb47e8786ab25778 Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Mon, 9 Mar 2026 21:28:42 -0600 Subject: [PATCH 13/14] Final commit for now --- .../auto_resolver/agentic/__init__.py | 164 +++++++++++++++++- .../auto_resolver/agentic/instructions.py | 148 +++++++++++++++- .../agents_and_tools/auto_resolver/assess.py | 61 +++++-- .../auto_resolver/resolution_models.py | 25 
+++ .../agents_and_tools/auto_resolver/tui/app.py | 31 ---- .../auto_resolver/tui/report.py | 28 ++- .../auto_resolver/tui/widgets/feed_panel.py | 17 +- .../auto_resolver/tui/widgets/home_panel.py | 17 +- .../auto_resolver/tui/widgets/log_panel.py | 61 ------- 9 files changed, 423 insertions(+), 129 deletions(-) delete mode 100644 forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py index f504607f..58c7cb4e 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/__init__.py @@ -17,6 +17,8 @@ import logging from typing import AsyncGenerator, Optional, Callable +import pendulum + from openai.types.responses import ResponseTextDeltaEvent from forecasting_tools.data_models.questions import ( @@ -26,7 +28,10 @@ ) from forecasting_tools import MetaculusQuestion, BinaryQuestion from forecasting_tools.agents_and_tools.auto_resolver.agentic.instructions import * -from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import BinaryResolutionResult +from forecasting_tools.agents_and_tools.auto_resolver.resolution_models import ( + BinaryResolutionResult, + DeadlineCheckResult, +) from forecasting_tools.agents_and_tools.auto_resolver import AutoResolver from forecasting_tools.agents_and_tools.minor_tools import ( create_date_filtered_asknews_tool, @@ -85,11 +90,145 @@ def __init__(self, self.model_for_researcher = model_for_researcher self.model_for_rephraser = model_for_rephraser self.timeout = timeout - - + + # ------------------------------------------------------------------ + # Deadline checking (two-tier) + # ------------------------------------------------------------------ + + _DEADLINE_CHECK_MODEL = "openrouter/openai/gpt-4.1-mini" + + def _is_before_scheduled_deadline(self, question: 
MetaculusQuestion) -> bool: + """Tier 1 (instant, free): check the metadata ``scheduled_resolution_time``. + + Returns ``True`` if the field is set and still in the future, meaning + the question's scheduled resolution has not yet arrived. Returns + ``False`` when the field is ``None`` or the date has already passed. + """ + if question.scheduled_resolution_time is None: + return False + now = pendulum.now("UTC") + return now < question.scheduled_resolution_time + + async def _check_implicit_deadline( + self, question: MetaculusQuestion + ) -> tuple[bool, str | None]: + """Tier 2 (cheap LLM call): analyse the question text for an implicit deadline. + + Returns: + A tuple of ``(should_skip, reason)``. ``should_skip`` is ``True`` + when the LLM found a deadline that has **not** yet passed. + ``reason`` is a human-readable explanation (or ``None``). + """ + prompt = deadline_check_instructions(question) + llm = GeneralLlm(model=self._DEADLINE_CHECK_MODEL, temperature=0.0) + + try: + raw_response = await llm.invoke(prompt) + result = await structure_output( + raw_response, + DeadlineCheckResult, + model=self._DEADLINE_CHECK_MODEL, + ) + except Exception as e: + logger.warning( + "Implicit deadline check failed — allowing resolution to " + "proceed. Error: %s", + e, + exc_info=True, + ) + return False, None + + if not result.has_deadline or result.deadline_date is None: + logger.info( + "No implicit deadline found for question %s: %s", + question.id_of_post, + result.reasoning, + ) + return False, None + + # Parse the deadline date and compare to now + try: + parsed = pendulum.parse(result.deadline_date, tz="UTC") + if not isinstance(parsed, pendulum.DateTime): + raise ValueError( + f"Expected a DateTime, got {type(parsed).__name__}" + ) + deadline = parsed + except Exception as e: + logger.warning( + "Could not parse deadline date '%s' from LLM response — " + "allowing resolution to proceed. 
Error: %s", + result.deadline_date, + e, + ) + return False, None + + now = pendulum.now("UTC") + if now < deadline: + reason = ( + f"Implicit deadline {result.deadline_date} has not yet passed " + f"(current date: {now.format('YYYY-MM-DD')}). " + f"{result.reasoning}" + ) + logger.info( + "Question %s: implicit deadline not reached — skipping resolution. %s", + question.id_of_post, + reason, + ) + return True, reason + + logger.info( + "Question %s: implicit deadline %s has passed. %s", + question.id_of_post, + result.deadline_date, + result.reasoning, + ) + return False, None + + async def _should_skip_resolution( + self, question: MetaculusQuestion + ) -> tuple[bool, str | None]: + """Decide whether to skip resolution entirely because the deadline has not passed. + + Tier 1 — free, instant check of ``scheduled_resolution_time``. + Tier 2 — cheap LLM analysis of the question text for an implicit deadline. + + Returns: + ``(should_skip, reason)`` where *reason* is a human-readable + explanation when *should_skip* is ``True``, or ``None`` otherwise. + """ + # Tier 1: metadata field + if self._is_before_scheduled_deadline(question): + reason = ( + f"Scheduled resolution time ({question.scheduled_resolution_time}) " + f"has not yet passed." 
+ ) + logger.info( + "Question %s: %s Skipping resolution.", + question.id_of_post, + reason, + ) + return True, reason + + # Tier 2: LLM-based implicit deadline analysis + return await self._check_implicit_deadline(question) + + # ------------------------------------------------------------------ + # Public resolution entry points + # ------------------------------------------------------------------ + async def resolve_question( self, question: MetaculusQuestion ) -> Optional[ResolutionType]: + should_skip, reason = await self._should_skip_resolution(question) + if should_skip: + logger.info( + "Question %s — skipping resolution: %s", + question.id_of_post, + reason, + ) + return None + if isinstance(question, BinaryQuestion): return await self._resolve_binary(question) else: @@ -268,6 +407,25 @@ async def resolve_question_streamed( yield ("error", f"Unsupported question type: {type(question).__name__}") return + yield ("status", "Checking whether the question deadline has passed...") + should_skip, reason = await self._should_skip_resolution(question) + if should_skip: + now = pendulum.now("UTC") + yield ( + "status", + f"Deadline has not yet passed. Skipping resolution. 
{reason}", + ) + yield ( + "result", + f"Resolution: NOT_YET_RESOLVABLE\n" + f"Reasoning: {reason}\n" + f"Key Evidence:\n" + f" - {reason}\n" + f" - Current time: {now}\n" + f" - Deadline has not yet passed", + ) + return + # Step 1: Rephrase if needed yield ("status", "Checking if question needs rephrasing...") question = await self._rephrase_question_if_needed(question) diff --git a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py index fe700384..f158108b 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/agentic/instructions.py @@ -4,7 +4,83 @@ import pendulum -from forecasting_tools import clean_indents, BinaryQuestion +from forecasting_tools import clean_indents, BinaryQuestion, MetaculusQuestion + + +def deadline_check_instructions(question: MetaculusQuestion) -> str: + """Build instructions for the LLM-based implicit deadline analysis. + + This prompt asks the LLM to examine the question text, resolution + criteria, fine print, and scheduled resolution date to identify the + effective deadline — the date by which the question's event must have + occurred for it to be resolvable. + + Args: + question: The question to analyze for an implicit deadline. + + Returns: + Formatted instruction string. + """ + today_string = pendulum.now(tz="UTC").strftime("%Y-%m-%d") + scheduled_resolution = ( + question.scheduled_resolution_time.strftime("%Y-%m-%d") + if question.scheduled_resolution_time + else "Not specified" + ) + + return clean_indents( + f""" + # Your Task + + You are a deadline analyst for a forecasting resolution system. Your job + is to determine the **effective deadline** of a forecasting question — + the date by which the described event must have occurred for the question + to be resolvable. 
+ + # The Question + + {question.question_text} + + # Additional Context + + Resolution criteria: {question.resolution_criteria} + + Fine print: {question.fine_print} + + Background information: {question.background_info} + + Scheduled resolution date (from platform metadata): {scheduled_resolution} + + Today's date (UTC): {today_string} + + # Instructions + + 1. Examine the question text, resolution criteria, fine print, and + background information to identify any explicit or implicit deadlines. + 2. The deadline is the date by which the event or condition described in + the question must have occurred. For example: + - "Will X happen by March 2026?" → deadline is 2026-03-31 + - "Will X happen before January 1, 2027?" → deadline is 2026-12-31 + - "Will X happen in 2025?" → deadline is 2025-12-31 + - "Will X happen by the end of Q2 2026?" → deadline is 2026-06-30 + 3. If the question text contains a specific deadline, use that — it takes + precedence over the platform's scheduled resolution date. + 4. If the question text does NOT contain a specific deadline but a + scheduled resolution date is provided, use the scheduled resolution + date as the effective deadline. + 5. If you cannot identify any deadline at all, set has_deadline to false. + + # Output Format + + Return a JSON object with these fields: + - "has_deadline": true/false — whether the question has a discernible deadline + - "deadline_date": "YYYY-MM-DD" — the effective deadline date, or null if none found + - "reasoning": "..." — 1-2 sentences explaining how you determined the deadline + + Return ONLY the JSON object. No additional text. + """ + ) + def researcher_instructions(question: BinaryQuestion) -> str: """Build detailed instructions for the researcher agent. 
@@ -15,6 +91,40 @@ def researcher_instructions(question: BinaryQuestion) -> str: Returns: Formatted instruction string """ + open_time_str = ( + question.open_time.strftime("%Y-%m-%d") + if question.open_time + else None + ) + close_time_str = ( + question.close_time.strftime("%Y-%m-%d") + if question.close_time + else None + ) + + if open_time_str: + end_date = close_time_str or "its close/resolution date" + temporal_section = clean_indents( + f""" + # Temporal Window — CRITICAL + + This question was opened on **{open_time_str}**. Only events that + occurred **on or after {open_time_str}** are relevant for resolution. + + If your research finds that the described event occurred *before* the + question was opened, that evidence must be clearly flagged as + **pre-question** and should NOT be treated as satisfying the resolution + criteria. The question is asking whether the event occurs during the + question's active period ({open_time_str} to {end_date}), not whether + it ever happened historically. + + When searching, focus on events between {open_time_str} and + {end_date}. Always note the exact date of any event you find and + whether it falls inside or outside this window. + """ + ) + else: + temporal_section = "" return clean_indents( f""" @@ -37,6 +147,8 @@ def researcher_instructions(question: BinaryQuestion) -> str: 5. **Edge Cases**: Look for any ambiguities, disputes, or complications 6. 
**Validity Check**: Investigate whether the question's subject is valid/possible (for potential annulment) + {temporal_section} + # Available Tools You have two search tools: @@ -212,6 +324,38 @@ def question_rephraser_instructions(question: BinaryQuestion) -> str: def binary_resolver_instructions(question: BinaryQuestion) -> str: + open_time_str = ( + question.open_time.strftime("%Y-%m-%d") + if question.open_time + else None + ) + + if open_time_str: + temporal_rule = clean_indents( + f""" + # Temporal Window Rule — CRITICAL + + This question was opened on **{open_time_str}**. Only events that + occurred **on or after {open_time_str}** count toward resolution. + + If the research shows the event happened *before* the question was + opened, that is **NOT** sufficient for a TRUE resolution. The question + is asking whether the event occurs during the question's active period, + not whether it ever happened historically. + + **Example**: If the question asks "Will event X happen before Jan 1, + 2025?" and the question was opened on April 1, 2024, only evidence of + event X occurring on or after April 1, 2024 is relevant. An occurrence + on March 3, 2024 (before the question opened) would NOT make this + resolve TRUE. + + When evaluating evidence, always check the date of each event against + the question's open date ({open_time_str}) and disregard any events + that predate it. + """ + ) + else: + temporal_rule = "" return clean_indents( f""" @@ -263,6 +407,8 @@ def binary_resolver_instructions(question: BinaryQuestion) -> str: 4. **Consider ambiguity**: Is the outcome clear or disputed? 5. 
**Be conservative**: If uncertain, return NOT_YET_RESOLVABLE + {temporal_rule} + # Critical Distinctions **AMBIGUOUS vs ANNULLED:** diff --git a/forecasting_tools/agents_and_tools/auto_resolver/assess.py b/forecasting_tools/agents_and_tools/auto_resolver/assess.py index 48f22bf8..ce67ca74 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/assess.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/assess.py @@ -48,9 +48,15 @@ class ResolutionAssessmentReport: cc: list[int] = field(default_factory=list) xp: list[int] = field(default_factory=list) - xn: list[int] = field(default_factory=list) + xn: list[int] = field(default_factory=list) xc: list[int] = field(default_factory=list) + nyr_p: list[int] = field(default_factory=list) + nyr_n: list[int] = field(default_factory=list) + nyr_c: list[int] = field(default_factory=list) + nyr_nyr: list[int] = field(default_factory=list) + nyr_x: list[int] = field(default_factory=list) + question_results: dict[int, QuestionAssessmentResult] = field(default_factory=dict) @property @@ -100,27 +106,48 @@ def n_xn(self) -> int: @property def n_xc(self) -> int: return len(self.xc) + + @property + def n_nyr_p(self) -> int: + return len(self.nyr_p) + + @property + def n_nyr_n(self) -> int: + return len(self.nyr_n) + + @property + def n_nyr_c(self) -> int: + return len(self.nyr_c) + + @property + def n_nyr_nyr(self) -> int: + return len(self.nyr_nyr) + + @property + def n_nyr_x(self) -> int: + return len(self.nyr_x) def binary_results_table(self) -> str: """ Returns a markdown table representation of the binary assessment report. Columns represent predicted resolutions, rows represent actual resolutions. - The "Not Answered" column contains cases where the resolver returned None + Then "Not Answered" column contains cases where the resolver returned None or an unexpected value (logged as warnings for debugging). 
Returns: str: A markdown formatted confusion matrix table """ corner_label = "Actual \\ Predicted" - col_headers = ["Positive", "Negative", "Cancelled", "Not Answered"] - row_labels = ["Positive", "Negative", "Cancelled"] - # Rows ordered: actual Positive, actual Negative, actual Cancelled - # Columns ordered: predicted Positive, predicted Negative, predicted Cancelled, predicted Not Answered + col_headers = ["Positive", "Negative", "Cancelled", "Not Yet Resolvable", "Not Answered"] + row_labels = ["Positive", "Negative", "Cancelled", "Not Yet Resolvable"] + # Rows ordered: actual Positive, actual Negative, actual Cancelled, actual Not Yet Resolvable + # Columns ordered: predicted Positive, predicted Negative, predicted Cancelled, predicted Not Yet Resolvable, predicted Not Answered data = [ - [str(self.n_pp), str(self.n_np), str(self.n_cp), str(self.n_xp)], - [str(self.n_pn), str(self.n_nn), str(self.n_cn), str(self.n_xn)], - [str(self.n_pc), str(self.n_nc), str(self.n_cc), str(self.n_xc)], + [str(self.n_pp), str(self.n_np), str(self.n_cp), str(self.n_nyr_p), str(self.n_xp)], + [str(self.n_pn), str(self.n_nn), str(self.n_cn), str(self.n_nyr_n), str(self.n_xn)], + [str(self.n_pc), str(self.n_nc), str(self.n_cc), str(self.n_nyr_c), str(self.n_xc)], + [str(self.n_nyr_p), str(self.n_nyr_n), str(self.n_nyr_c), str(self.n_nyr_nyr), str(self.n_nyr_x)], ] # Compute column widths dynamically @@ -167,11 +194,15 @@ def detailed_report(self) -> str: lines.append("") # Results calculation - total = self.n_pp + self.n_pn + self.n_pc + \ - self.n_np + self.n_nn + self.n_nc + \ - self.n_cp + self.n_cn + self.n_cc + \ - self.n_xp + self.n_xn + self.n_xc - correct = self.n_pp + self.n_nn + self.n_cc + total = ( + self.n_pp + self.n_pn + self.n_pc + + self.n_np + self.n_nn + self.n_nc + + self.n_cp + self.n_cn + self.n_cc + + self.n_xp + self.n_xn + self.n_xc + + self.n_nyr_p + self.n_nyr_n + self.n_nyr_c + + self.n_nyr_nyr + ) + correct = self.n_pp + self.n_nn + self.n_cc + 
self.n_nyr_nyr accuracy = (correct / total * 100) if total > 0 else 0 lines.append(f"**Total Questions:** {total}") @@ -250,7 +281,7 @@ def write_to_file(self, directory: str = "reports") -> Path: def _resolution_to_str(resolution: Optional[ResolutionType]) -> str: """Convert a resolution to a human-readable string.""" if resolution is None: - return "None (Unresolvable)" + return "NOT_YET_RESOLVABLE" elif isinstance(resolution, bool): return "TRUE" if resolution else "FALSE" elif isinstance(resolution, CanceledResolution): diff --git a/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py index 2a00f70d..8bd9c4c4 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/resolution_models.py @@ -9,6 +9,31 @@ from typing import Literal, Optional +class DeadlineCheckResult(BaseModel): + """Structured output from the LLM-based implicit deadline analysis. + + Used to determine whether a question's effective deadline (as described + in its text, resolution criteria, or fine print) has already passed. + + Attributes: + has_deadline: Whether the question contains a discernible deadline. + deadline_date: The effective deadline as an ISO-8601 date string + (YYYY-MM-DD), or None if no deadline was found. + reasoning: Brief explanation of how the deadline was determined. + """ + + has_deadline: bool = Field( + description="Whether the question contains a discernible deadline or time-bound condition" + ) + deadline_date: Optional[str] = Field( + default=None, + description="The effective deadline in YYYY-MM-DD format, or null if no deadline found", + ) + reasoning: str = Field( + description="1-2 sentence explanation of how the deadline was identified" + ) + + class BinaryResolutionResult(BaseModel): """Structured output for binary question resolution. 
diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py index 90ab7241..2a25c5bc 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/app.py @@ -37,9 +37,6 @@ from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.home_panel import ( HomePanel, ) -from forecasting_tools.agents_and_tools.auto_resolver.tui.widgets.log_panel import ( - LogPanel, -) from forecasting_tools.agents_and_tools.auto_resolver.tui.report import ( generate_markdown_report, ) @@ -58,7 +55,6 @@ class AutoResolverApp(App): t -- Add questions from a tournament r -- Re-run resolution on the selected question e -- Export report to markdown - l -- Toggle log panel (captured logging + stderr) q -- Quit """ @@ -78,7 +74,6 @@ class AutoResolverApp(App): Binding("t", "add_tournament", "Add Tournament", show=True), Binding("r", "rerun", "Re-run Selected", show=True), Binding("e", "export_report", "Export Report", show=True), - Binding("l", "toggle_logs", "Logs", show=True), Binding("q", "quit", "Quit", show=True), ] @@ -97,7 +92,6 @@ def __init__( self._initial_tournaments = initial_tournaments or [] self._initial_questions = initial_questions or [] self._original_stderr = sys.stderr - self._view_before_logs: Literal["home", "feed"] = "home" # ------------------------------------------------------------------ # Layout @@ -109,7 +103,6 @@ def compose(self) -> ComposeResult: yield Sidebar(id="sidebar") yield HomePanel(id="home-panel") yield FeedPanel(id="feed-panel") - yield LogPanel(id="log-panel") yield Footer() @property @@ -124,9 +117,6 @@ def feed_panel(self) -> FeedPanel: def home_panel(self) -> HomePanel: return self.query_one("#home-panel", HomePanel) - @property - def log_panel(self) -> LogPanel: - return self.query_one("#log-panel", LogPanel) # ------------------------------------------------------------------ # Lifecycle @@ -136,7 
+126,6 @@ async def on_mount(self) -> None: """Load any questions / tournaments passed via CLI.""" # Start with home panel visible, feed/log panels hidden self.feed_panel.display = False - self.log_panel.display = False self.home_panel.display = True # Redirect sys.stderr so that non-logging stderr writes (e.g. @@ -157,14 +146,12 @@ def _show_home(self) -> None: """Switch to the home overview panel.""" self.home_panel.display = True self.feed_panel.display = False - self.log_panel.display = False self.home_panel.refresh_table(self._items) def _show_feed(self, item: QuestionItem | None) -> None: """Switch to the feed panel for a specific question.""" self.home_panel.display = False self.feed_panel.display = True - self.log_panel.display = False self.feed_panel.show_question(item) # ------------------------------------------------------------------ @@ -200,24 +187,6 @@ def action_export_report(self) -> None: filepath.write_text(report, encoding="utf-8") self.notify(f"Report exported to {filepath}") - def action_toggle_logs(self) -> None: - """Toggle between the log panel and the previous view.""" - if self.log_panel.display: - # Return to whichever view was active before - if self._view_before_logs == "feed": - item = self._items.get(self._selected_post_id) # type: ignore[arg-type] - self._show_feed(item) - else: - self._show_home() - else: - # Remember which view is currently active, then show logs - if self.feed_panel.display: - self._view_before_logs = "feed" - else: - self._view_before_logs = "home" - self.home_panel.display = False - self.feed_panel.display = False - self.log_panel.display = True # ------------------------------------------------------------------ # Message handlers diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py index cdfc3f93..2a864876 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py +++ 
b/forecasting_tools/agents_and_tools/auto_resolver/tui/report.py @@ -29,7 +29,7 @@ def _status_str_to_resolution(status: str | None) -> Optional[ResolutionType]: """Convert a resolution_status_str (e.g. 'TRUE') to a typed resolution.""" if status is None: - return None + return s = status.strip().upper() if s == "TRUE": return True @@ -41,8 +41,6 @@ def _status_str_to_resolution(status: str | None) -> Optional[ResolutionType]: return CanceledResolution.ANNULLED elif s == "NOT_YET_RESOLVABLE": return None - else: - return None # ------------------------------------------------------------------ @@ -127,8 +125,26 @@ def build_report_from_items( case _: report.xc.append(post_id) outcome_category = "Unmatched - Cancelled" + + elif true_resolution is None: + match predicted_resolution: + case True: + report.nyr_p.append(post_id) + outcome_category = "Not Yet Resolvable Predicted as Positive" + case False: + report.nyr_n.append(post_id) + outcome_category = "Not Yet Resolvable Predicted as Negative" + case CanceledResolution(): + report.nyr_c.append(post_id) + outcome_category = "Not Yet Resolvable Predicted as Cancelled" + case None: + report.nyr_nyr.append(post_id) + outcome_category = "Correct Not Yet Resolvable" + case _: + report.nyr_x.append(post_id) + outcome_category = "Unmatched - Not Yet Resolvable" else: - # No ground truth (unresolved question) — skip from matrix + # Unknown resolution type — skip from matrix continue question_result = QuestionAssessmentResult( @@ -171,8 +187,10 @@ def generate_markdown_report(items: dict[int, QuestionItem]) -> str: + report.n_np + report.n_nn + report.n_nc + report.n_cp + report.n_cn + report.n_cc + report.n_xp + report.n_xn + report.n_xc + + report.n_nyr_p + report.n_nyr_n + report.n_nyr_c + + report.n_nyr_nyr ) - correct = report.n_pp + report.n_nn + report.n_cc + correct = report.n_pp + report.n_nn + report.n_cc + report.n_nyr_nyr accuracy = (correct / total * 100) if total > 0 else 0 lines.append(f"**Total Questions:** 
{total}") diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py index 3a0656cf..35e7535a 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/feed_panel.py @@ -22,14 +22,16 @@ def _resolution_color(value: str) -> str: return "green" elif v == "FALSE": return "red" - elif v in ("NOT_YET_RESOLVABLE", "RESOLVING...", "PENDING"): + elif v in ("NOT_YET_RESOLVABLE", "NOT YET RESOLVABLE", "RESOLVING...", "PENDING"): return "yellow" else: return "cyan" # AMBIGUOUS, ANNULLED, errors, etc. -def _normalize_ground_truth(raw: str) -> str: +def _normalize_ground_truth(raw: str | None) -> str: """Convert Metaculus ground-truth strings to True/False style.""" + if raw is None: + return "Not Yet Resolvable" lowered = raw.strip().lower() if lowered == "yes": return "True" @@ -78,12 +80,11 @@ def render_item(self, item: Optional[QuestionItem]) -> None: ) # Ground truth (normalised and colour-coded) - if item.question.resolution_string: - gt_normalized = _normalize_ground_truth(item.question.resolution_string) - gt_color = _resolution_color(gt_normalized) - parts.append( - f"Ground Truth: [{gt_color}]{gt_normalized}[/{gt_color}]" - ) + gt_normalized = _normalize_ground_truth(item.question.resolution_string) + gt_color = _resolution_color(gt_normalized) + parts.append( + f"Ground Truth: [{gt_color}]{gt_normalized}[/{gt_color}]" + ) # Cost if item.cost > 0: diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py index a9d54f59..03ccad65 100644 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py +++ b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/home_panel.py @@ -19,13 +19,18 @@ ) +def _normalize_for_compare(s: str) -> str: + 
"""Normalize a resolution string for comparison (uppercase, underscores for spaces).""" + return s.strip().upper().replace(" ", "_") + + def _match_str(predicted: str | None, ground_truth: str | None) -> str: """Return a match indicator comparing predicted and ground truth.""" if predicted is None or ground_truth is None: return "-" - p = predicted.strip().upper() - g = ground_truth.strip().upper() - if p in ("PENDING", "RESOLVING...", "NOT_YET_RESOLVABLE"): + p = _normalize_for_compare(predicted) + g = _normalize_for_compare(ground_truth) + if p in ("PENDING", "RESOLVING..."): return "-" if g in ("", "NONE", "-"): return "-" @@ -106,7 +111,7 @@ def refresh_table(self, items: dict[int, QuestionItem]) -> None: predicted = item.resolution_display gt_raw = item.question.resolution_string - gt = _normalize_ground_truth(gt_raw) if gt_raw else "-" + gt = _normalize_ground_truth(gt_raw) if gt_raw else "Not Yet Resolvable" match = _match_str(predicted, gt) if item.status == "completed": @@ -136,8 +141,10 @@ def refresh_table(self, items: dict[int, QuestionItem]) -> None: + report.n_np + report.n_nn + report.n_nc + report.n_cp + report.n_cn + report.n_cc + report.n_xp + report.n_xn + report.n_xc + + report.n_nyr_p + report.n_nyr_n + report.n_nyr_c + + report.n_nyr_nyr ) - matrix_correct = report.n_pp + report.n_nn + report.n_cc + matrix_correct = report.n_pp + report.n_nn + report.n_cc + report.n_nyr_nyr matrix_accuracy = (matrix_correct / matrix_total * 100) if matrix_total > 0 else 0 matrix_display = ( diff --git a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py b/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py deleted file mode 100644 index 1bff8330..00000000 --- a/forecasting_tools/agents_and_tools/auto_resolver/tui/widgets/log_panel.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Global log panel for displaying captured logging and stderr output.""" - -from __future__ import annotations - -from textual.app import ComposeResult 
-from textual.containers import Vertical -from textual.widgets import RichLog, Static - - -class LogPanel(Vertical): - """Full-screen panel showing captured Python logging and stderr output. - - Toggled via the ``l`` keybinding. All log records from every logger - in the process (including third-party libraries like litellm, httpx, - and the OpenAI SDK) are routed here instead of to stderr so they do - not corrupt the TUI display. - """ - - DEFAULT_CSS = """ - LogPanel { - width: 1fr; - height: 1fr; - } - LogPanel > .log-panel-header { - dock: top; - height: auto; - padding: 0 2; - background: $surface; - border-bottom: solid $primary; - text-align: center; - text-style: bold; - } - LogPanel > #log-output { - height: 1fr; - padding: 0 1; - } - """ - - def compose(self) -> ComposeResult: - yield Static( - "Logs [dim](press [bold]l[/bold] to return)[/dim]", - classes="log-panel-header", - ) - yield RichLog( - id="log-output", - highlight=True, - markup=True, - wrap=True, - ) - - @property - def log_output(self) -> RichLog: - return self.query_one("#log-output", RichLog) - - def append_log(self, text: str) -> None: - """Append a pre-formatted log line to the panel.""" - self.log_output.write(text) - - def clear_logs(self) -> None: - """Clear all log output.""" - self.log_output.clear() From 7d426c4a1402a75a9e6a832897a8600ce3f0e59a Mon Sep 17 00:00:00 2001 From: Luke Beukelman Date: Mon, 9 Mar 2026 21:53:07 -0600 Subject: [PATCH 14/14] Removing old reports --- .../assessment_report_20260227_155703.md | 444 ------ .../reports/tui_report_20260301_202819.md | 1192 ----------------- 2 files changed, 1636 deletions(-) delete mode 100644 forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md delete mode 100644 forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md diff --git a/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md 
b/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md deleted file mode 100644 index 8ce49ac8..00000000 --- a/forecasting_tools/agents_and_tools/auto_resolver/reports/assessment_report_20260227_155703.md +++ /dev/null @@ -1,444 +0,0 @@ -# Auto Resolver Assessment Report - -**Generated:** 2026-02-27 15:57:03 - -## Summary - -| Actual \ Predicted | True | False | Unresolvable | Cancelled | Unmatched | -|--------------------|------|-------|--------------|-----------|-----------| -| True | 8 | 1 | 0 | 0 | 0 | -| False | 1 | 10 | 0 | 0 | 0 | -| Cancelled | 0 | 0 | 0 | 0 | 0 | - -**Total Questions:** 20 -**Correct Predictions:** 18 (90.0%) - -## Detailed Results - - -### Question 39996 - -**Title:** Will any of these listed US federal government officials be out of their position before October 1, -**URL:** https://www.metaculus.com/questions/39996 - -**Question Contents:** - -> Will any of these listed US federal government officials be out of their position before October 1, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | TRUE | - -**Key Evidence:** -- “My searches found no evidence of any cabinet secretaries or cabinet-level officials from the specified list departing during September 14 - October 1, 2025.” -- “While there were some departures at federal agencies during this period, they were not from the positions specified in the question:” -- “One source specifically noted that ‘Trump ends 2025 with no cabinet turnover,’ indicating the core cabinet remained stable through the end of 2025.” -- “Despite extensive searches using various terms and date ranges, I found no evidence of any of the 100 specifically listed officials leaving their positions during the September 14 - October 1, 2025 timeframe.” - ---- - -### Question 40939 - -**Title:** Will Iran announce a new capital location before January 3, 2026? 
-**URL:** https://www.metaculus.com/questions/40939 - -**Question Contents:** - -> Will Iran announce a new capital location before January 3, 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | FALSE | - -**Key Evidence:** -- “Iran officially announced a new capital location on January 8, 2025 - well before the January 3, 2026 deadline.” -- “Official Announcement Date: January 8, 2025” -- “Government spokeswoman Fatemeh Mohajerani made the official announcement” -- “Specific Location Announced: Makran Region” -- “This meets the resolution criteria requiring ‘at least at the level of a specific county’” - ---- - -### Question 39559 - -**Title:** Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? -**URL:** https://www.metaculus.com/questions/39559 - -**Question Contents:** - -> Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “Alexandre de Moraes remained a minister of Brazil's Supreme Federal Court through January 1, 2026” -- “No evidence found of any departure, resignation, retirement, or impeachment before the January 1, 2026 deadline” -- “He continued serving actively as a justice well into 2026” -- “He was issuing judicial orders as late as January 30, 2026” -- “No resignation occurred” - ---- - -### Question 39564 - -**Title:** Will the United Nations have more than 193 member states before January 1, 2026? -**URL:** https://www.metaculus.com/questions/39564 - -**Question Contents:** - -> Will the United Nations have more than 193 member states before January 1, 2026? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- The United Nations has 193 member states as of February 2026 -- This number has remained unchanged since South Sudan joined as the 193rd member on July 14, 2011 -- No new member states were admitted in 2024-2025 -- The US vetoed the Security Council recommendation on April 18, 2024 -- Palestine remains a non-member observer state - ---- - -### Question 39572 - -**Title:** Will the US pass a federal bill on AI regulation before January 1, 2026? -**URL:** https://www.metaculus.com/questions/39572 - -**Question Contents:** - -> Will the US pass a federal bill on AI regulation before January 1, 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “No comprehensive federal AI legislation was passed: Despite multiple legislative efforts in 2024 and 2025, no federal bill primarily focused on AI regulation was successfully passed by both the House and Senate and signed into law before January 1, 2026.” -- “Executive Orders, not legislation: The main federal AI actions were Executive Orders: - Biden's Executive Order 14141 (January 14, 2025) on AI infrastructure - Trump's Executive Order 14365 (December 11, 2025) on national AI policy framework” -- “Limited enacted legislation: The only AI-related federal statute was the TAKE IT DOWN Act (signed May 2025), which specifically addressed nonconsensual distribution of intimate images - this does not meet the criteria as it's not primarily focused on general AI regulation.” -- “None of these criteria were met.” - ---- - -### Question 39584 - -**Title:** Will a new war or a substantial escalation to a previous war kill at least 5,000 people in 2025? 
-**URL:** https://www.metaculus.com/questions/39584 - -**Question Contents:** - -> Will a new war or a substantial escalation to a previous war kill at least 5,000 people in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- "The India-Pakistan conflict was genuinely new but appears to have been brief with limited casualties" -- "However, the search results do NOT indicate casualties reached 5,000" -- "Sudan and Gaza had massive death tolls but were continuations of conflicts that began in 2023" -- "Syrian Network for Human Rights documented 3,338 deaths in 2025" - ---- - -### Question 39588 - -**Title:** Will Boris Johnson return as the leader of the UK Conservative Party before 2026? -**URL:** https://www.metaculus.com/questions/39588 - -**Question Contents:** - -> Will Boris Johnson return as the leader of the UK Conservative Party before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “The resolution criteria have NOT been met.” -- “Boris Johnson did not return as leader of the UK Conservative Party at any time between October 24, 2022 and January 1, 2026.” -- “October 25, 2022: Rishi Sunak became party leader after Truss's resignation (Johnson briefly considered a comeback but withdrew)” -- “Lost MP Status: Johnson resigned as MP for Uxbridge and South Ruislip on June 12, 2023, following the Privileges Committee investigation into ‘Partygate’” -- “The question should resolve as No.” - ---- - -### Question 39591 - -**Title:** Will an application to ban AfD be filed at the Federal Constitutional Court before 2026? -**URL:** https://www.metaculus.com/questions/39591 - -**Question Contents:** - -> Will an application to ban AfD be filed at the Federal Constitutional Court before 2026? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “No Federal Constitutional Court Application Filed: Despite the parliamentary initiative, no formal application was actually filed at the Federal Constitutional Court by any of the authorized entities (Federal Government, Bundestag, or Bundesrat) before January 1, 2026.” -- “Bundestag Debate (January 30, 2025): The Bundestag held its first-ever plenary debate on banning the AfD. However, the motion was NOT voted upon and was instead referred back to committee to avoid political risks and potential benefit to the AfD.” -- “Metaculus Confirmation: A Metaculus forecasting question asking the identical question resolved with ‘No,’ confirming that no such application was submitted before 2026.” -- “Conclusion: The resolution criteria have NOT been met. No application to ban AfD was filed at the Federal Constitutional Court by the Federal Government, Bundestag, or Bundesrat before January 1, 2026.” - ---- - -### Question 39608 - -**Title:** Will bitcoin reach $200k in 2025? -**URL:** https://www.metaculus.com/questions/39608 - -**Question Contents:** - -> Will bitcoin reach $200k in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- The question asks whether Bitcoin reached $200,000 USD as a "High" price on any single day in 2025, according to Yahoo Finance historical data. -- Bitcoin's **maximum price in 2025 was approximately $126,296**, reached on **October 6, 2025** -- Bitcoin never reached $200,000 at any point during 2025 -- **Resolution Status**: The criteria for a "Yes" resolution have NOT been met. - ---- - -### Question 39613 - -**Title:** Will the Nonlinear Fund have more than eight full-time equivalent employees on January 1, 2026? 
-**URL:** https://www.metaculus.com/questions/39613 - -**Question Contents:** - -> Will the Nonlinear Fund have more than eight full-time equivalent employees on January 1, 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “Key Observation: Throughout all my searches, I found no evidence that Nonlinear Fund ever had or was approaching 9+ full-time equivalent employees.” -- “The organization appears to have always operated as a small team of 4-5 core members plus advisors.” -- “Evidence suggests NO: - The organization's core team consists of approximately 4-5 people - No evidence of expansion to 9+ employees was found” -- “Based on the available evidence, the Nonlinear Fund appears to have remained a small organization (4-5 core team members) and shows no indication of having expanded to 9+ full-time equivalent employees by January 1, 2026.” - ---- - -### Question 39615 - -**Title:** Will there be major civil unrest or martial law in the Philippines in 2025? -**URL:** https://www.metaculus.com/questions/39615 - -**Question Contents:** - -> Will there be major civil unrest or martial law in the Philippines in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- Result: No martial law declared - Despite significant civil unrest throughout 2025, there is no evidence that President Ferdinand Marcos Jr. declared martial law anywhere in the Philippines during 2025 -- Total deaths from civil unrest: 2 fatalities -- Largest arrest event: September 21, 2025 protests with 224 arrests (including 95 minors) -- Based on this comprehensive research, none of the three resolution criteria were met - ---- - -### Question 40284 - -**Title:** Will there be a political protest with at least 600,000 participants in France before 2026? 
-**URL:** https://www.metaculus.com/questions/40284 - -**Question Contents:** - -> Will there be a political protest with at least 600,000 participants in France before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Key Evidence:** -- “Official Interior Ministry count: 506,000 participants nationwide (with 55,000 in Paris)” -- “This was the largest single protest during the relevant timeframe” -- “The count fell short of the required 600,000 threshold by approximately 94,000 participants” -- “Answer: The question should resolve as NO - there was no political protest with at least 600,000 participants according to French Ministry of Interior official reports before January 1, 2026.” - ---- - -### Question 39560 - -**Title:** Will the highest-grossing film of 2025 be part of a franchise? -**URL:** https://www.metaculus.com/questions/39560 - -**Question Contents:** - -> Will the highest-grossing film of 2025 be part of a franchise? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- The year 2025 has concluded, and final box office data is available. -- Highest-Grossing Film of 2025: Ne Zha 2 with approximately $2.2-2.3 billion worldwide gross (sources show slight variations but all confirm it as #1) -- It's a sequel to the original "Ne Zha" (2019) -- Final worldwide box office data is available from reliable sources including Box Office Mojo - ---- - -### Question 40020 - -**Title:** Will the US government be shut down before October 2, 2025? -**URL:** https://www.metaculus.com/questions/40020 - -**Question Contents:** - -> Will the US government be shut down before October 2, 2025? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- Resolution Status: YES - The criteria have been met -- Shutdown Occurred: The US federal government entered shutdown furlough status on **October 1, 2025 at 12:01 AM ET** -- "as announced by the US Office of Personnel Management" - Confirmed: OPM issued special instructions for agencies affected by the lapse in appropriations starting October 1, 2025 -- "before October 2, 2025" - Confirmed: The shutdown began October 1, 2025 at 12:01 AM ET, which is before the October 2, 2025 deadline - ---- - -### Question 40210 - -**Title:** Will the United States bailout Argentina before 2026? -**URL:** https://www.metaculus.com/questions/40210 - -**Question Contents:** - -> Will the United States bailout Argentina before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “On October 9-20, 2025, the US Treasury Department established a $20 billion currency swap line with Argentina's central bank (BCRA)” -- “The BCRA drew $2.5 billion from this swap line in October 2025 - this clearly meets the resolution criteria” -- “All relevant actions occurred in 2025, well before the 2026 deadline” -- “Recommendation: RESOLVE AS YES” - ---- - -### Question 40246 - -**Title:** Will the following companies be added to the S&P 500 before 2026? (ARES) -**URL:** https://www.metaculus.com/questions/40246 - -**Question Contents:** - -> Will the following companies be added to the S&P 500 before 2026? 
(ARES) - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “S&P Dow Jones Indices officially announced on December 8, 2025 that Ares Management Corporation (NYSE: ARES) would be added to the S&P 500 Index.” -- “The inclusion became effective prior to the open of trading on December 11, 2025, replacing Kellanov (NYSE: K) which was being acquired by Mars Inc.” -- “The announcement date of December 8, 2025 was before January 1, 2026, which meets the resolution criteria for this question.” -- “Multiple sources confirm this was an official announcement from S&P Dow Jones Indices, the authoritative body that maintains the S&P 500 Index.” -- “Recommendation: This question should resolve YES.” - ---- - -### Question 40252 - -**Title:** Will the following companies be added to the S&P 500 before 2026? (CVNA) -**URL:** https://www.metaculus.com/questions/40252 - -**Question Contents:** - -> Will the following companies be added to the S&P 500 before 2026? (CVNA) - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “S&P Dow Jones Indices officially announced on December 5, 2025 that Carvana (CVNA) would be added to the S&P 500 Index, effective December 22, 2025.” -- “The announcement was made on December 5, 2025, which is before the January 1, 2026 deadline specified in the resolution criteria.” -- “The announcement came directly from S&P Dow Jones Indices, which is the authoritative source mentioned in the resolution criteria.” -- “The criteria only require the announcement, not the actual inclusion (which also occurred on December 22, 2025)” - ---- - -### Question 40253 - -**Title:** Will a player outside of the top 10 seeds compete in the 2025 FIDE World Cup final? 
-**URL:** https://www.metaculus.com/questions/40253 - -**Question Contents:** - -> Will a player outside of the top 10 seeds compete in the 2025 FIDE World Cup final? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “Javokhir Sindarov (Uzbekistan) defeated Wei Yi (China) 2.5-1.5 in the final” -- “Wei Yi was the 7th seed (2754 rating) - placing him WITHIN the top 10 seeds” -- “Javokhir Sindarov was the 16th seed - placing him OUTSIDE the top 10 seeds” -- “Resolution: YES - A player outside the top 10 seeds (Javokhir Sindarov, 16th seed) did compete in the 2025 FIDE World Cup final and actually won the tournament.” - ---- - -### Question 40285 - -**Title:** Will Ukraine extend martial law beyond November 5, 2025? -**URL:** https://www.metaculus.com/questions/40285 - -**Question Contents:** - -> Will Ukraine extend martial law beyond November 5, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “Ukraine DID extend martial law beyond November 5, 2025 - this is definitively confirmed by multiple official sources.” -- “First extension: From November 5, 2025 to February 3, 2026 (approved by parliament in October 2025, signed by President Zelensky on October 30-31, 2025)” -- “Second extension: From February 3, 2026 to May 4, 2026 (approved by parliament in January 2026 based on Zelensky's decree from January 12, 2026)” -- “Current status as of February 27, 2026: - Martial law remains in effect until May 4, 2026” -- “Official confirmations: - UN depositary notifications confirm the extensions - Multiple parliamentary votes with 330+ MPs supporting each extension - Presidential decrees and parliamentary approvals are well-documented” - ---- - -### Question 40947 - -**Title:** Will Germany enact the Aktivrente before 2026? 
-**URL:** https://www.metaculus.com/questions/40947 - -**Question Contents:** - -> Will Germany enact the Aktivrente before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Key Evidence:** -- “The German Aktivrente was officially enacted in 2025 before the January 3, 2026 deadline” -- “December 5, 2025: Bundestag (Parliament) passage - the critical enactment date” -- “The Bundestag passed the bill on December 5, 2025, which constitutes official enactment” -- “This question will not wait for the law to enter into legal force, if the bill is officially enacted this question will immediately resolve as Yes” -- “RESOLUTION: YES - Germany enacted the Aktivrente before 2026.” - ---- diff --git a/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md b/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md deleted file mode 100644 index 91b8f615..00000000 --- a/forecasting_tools/agents_and_tools/auto_resolver/reports/tui_report_20260301_202819.md +++ /dev/null @@ -1,1192 +0,0 @@ -# Auto Resolver Assessment Report - -**Generated:** 2026-03-01 20:28:19 - -## Summary - -| Actual \ Predicted | Positive | Negative | Cancelled | Not Answered | -|--------------------|----------|----------|-----------|--------------| -| Positive | 13 | 0 | 1 | 0 | -| Negative | 0 | 29 | 0 | 0 | -| Cancelled | 3 | 2 | 1 | 0 | - -**Total Questions:** 49 -**Correct Predictions:** 43 (87.8%) -**Total Cost:** $7.9477 - -## Detailed Results - - -### Question 39056 - -**Title:** [PRACTICE] Will Shigeru Ishiba cease to be Prime Minister of Japan before September 2025? -**URL:** https://www.metaculus.com/questions/39056 - -**Question Contents:** - -> [PRACTICE] Will Shigeru Ishiba cease to be Prime Minister of Japan before September 2025? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | CanceledResolution.ANNULLED | - -**Cost:** $0.1234 - -**Key Evidence:** -- Ishiba announced his resignation on September 7, 2025, which is after the resolution cutoff of September 1, 2025. -- During July and August 2025, Ishiba actively denied resignation reports, calling them 'completely unfounded,' and stated he would focus on tariff negotiations with the U.S. -- The LDP performed poorly in the House of Councillors election on July 20, 2025, generating calls for resignation, but Ishiba resisted throughout July and August 2025. -- After completing U.S.-Japan trade deal negotiations in early September 2025, internal LDP pressure intensified, leading to his resignation announcement on September 7, 2025. -- Sanae Takaichi succeeded Ishiba and took office on approximately October 21, 2025, confirming Ishiba remained PM through at least September 7, 2025. - ---- - -### Question 39621 - -**Title:** Will Sam Altman Use Microsoft Teams before 2026? -**URL:** https://www.metaculus.com/questions/39621 - -**Question Contents:** - -> Will Sam Altman Use Microsoft Teams before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | CanceledResolution.ANNULLED | - -**Cost:** $0.1739 - -**Key Evidence:** -- No news articles, official statements, or credible reports document Altman using Microsoft Teams across multiple searches. -- Sam Altman was fired from OpenAI on November 17, 2023, via a Google Meet call — not Microsoft Teams — which generated social media jokes about Teams being snubbed. -- Altman was hired by Microsoft on November 20, 2023, but returned to OpenAI as CEO just days later (November 21, 2023), before ever meaningfully starting his Microsoft role. 
-- Searches for 'Sam Altman' + 'Microsoft Teams' return only social media jokes and irony commentary about OpenAI using Google Meet instead, with zero credible reports of him actually using Teams. -- The original question appears to be asking a humorous/ironic question prompted by the Google Meet firing, and no resolution evidence (YES) has been documented. - ---- - -### Question 39387 - -**Title:** Will diplomatic engagement between the U.S. and China on nuclear risk reduction increase in 2025? -**URL:** https://www.metaculus.com/questions/39387 - -**Question Contents:** - -> Will diplomatic engagement between the U.S. and China on nuclear risk reduction increase in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | CanceledResolution.ANNULLED | - -**Cost:** $0.1398 - -**Key Evidence:** -- A Track II dialogue was held in Shanghai, July 7–11, 2025, organized by Harvard Kennedy School's Project on Managing the Atom, the American Academy of Arts and Sciences, and China's Shanghai Academy of Social Sciences, explicitly addressing nuclear risk reduction. -- The American Academy of Arts and Sciences officially acknowledged the engagement, publishing a winter 2026 bulletin article titled 'Opportunities and Challenges: U.S.-China Nuclear Arms Control and Risk Reduction.' -- The July 2025 Shanghai dialogue explicitly addressed nuclear risk reduction, strategic stability, nuclear forces and policies, no-first-use commitments, nonproliferation, and risk-reduction mechanisms. -- The Trump administration pursued a multilateral approach to nuclear arms control throughout 2025, engaging both Russia and China, with Trump discussing nuclear weapons reduction talks with China. -- China did not rule out bilateral nuclear risk reduction talks with the U.S., and a Trump-Xi meeting occurred in November 2025 in South Korea, which included broader diplomatic engagement. 
- ---- - -### Question 39460 - -**Title:** In 2025, will IRA's Advanced Manufacturing Production Tax Credits (45Χ) start requiring the use of d -**URL:** https://www.metaculus.com/questions/39460 - -**Question Contents:** - -> In 2025, will IRA's Advanced Manufacturing Production Tax Credits (45Χ) start requiring the use of domestic materials or subcomponents? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | CanceledResolution.ANNULLED | - -**Cost:** $0.1806 - -**Key Evidence:** -- The One Big Beautiful Bill Act (H.R. 1, Public Law 119-21) was signed into law on July 4, 2025 — before January 1, 2026 -- The OBBBA introduced restrictions prohibiting 'material assistance from any prohibited foreign entity' for eligible components, effective for tax years beginning after July 4, 2025 -- The OBBBA established a Material Assistance Cost Ratio (MACR) calculation framework that restricts foreign-sourced materials -- For tax years beginning after December 31, 2026, at least 65% of the direct material cost of an integrated component must come from primary components manufactured in the U.S. -- The resolution criteria fine print states 'Any requirement for domestic materials or subcomponents will resolve this question as Yes' - ---- - -### Question 39600 - -**Title:** Will Maersk resume shipping in the Red Sea in 2025? -**URL:** https://www.metaculus.com/questions/39600 - -**Question Contents:** - -> Will Maersk resume shipping in the Red Sea in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | CanceledResolution.AMBIGUOUS | - -**Cost:** $0.0966 - -**Key Evidence:** -- On December 18–19, 2025, Maersk's vessel Maersk Sebarok successfully completed a transit through the Bab el-Mandeb Strait and the Red Sea, marking the company's first container ship passage through the Red Sea since early 2024. 
-- Maersk confirmed the transit on December 22, 2025, via an official announcement on maersk.com, calling it 'a significant step forward.' -- This was part of Maersk's MECL service (Middle East Container Line) and represented a formal (if cautious) change to its routing policy. -- In early 2025, Maersk and Hapag-Lloyd through their Gemini Cooperation planned to route one shared IMX service through the Red Sea and Suez Canal, though by late February 2025 some sailings had to be rerouted back around the Cape of Good Hope. -- After the successful December transit, a subsequent transit by the U.S.-flagged Maersk Denver occurred in January 2026, and Maersk announced a 'structural return' of MECL to the trans-Suez route on January 15, 2026. - ---- - -### Question 39385 - -**Title:** Will real-money betting on Polymarket or Manifold become legal in the United States in 2025? -**URL:** https://www.metaculus.com/questions/39385 - -**Question Contents:** - -> Will real-money betting on Polymarket or Manifold become legal in the United States in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| CanceledResolution.AMBIGUOUS | CanceledResolution.ANNULLED | - -**Cost:** $0.2094 - -**Key Evidence:** -- The synced Metaculus question (post_id 31340, question_id 30969) resolved as Ambiguous on September 24, 2025, before Polymarket's December 2025 CFTC approval and US launch. -- The question explicitly states: 'This question will resolve to the same value as the synced question.' -- Polymarket's US beta launch occurred in December 2025 — after the synced Metaculus question had already closed/resolved. -- Manifold discontinued its real money 'sweepcash' feature on March 28, 2025, and did not pursue legalization in the US. -- A related but separate Metaculus question (question_id 39385, opened October 16, 2025) resolved as Annulled, further supporting ambiguity around this topic. 
- ---- - -### Question 39580 - -**Title:** Will global average wealth per adult increase in 2024 compared to 2023? -**URL:** https://www.metaculus.com/questions/39580 - -**Question Contents:** - -> Will global average wealth per adult increase in 2024 compared to 2023? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| CanceledResolution.AMBIGUOUS | TRUE | - -**Cost:** $0.3324 - -**Key Evidence:** -- UBS Global Wealth Report (2025): Global wealth grew 4.6% in 2024, following a 4.2% increase in 2023, with growth heavily concentrated in North America (+11%). -- Allianz Global Wealth Report 2025 confirms global net financial assets grew in 2024. -- WID/World Inequality Report data shows global wealth-income ratios rose to over 625% of world net domestic product by 2025, continuing upward trends. -- No direct confirmation was found of the specific WID metric `anweal_p0p100_z` values for 2023 and 2024 in search results. -- More than half of the 56 markets sampled by UBS did not participate in global wealth growth in 2024 and actually saw their average wealth per adult decline in USD real terms, which could affect the median differently from the mean. - ---- - -### Question 39355 - -**Title:** Will a federal court rule that the Trump administration violated the Impoundment Control Act in atte -**URL:** https://www.metaculus.com/questions/39355 - -**Question Contents:** - -> Will a federal court rule that the Trump administration violated the Impoundment Control Act in attempting to permanently withhold or cancel congressionally allocated funds by September 27, 2025? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2257 - -**Key Evidence:** -- All federal court rulings related to the Trump administration's impoundment of funds before the deadline were preliminary in nature (TROs or preliminary injunctions), which the resolution criteria explicitly excludes. -- On August 13, 2025, the D.C. Circuit Court of Appeals ruled that humanitarian groups lacked standing (only the U.S. Comptroller General could challenge an impoundment under the ICA) — a procedural ruling, not a violation finding. -- On September 26, 2025, the Supreme Court stayed Judge Ali's August order, allowing the Trump administration to withhold ~$4 billion in foreign aid, and explicitly stated this was not a final determination on the merits. -- The GAO issued findings on July 23, 2025 that the Trump administration violated the ICA by withholding Head Start program funds and Renew America's Schools Program funds, but the GAO is a legislative branch agency — not a federal court — so these findings do not satisfy the resolution criteria. -- Judge Ali's August 2025 expanded preliminary injunction found the freeze 'likely violated both federal law and the Constitution,' but this was still a preliminary ruling and does not qualify as a formal decision on the merits. - ---- - -### Question 39359 - -**Title:** Will 2025 be the warmest year on record globally, according to the ECMWF? -**URL:** https://www.metaculus.com/questions/39359 - -**Question Contents:** - -> Will 2025 be the warmest year on record globally, according to the ECMWF? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0527 - -**Key Evidence:** -- 2025 was the third-warmest year on record according to ECMWF, NOT the warmest. -- 2025 global surface air temperatures were 1.47°C above pre-industrial levels (1850–1900). 
-- 2025 was 0.01°C cooler than 2023 and 0.13°C cooler than 2024, with 2024 remaining the hottest year on record. -- ECMWF published this finding at: https://www.ecmwf.int/en/about/media-centre/news/2025/2025-third-warmest-year -- NOAA also confirmed 2025 as the third-warmest year globally. - ---- - -### Question 39381 - -**Title:** In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Investment -**URL:** https://www.metaculus.com/questions/39381 - -**Question Contents:** - -> In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Investment Tax Credits (48Ε)? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2685 - -**Key Evidence:** -- The One Big Beautiful Bill Act (OBBBA) was signed into law on July 4, 2025, which is before January 1, 2026, but did NOT make domestic content mandatory for all 48E ITC claimants. -- For standard taxpayers claiming the ITC, domestic content compliance remains voluntary/optional, and meeting the requirements simply unlocks a bonus credit (10 or 2 percentage points). -- For direct pay ('applicable') entities (tax-exempt orgs, state/local governments, tribal governments, rural cooperatives), there are mandatory domestic content requirements or 'haircut' provisions, but this was already the case under the original IRA for projects ≥1 MW. -- The resolution criteria's fine print states: 'The domestic requirements becoming mandatory for ITC only for specific categories of projects will not resolve this question.' -- No legislation enacted before January 1, 2026 converted domestic content requirements into a universal eligibility prerequisite for the 48E ITC for all taxpayers. - ---- - -### Question 39384 - -**Title:** Will Russia have significantly expanded its controlled territory in Ukraine on January 1, 2026? 
-**URL:** https://www.metaculus.com/questions/39384 - -**Question Contents:** - -> Will Russia have significantly expanded its controlled territory in Ukraine on January 1, 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1845 - -**Key Evidence:** -- All five 'must control all' locations remain under Russian de facto control as of January 1, 2026 (Simferopol, Nova Kakhovka, Svatove, Donetsk, Tokmak). -- Kharkiv: Russian forces were operating in northern Kharkiv Oblast but did not control the city as of January 2026. -- Zaporizhzhia: Russian forces advanced ~18 km south of the city but did not control the city. -- Sumy: Russian forces captured small border villages near Sumy Oblast but did not control the city. -- Odesa, Mykolaiv, Lozova, and Chernihiv showed no evidence of Russian control. - ---- - -### Question 39389 - -**Title:** Will at least 500,000 Palestinians be displaced from Gaza before December 31st, 2025? -**URL:** https://www.metaculus.com/questions/39389 - -**Question Contents:** - -> Will at least 500,000 Palestinians be displaced from Gaza before December 31st, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2153 - -**Key Evidence:** -- According to the Palestinian Central Bureau of Statistics, approximately 100,000 Palestinians were forced to leave Gaza (cross the border entirely) since October 2023. -- Estimates of ~110,000 Palestinians fled to Egypt as refugees/asylum seekers. -- The Rafah crossing was largely closed after Israel occupied the Palestinian side in May 2024, severely limiting departures after that point. -- ~1.9 million Palestinians were internally displaced within Gaza, but this is movement within Gaza and does not count toward the threshold. 
-- The Gaza population declined by ~254,000 people (10.6%) by end of 2025, reflecting deaths, emigration, and other factors combined — far short of indicating 500,000 external departures. - ---- - -### Question 39406 - -**Title:** At the end of 2025, will any of GiveWell's top charities perform mass deworming as their primary int -**URL:** https://www.metaculus.com/questions/39406 - -**Question Contents:** - -> At the end of 2025, will any of GiveWell's top charities perform mass deworming as their primary intervention? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.3005 - -**Key Evidence:** -- GiveWell's four top charities as of January 1, 2026 are: Against Malaria Foundation (insecticide-treated nets), Malaria Consortium (seasonal malaria chemoprevention), Helen Keller International (vitamin A supplementation), and New Incentives (conditional cash transfers for vaccinations in Nigeria). -- Evidence Action's Deworm the World Initiative was previously a GiveWell top charity, but is no longer; GiveWell does not accept donations designated for this charity. -- Sightsavers receives GiveWell exit funding through ~March 2027 for deworming, but is not a top charity. -- The alternative resolution path (10% of regrants going to deworming) does not apply because GiveWell still maintains a top charities list; even if it did apply, deworming grants appear to be well under 10% of GiveWell's total 2025 grantmaking of $418M. -- None of GiveWell's four top charities perform mass deworming as their primary intervention. - ---- - -### Question 39421 - -**Title:** Will the number of UFO sightings reported by the National UFO Reporting Center exceed 350 for Septem -**URL:** https://www.metaculus.com/questions/39421 - -**Question Contents:** - -> Will the number of UFO sightings reported by the National UFO Reporting Center exceed 350 for September 2025? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1118 - -**Key Evidence:** -- September 2025 had 337 UFO sightings reported to NUFORC — well below the 350 threshold required for a 'Yes' resolution. -- The resolution criteria requires greater than 350 UFO/UAP sighting reports for September 2025. -- For comparison, August 2025 just barely exceeded 350 with 351 reports, while September fell short. -- The data is sourced directly from the NUFORC national event index (nuforc.org/ndx/?id=event), which is the exact source specified in the resolution criteria. -- October 2025 had 415 reports and July 2025 had 381, showing September 2025's 337 was notably lower than surrounding months. - ---- - -### Question 39423 - -**Title:** Will Boeing file for bankruptcy before January 1, 2026? -**URL:** https://www.metaculus.com/questions/39423 - -**Question Contents:** - -> Will Boeing file for bankruptcy before January 1, 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0528 - -**Key Evidence:** -- Boeing has not filed for bankruptcy at any point in 2024 or 2025 and continues to operate as a going concern. -- Boeing faced significant headwinds including the 2024 machinists' strike (which cost ~$5 billion), a 737 MAX door plug blowout incident in January 2024, production delays, FAA restrictions, and mounting debt, but no filing occurred. -- Boeing continued reporting financial results through Q4 2025 and completed acquisitions (Spirit AeroSystems) and strategic divestitures in 2025. -- CEO Kelly Ortberg stated the company 'made significant progress on our recovery in 2025.' -- Boeing's consolidated debt stands at ~$54.1 billion as of end of 2025 — heavy, but being managed. - ---- - -### Question 39427 - -**Title:** Will Bitcoin close 2025 higher than it started? 
-**URL:** https://www.metaculus.com/questions/39427 - -**Question Contents:** - -> Will Bitcoin close 2025 higher than it started? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1168 - -**Key Evidence:** -- Resolution Threshold: $93,429.20 (Bitcoin's price at the start of 2025) -- December 31, 2025 Closing Price: Approximately $87,500–$88,430 — below the threshold -- CoinMarketCap data indicates a closing price of ~$88,430 on December 31, 2025 -- StatMuse data corroborates a closing price of ~$87,502 on December 31, 2025 -- Bitcoin fell approximately 19.1% from September 1 to December 31, 2025, resulting in a year-end price well below the starting price - ---- - -### Question 39430 - -**Title:** Will ARC find that GPT-5 has autonomous replication capabilities? -**URL:** https://www.metaculus.com/questions/39430 - -**Question Contents:** - -> Will ARC find that GPT-5 has autonomous replication capabilities? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2907 - -**Key Evidence:** -- METR (the successor to ARC Evals) evaluated GPT-5 and concluded GPT-5 is NOT capable of autonomous replication and does not pose catastrophic risks via rogue replication threat models. -- GPT-5's 50%-time horizon was measured at 2 hours 17 minutes, far below the concerning threshold of ~40 hours, with even upper-bound uncertainty estimates (~4.5 hours) well below what would trigger concern. -- GPT-5 was released on August 7, 2025, satisfying the timeline requirement for the question. -- There is no evidence that ARC (the original organization) published any evaluation of GPT-5 specifically; the evaluation work has entirely migrated to METR. -- No evaluation from either ARC or METR found GPT-5 to be capable of autonomous replication, making a Yes/TRUE resolution unsupported by current evidence. 
- ---- - -### Question 39441 - -**Title:** Will SpaceX launch the Starship three or more times in the second half of 2025? -**URL:** https://www.metaculus.com/questions/39441 - -**Question Contents:** - -> Will SpaceX launch the Starship three or more times in the second half of 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1607 - -**Key Evidence:** -- Flight 10 launched on August 26, 2025 at 23:30 UTC — clearly within H2 2025 -- Flight 11 launched on October 13, 2025 at 23:23 UTC — clearly within H2 2025 -- Flight 12 did NOT launch in H2 2025 — it is targeted for March 2026 -- No other Starship launches occurred between July 1 and December 31, 2025 -- Only 2 Starship launches occurred in the second half of 2025; the threshold of 3 or more launches was NOT met - ---- - -### Question 39453 - -**Title:** Will the number of deaths from antibiotic-resistant infections per year double by December 31, 2025? -**URL:** https://www.metaculus.com/questions/39453 - -**Question Contents:** - -> Will the number of deaths from antibiotic-resistant infections per year double by December 31, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1773 - -**Key Evidence:** -- 2019 CDC Threats Report: ~35,000 deaths/year (the most comprehensive and most cited figure); ~48,000 when C. 
difficile is included -- 2013 CDC Report: ~23,000 deaths/year from antibiotic-resistant infections in the US (later revised upward to ~44,000 using improved methodology) -- July 2024 CDC update (2021-2022 data): Documented a 20% rise in hospital-onset infections, but did not provide a new comprehensive national death toll -- 2025 CDC report on NDM-CRE: Documented a 460% surge in one specific pathogen between 2019-2023, but this is not a comprehensive mortality estimate -- No report found — from the CDC or any other credible source — citing 70,000 or more annual US deaths from antibiotic-resistant infections, at any point through 2025 - ---- - -### Question 39463 - -**Title:** Will 1 Euro be worth less than 1 US Dollar before 2026? -**URL:** https://www.metaculus.com/questions/39463 - -**Question Contents:** - -> Will 1 Euro be worth less than 1 US Dollar before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1372 - -**Key Evidence:** -- The lowest EUR/USD rate in 2024 was 1.0350, recorded on December 31, 2024. The euro never fell to or below parity (1.0000) during 2024. -- The lowest EUR/USD rate in January 2025 was 1.0257, recorded on January 10, 2025. This was the weakest point of 2025 and still remained above parity. -- After the January 2025 low, the EUR/USD rate recovered significantly throughout 2025, reaching approximately 1.13–1.17 range by late 2025, ending the year around 1.1714. -- While analysts in early 2025 warned parity was possible, it never materialized. The floor was approximately 1.0257. -- At no point during the January 1, 2024 – January 1, 2026 window did the EUR/USD exchange rate fall below 1.0000. - ---- - -### Question 39466 - -**Title:** Will Walmart stores in the United States return to 24-hour service before 2026? 
-**URL:** https://www.metaculus.com/questions/39466 - -**Question Contents:** - -> Will Walmart stores in the United States return to 24-hour service before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1970 - -**Key Evidence:** -- Most Walmart locations in the U.S. continued to operate on limited hours (typically 6:00 a.m. to midnight or similar schedules), rather than around the clock, as of January 2026. -- In August 2022, Walmart stated it had no plans to return to 24/7 operations, citing labor shortages, and no subsequent announcements reversed this policy through the resolution date. -- On December 24, 2025, Walmart stores operated only from 6:00 a.m. to 6:00 p.m., and all 4,600+ U.S. Walmart stores were completely closed on December 25, 2025. -- No official announcement or credible reporting confirmed a return to majority 24/7 operations as of January 2026. -- Store hours remained limited (not 24/7) throughout 2024 and 2025, with staffing shortages and higher operational costs cited as reasons for maintaining limited hours. - ---- - -### Question 39473 - -**Title:** Will Ukraine launch a nuclear weapons program before 2026? -**URL:** https://www.metaculus.com/questions/39473 - -**Question Contents:** - -> Will Ukraine launch a nuclear weapons program before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1384 - -**Key Evidence:** -- Ukraine's Foreign Ministry repeatedly reaffirmed its commitment to the Nuclear Non-Proliferation Treaty (NPT) and stated it does not intend to acquire nuclear weapons. -- President Zelensky stated in February 2026: 'Unfortunately, there are no nuclear weapons in Ukraine.' -- The IAEA maintained continuous monitoring at Ukraine's nuclear sites throughout 2025, with no findings of weapons-related activities. 
-- All highly enriched uranium was removed from Ukraine by 2012, and IAEA safeguards remain in place at all Ukrainian nuclear sites. -- Statements about Ukraine's nuclear capability (e.g., a Ukrainian think tank in November 2024 and a 2024 Ministry of Defence report) referred to technical capability assessments, not an active program. - ---- - -### Question 39489 - -**Title:** In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Production -**URL:** https://www.metaculus.com/questions/39489 - -**Question Contents:** - -> In 2025, will the domestic content requirements become mandatory for obtaining the IRA's Production Tax Credits (45Y)? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2259 - -**Key Evidence:** -- The domestic content bonus under Section 45Y is optional, not mandatory — meeting the domestic content thresholds entitles taxpayers to a bonus credit of up to 10% on top of the base credit, but taxpayers can still claim the base 45Y credit without meeting domestic content requirements. -- The One Big Beautiful Bill Act (OBBA) was signed into law on July 4, 2025, which is before January 1, 2026, but it did NOT make domestic content requirements mandatory for claiming the base 45Y credit. -- The OBBA introduced separate FEOC (Foreign Entity of Concern) restrictions, which are mandatory exclusions, but these are distinct from the existing IRA domestic content bonus credit requirements. -- The OBBA also terminated the 45Y credit for wind and solar projects placed in service after December 31, 2027, but kept domestic content as optional bonus credits. -- No other legislation enacted before January 1, 2026 made domestic content requirements mandatory for 45Y credits. - ---- - -### Question 39499 - -**Title:** Will China operate at least one military base in a BRICS country before 2026? 
-**URL:** https://www.metaculus.com/questions/39499 - -**Question Contents:** - -> Will China operate at least one military base in a BRICS country before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2188 - -**Key Evidence:** -- China's only confirmed overseas military base is in Djibouti (operational since 2017), and Djibouti is not a BRICS member. -- Pentagon's 2024 Annual China Report (December 2024) states China 'probably has considered' various countries for PLA military logistics facilities — language of planning/consideration, not confirmation. -- Pentagon's 2025 Annual China Report (December 2025) lists Pakistan, Bangladesh, Cuba, Angola, Indonesia, and others as countries where China has 'likely also considered basing' — again not confirmation of operational bases. -- UAE was assessed by US intelligence as a suspected location (~2020-2022), but no confirmed operational base was established, and construction was reportedly halted. -- No BRICS member countries (Brazil, Russia, India, China, South Africa, Egypt, Ethiopia, Iran, UAE, Saudi Arabia) have a confirmed Chinese military base per any official public announcement. - ---- - -### Question 39520 - -**Title:** Will Argentina's year-over-year inflation be below 30% for December 2025? -**URL:** https://www.metaculus.com/questions/39520 - -**Question Contents:** - -> Will Argentina's year-over-year inflation be below 30% for December 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0542 - -**Key Evidence:** -- Argentina's year-over-year inflation for December 2025 was 31.5%, according to INDEC data cited by Investing.com. -- Itaú BBA Analysis confirmed Argentina's 2025 annual inflation fell to 31.5%, from 117.8% in 2024. -- BBVA Research and the Buenos Aires Herald both reported Argentina's 2025 inflation was 31.5%, the lowest in 8 years. 
-- The question's resolution threshold requires YoY inflation to be strictly less than 30.0%; 31.5% exceeds this threshold. -- Monthly (MoM) inflation for December 2025 was 2.8%, with core inflation at 3.0% MoM. - ---- - -### Question 39539 - -**Title:** Will Israel establish direct governmental control of the Gaza Strip in 2025? -**URL:** https://www.metaculus.com/questions/39539 - -**Question Contents:** - -> Will Israel establish direct governmental control of the Gaza Strip in 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1359 - -**Key Evidence:** -- Israeli military control expanded to approximately 75% of Gaza by mid-2025, but this was de facto military control — which explicitly does not count per the resolution criteria. -- A Gaza peace plan signed on October 9, 2025 established a National Committee for the Administration of Gaza (NCAG) — 15 Palestinian technocrats, not Israelis — headed by Ali Shaath, a former Palestinian Authority deputy minister. -- Israel did not formally annex any part of the Gaza Strip in 2025; annexation debates focused on the West Bank, not Gaza. -- While Defense Minister Israel Katz proposed military settlements in Gaza for 'security needs' in December 2024, this was never formally enacted as law or policy during 2025. -- No formal civilian government headed by Israelis was established over 50% or more of the Gaza Strip during 2025. - ---- - -### Question 39545 - -**Title:** Will an H5 virus receive an "emergence" risk rating categorized as "high" by the US CDC Influenza Ri -**URL:** https://www.metaculus.com/questions/39545 - -**Question Contents:** - -> Will an H5 virus receive an "emergence" risk rating categorized as "high" by the US CDC Influenza Risk Assessment Tool before January 1, 2026? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0963 - -**Key Evidence:** -- The most recent H5N1 IRAT assessment before the deadline was for A/Texas/37/2024, assessed in June 2024, which received an emergence score of 5.79 — well within the moderate risk category (4.0–7.9). -- In March 2025, two new clade 2.3.4.4b viruses were assessed: A/California/147/2024 (emergence score of 5.59) and A/Washington/239/2024 (emergence score of 5.21) — lower than the Texas virus and still in the moderate risk category. -- The highest emergence score ever recorded by the IRAT is 7.50 for an H1N1 swine flu strain (A/swine/Shandong/1207/2016). No influenza virus of any type has ever received a score ≥ 8.0 on IRAT. -- No H5 virus has ever approached the 8.0 threshold, let alone exceeded it. - ---- - -### Question 39559 - -**Title:** Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? -**URL:** https://www.metaculus.com/questions/39559 - -**Question Contents:** - -> Will Alexandre de Moraes cease to be a minister of the Supreme Federal Court of Brazil before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0898 - -**Key Evidence:** -- As recently as February 24, 2026, Moraes was presiding over the trial of suspects in the Marielle Franco killing, confirming he remains a sitting justice. -- U.S. Magnitsky Act sanctions were imposed in late July 2025, but Moraes publicly declared he would ignore the sanctions and remain in his position — which he did. -- While political groups called for his removal and Elon Musk publicly criticized him, no formal impeachment proceeded to completion. -- Supreme Federal Court justices in Brazil are appointed for life with mandatory retirement at 75; Moraes is 55, far from mandatory retirement. 
-- There is no evidence whatsoever that Alexandre de Moraes ceased to be a minister of Brazil's Supreme Federal Court before the resolution cutoff date of January 1, 2026. - ---- - -### Question 39574 - -**Title:** Will Brazil ban the 6x1 work schedule before 1 January 2026? -**URL:** https://www.metaculus.com/questions/39574 - -**Question Contents:** - -> Will Brazil ban the 6x1 work schedule before 1 January 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0570 - -**Key Evidence:** -- A proposed constitutional amendment (PEC 8/2025) to eliminate the 6x1 work schedule has advanced through Brazil's legislature, but has not been enacted into law. -- By late 2025, the Senate's Constitution and Justice Committee had approved the amendment proposal — but committee-level approval is not the same as the full legislative enactment required for it to become law. -- The Federal Government publicly declared support for ending the 6x1 model, but verbal/executive support does not constitute an official ban. -- As of early 2026, the legislative timeline projects admissibility analysis completion at end of March 2026, with a potential Chamber of Deputies vote in May 2026. -- Some Brazilian companies have voluntarily transitioned away from the 6x1 schedule, but this is not a legal ban. - ---- - -### Question 39578 - -**Title:** Will the bubble in the Magnificent Seven pop before 2026? -**URL:** https://www.metaculus.com/questions/39578 - -**Question Contents:** - -> Will the bubble in the Magnificent Seven pop before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2524 - -**Key Evidence:** -- None of the Magnificent Seven stocks came close to falling 50% below their all-time high during the resolution window (before January 1, 2026). 
-- The largest declines from ATH were in Microsoft (~29%) and Tesla (~19.5%), still well short of the 50% threshold. -- Most all-time highs for these stocks were set during 2025 — meaning prices were generally rising, not collapsing. -- Some all-time highs were set in late 2025 or early 2026 (e.g., Alphabet's ATH on February 3, 2026), confirming prices were still near historic highs rather than crashing. -- The question's resolution window (before January 1, 2026) has already passed, and there is no evidence that any of these stocks fell 50% from their all-time highs during that period. - ---- - -### Question 39587 - -**Title:** Will Iran possess a nuclear weapon before 2026? -**URL:** https://www.metaculus.com/questions/39587 - -**Question Contents:** - -> Will Iran possess a nuclear weapon before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1783 - -**Key Evidence:** -- U.S. Intelligence Assessment (May 2025): 'Iran almost certainly is not producing nuclear weapons, but Iran has undertaken activities in recent years that better position it to produce them, if it chooses to do so.' -- Arms Control Association fact sheet confirms Iran is NOT a nuclear-weapons state, noting it can produce weapons-grade uranium for multiple bombs within weeks, but has not done so. -- Iran possessed ~972 lbs of 60%-enriched uranium as of mid-June 2025, but never enriched to weapons grade. -- Iran continues to officially deny any intention to develop nuclear weapons, insisting its program is entirely peaceful. -- There is no evidence that Iran announced possession of, or conducted a test of, a nuclear weapon at any point before 2026. - ---- - -### Question 39589 - -**Title:** Will US offer India a nuclear submarine before 2026? -**URL:** https://www.metaculus.com/questions/39589 - -**Question Contents:** - -> Will US offer India a nuclear submarine before 2026? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1760 - -**Key Evidence:** -- All searches confirm there was no formal US offer, agreement, or deal with India regarding nuclear-powered submarines during the resolution period (January 1, 2022 to January 1, 2026). -- The US has consistently maintained that American domestic laws prevent sharing nuclear submarine propulsion technology with any nation outside formal treaty relationships — including India, despite the 2008 US-India Civil Nuclear Agreement. -- The AUKUS nuclear submarine agreement (US, UK, Australia) was not extended to include India. India is not part of AUKUS and no comparable arrangement was offered to India. -- India sealed a deal to lease a nuclear-powered Akula-class submarine (Chakra-III) from Russia in December 2025, not from the US. -- The Modi-Trump joint statement (February 13, 2025) focused exclusively on civil nuclear energy cooperation (reactors), not military/submarine nuclear technology. - ---- - -### Question 39609 - -**Title:** Will evidence of alien technology be recovered near the IM1 crash site before 2026? -**URL:** https://www.metaculus.com/questions/39609 - -**Question Contents:** - -> Will evidence of alien technology be recovered near the IM1 crash site before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.2222 - -**Key Evidence:** -- The Chemical Geology paper (September 2024) by Avi Loeb and the Galileo Project involved four independent laboratories (Harvard, UC Berkeley, Bruker Corporation, and University of Technology in Papua New Guinea), satisfying the ≥3 labs criterion, but did not conclude with 95% confidence that the material was non-terrestrial. -- An October 2024 paper in Research Notes of the AAS (Patricio Gallardo) concluded the spherules match coal ash contaminants, with meteoritic origin 'disfavored.' 
-- A November 2024 preprint (Desch & Jackson) showed iron isotope ratios align with terrestrial fractionation lines, concluding that 'an interstellar origin can be effectively ruled out.' -- Research published in 2024 found the seismic signal used to locate the crash site was likely caused by a truck on a nearby road, not the meteor, meaning the expedition may have searched 100+ miles from the actual impact zone. -- No peer-reviewed study concluded with 95% confidence that the recovered material reflects a technological origin — Loeb's speculations about technological origin were not formal conclusions of any peer-reviewed paper. - ---- - -### Question 39623 - -**Title:** Will 3M file for bankruptcy before 2026? -**URL:** https://www.metaculus.com/questions/39623 - -**Question Contents:** - -> Will 3M file for bankruptcy before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0958 - -**Key Evidence:** -- 3M Company (CIK 66740) did NOT file for bankruptcy under Chapter 11 or Chapter 7 at any point before January 1, 2026. -- Aearo Technologies (subsidiary) filed Chapter 11 on July 26, 2022, but this was a 3M subsidiary, not 3M itself; the bankruptcy court dismissed this filing in June 2023. -- 3M explicitly disclaimed bankruptcy intent in a formal SEC 8-K filing (August 29, 2023): '3M has no present intention to file a petition for relief under the Bankruptcy Code.' -- 3M settled the Combat Arms Earplug lawsuits for $6 billion (announced August 29, 2023), with ~$5.8 billion already paid out to veterans by May 2025. -- 3M remained financially healthy throughout this period: Full-year 2024 sales of $24.6 billion with $4.9 billion in adjusted free cash flow; full-year 2025 sales of $24.9 billion with adjusted EPS of $8.06. - ---- - -### Question 40604 - -**Title:** Will Iran allow the IAEA to inspect its stockpile of enriched uranium before 2026? 
-**URL:** https://www.metaculus.com/questions/40604 - -**Question Contents:** - -> Will Iran allow the IAEA to inspect its stockpile of enriched uranium before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.1421 - -**Key Evidence:** -- The September 9, 2025 'Cairo agreement' to resume nuclear inspections at the bombed sites was suspended by Iran on September 28, 2025, following UN Security Council reimposition of sanctions via the 'snapback' mechanism. -- By October 20, 2025, Reuters reported that Iran declared the IAEA cooperation deal void/scrapped. -- In November 2025, Iran formally canceled the Cairo agreement after the IAEA Board of Governors demanded full inspector access to all nuclear facilities. -- The IAEA conducted 12 inspection visits in Iran during this period, but none at the bombed sites (Fordow, Natanz, Isfahan). -- As of January 2026, the IAEA stated it could no longer verify Iran's nuclear activities, and satellite imagery showed construction activity at damaged sites with no IAEA inspector access. - ---- - -### Question 40867 - -**Title:** Will the People's Party win an absolute majority in the next regional elections in Extremadura, Spai -**URL:** https://www.metaculus.com/questions/40867 - -**Question Contents:** - -> Will the People's Party win an absolute majority in the next regional elections in Extremadura, Spain? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| FALSE | FALSE | - -**Cost:** $0.0897 - -**Key Evidence:** -- Official results (99.89% counted) show PP won 29 seats with 43.12–43.18% of the vote -- The absolute majority threshold in the 65-seat Assembly of Extremadura is 33 seats -- PP fell 4 seats short of an absolute majority -- Multiple authoritative Spanish sources (RTVE, El País, El Periódico, El Diario) and Wikipedia confirm the 29-seat result -- PP's María Guardiola will need support from Vox (11 seats) or other parties to form a government - ---- - -### Question 39353 - -**Title:** Will Elon Musk be the world's richest person on December 31, 2025? -**URL:** https://www.metaculus.com/questions/39353 - -**Question Contents:** - -> Will Elon Musk be the world's richest person on December 31, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0941 - -**Key Evidence:** -- Bloomberg estimated Elon Musk's net worth at approximately $622–627 billion as of December 29–30, 2025, ranking him #1 on the Bloomberg Billionaires Index. -- SpaceX's private share sale in mid-December 2025 valued the company at $800 billion, adding over $100 billion to Musk's fortune. -- On December 20, 2025, the Delaware Supreme Court reinstated his 2018 Tesla stock options, adding ~$138–139 billion and pushing him past $700 billion. -- Larry Ellison briefly surpassed Musk in September 2025 but finished at #5 by year-end, with no other individual close to Musk's net worth. -- There is no indication of any other individual being co-ranked at #1 alongside Musk, and no indication of Bloomberg Index unavailability on December 31, 2025. - ---- - -### Question 39449 - -**Title:** Will Spain's economy grow more than the Eurozone's between Q3 2024 and Q3 2025? 
-**URL:** https://www.metaculus.com/questions/39449 - -**Question Contents:** - -> Will Spain's economy grow more than the Eurozone's between Q3 2024 and Q3 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.1287 - -**Key Evidence:** -- Spain's year-on-year GDP growth in Q3 2025 was 2.8%, per INE and consistent with Eurostat data. -- The Eurozone's year-on-year GDP growth in Q3 2025 was 1.4%, per official Eurostat figures (published December 5, 2025 release: 2-05122025-ap). -- Spain's 2.8% exceeds the Eurozone's 1.4% by a margin of +1.4 percentage points. -- On a quarterly basis, Spain (0.6%) also outpaced the Eurozone (0.3%), confirming the result across both measurement methods. -- Eurostat published Q3 2025 data in December 2025, before the January 7, 2026 deadline, so the question is not void. - ---- - -### Question 39450 - -**Title:** Will Microsoft stock close 2025 higher than it started? -**URL:** https://www.metaculus.com/questions/39450 - -**Question Contents:** - -> Will Microsoft stock close 2025 higher than it started? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0824 - -**Key Evidence:** -- MSFT closing price on December 31, 2025 (last business day of 2025) was $483.62 (Source: StatMuse) -- MSFT opening price on January 2, 2025 (first trading day) was $422.41 (Source: StatMuse) -- The resolution threshold is $421.50, and $483.62 > $421.50, representing a margin of ~$62.12 (~14.7%) -- No stock splits occurred in 2025 that would require price adjustment -- December 31, 2025 was confirmed to be the last business day of 2025 - ---- - -### Question 39511 - -**Title:** Will transformer derived architectures still be state of the art for language modeling in December 2 -**URL:** https://www.metaculus.com/questions/39511 - -**Question Contents:** - -> Will transformer derived architectures still be state of the art for language modeling in December 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.3846 - -**Key Evidence:** -- OpenAI GPT-4.5, GPT-4o, o1, o3-mini, DeepSeek R1, Qwen2.5, Llama 3, and Gemini 1.5 — all transformer-derived — are the top-cited 2025 models dominating benchmarks. -- The 'de facto standard stack' in 2025 LLMs is pre-norm (RMSNorm) + RoPE + SwiGLU + KV-sharing — all transformer refinements, including MoE extensions. -- Mamba/SSM-based models lag significantly on associative recall, in-context learning, and most standard benchmarks, and have not claimed SOTA on major NLP benchmarks. -- A Metaculus forecast explicitly states 95% probability that transformers would maintain SOTA through late 2025. -- The nlpprogress.com WikiText-2 top entry in December 2025 describes a transformer-derived language model, as evidenced by direct reference in the Metaculus question discussion. - ---- - -### Question 39523 - -**Title:** Will China be able to mass produce humanoid robots by the end of 2025? 
-**URL:** https://www.metaculus.com/questions/39523 - -**Question Contents:** - -> Will China be able to mass produce humanoid robots by the end of 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.1829 - -**Key Evidence:** -- UBTECH reached its 1,000th Walker S2 robot milestone by late 2025, with Omdia confirming 1,000 shipments — the Walker S2 is a single model, meeting the 'identical model' requirement. -- Unitree Robotics shipped over 5,500 humanoid robots in 2025 with total production exceeding 6,500 units; given a small number of primary models (G1 and H1/H1-2), the G1 alone very likely exceeded 1,000 units. -- AgiBot announced the rollout of its 5,000th mass-produced humanoid robot in December 2025, with the A-Series alone reaching 1,742 units. -- China accounted for over 80% of global humanoid robot installations in 2025, with approximately 10,000–13,000 units installed in China out of 12,800–16,000 globally. -- China received over 30,000 orders for humanoid robots in 2025, with over 140 domestic manufacturers releasing more than 330 different models. - ---- - -### Question 39529 - -**Title:** Will the S&P 500 Index increase over the year 2025? -**URL:** https://www.metaculus.com/questions/39529 - -**Question Contents:** - -> Will the S&P 500 Index increase over the year 2025? 
- -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.1211 - -**Key Evidence:** -- S&P 500 closed at 5,881.63 on the last trading day of 2024 (Dec 31, 2024) -- S&P 500 closed at 6,845.50 on the last trading day of 2025 (Dec 31, 2025) -- Change was +963.87 points (+16.4% price return), with total return including dividends of +17.9% -- The S&P 500 posted its third consecutive year of double-digit gains in 2025 -- Multiple sources (StatMuse, RBC Wealth Management, FT Portfolios) confirm the year-end close above 2024's level - ---- - -### Question 39543 - -**Title:** Will Nvidia stock close 2025 higher than it started? -**URL:** https://www.metaculus.com/questions/39543 - -**Question Contents:** - -> Will Nvidia stock close 2025 higher than it started? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.1206 - -**Key Evidence:** -- Nvidia's closing price on December 31, 2025 was $186.50 (sourced from Digrin / StatMuse) -- The resolution threshold is $134.29, described as Nvidia's starting price for 2025 (the close on the last business day of 2024) -- Dec 31, 2024 closing price was $134.25 (sourced from StatMuse) -- $186.50 exceeds $134.29 by approximately 39% -- No stock splits appear to have occurred in 2025 that would complicate this comparison - ---- - -### Question 39560 - -**Title:** Will the highest-grossing film of 2025 be part of a franchise? -**URL:** https://www.metaculus.com/questions/39560 - -**Question Contents:** - -> Will the highest-grossing film of 2025 be part of a franchise? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.1629 - -**Key Evidence:** -- Ne Zha 2 is the highest-grossing film of 2025 worldwide with approximately $2.0–2.26 billion in total worldwide gross, confirmed by Box Office Mojo and The Numbers. 
-- Ne Zha 2 is the direct sequel to Ne Zha (2019), which earned over $742 million worldwide, forming a two-part animated franchise based on the Chinese mythological character Nezha. -- Both films are directed by the same director (Jiaozi/Yu Yang) and Ne Zha 2 picks up immediately where the original ended, making it unambiguously a franchise film. -- The data is confirmed by Box Office Mojo, satisfying the specified reliable source requirement in the resolution criteria. - ---- - -### Question 39569 - -**Title:** Will the IBEX 35 close 2025 higher than it started? -**URL:** https://www.metaculus.com/questions/39569 - -**Question Contents:** - -> Will the IBEX 35 close 2025 higher than it started? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0449 - -**Key Evidence:** -- IBEX 35 opening value (start of 2025): 11,595.00 -- IBEX 35 closing value (December 31, 2025): ~17,315 points according to Trading Economics -- Year-over-year gain of ~49.3% (nearly 50%) -- The closing value of ~17,315 is dramatically higher than the resolution threshold of 11,595.00 -- A gain of nearly 50% for the full year 2025 is a milestone not seen since 1993 - ---- - -### Question 40210 - -**Title:** Will the United States bailout Argentina before 2026? -**URL:** https://www.metaculus.com/questions/40210 - -**Question Contents:** - -> Will the United States bailout Argentina before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0933 - -**Key Evidence:** -- On October 9, 2025, the Trump administration announced a $20 billion Exchange Stabilization Agreement (ESA) between the U.S. Treasury's Exchange Stabilization Fund (ESF) and Argentina's Central Bank (BCRA). -- In October 2025, the BCRA drew $2.5 billion from this swap line, exchanging Argentine pesos for U.S. dollars. -- The U.S. 
also provided an additional $872 million in dollar liquidity via SDR transactions to help Argentina make IMF payments. -- By December 2025, Argentina fully repaid the $2.5 billion draw, and the swap transaction was closed. -- Treasury Secretary Scott Bessent confirmed in January 2026 that Argentina had 'quickly and fully repaid its limited draw.' - ---- - -### Question 40252 - -**Title:** Will the following companies be added to the S&P 500 before 2026? (CVNA) -**URL:** https://www.metaculus.com/questions/40252 - -**Question Contents:** - -> Will the following companies be added to the S&P 500 before 2026? (CVNA) - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0562 - -**Key Evidence:** -- Official announcement from S&P Dow Jones Indices confirmed on December 5, 2025 that Carvana (CVNA) would be added to the S&P 500 -- The addition took effect on December 22, 2025 (before market open) -- Both the announcement AND the actual addition occurred before January 1, 2026, which is the resolution deadline -- Carvana joined alongside CRH and Comfort Systems USA (FIX) as part of the index's quarterly rebalancing -- The announcement came directly from S&P Dow Jones Indices' press release, which is the authoritative source per the resolution criteria - ---- - -### Question 40283 - -**Title:** Will Hamas no longer have any hostages before 2026? -**URL:** https://www.metaculus.com/questions/40283 - -**Question Contents:** - -> Will Hamas no longer have any hostages before 2026? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0917 - -**Key Evidence:** -- All 20 living hostages released on October 13, 2025 — well before the January 1, 2026 deadline. -- A peace deal was signed by Israel and Hamas on October 9, 2025, in Sharm el-Sheikh, Egypt. 
-- On October 13, 2025, all 20 surviving living hostages were released in two batches (7 in Gaza City at 8 a.m., and 13 in southern Gaza at ~11 a.m.), transferred via the Red Cross to Israeli custody. -- No living hostages remained in Hamas custody after October 13, 2025 — prior to the January 1, 2026 deadline. -- Israel released ~2,000 Palestinian prisoners in exchange, including 250 life-sentence prisoners and ~1,700 detainees. - ---- - -### Question 40285 - -**Title:** Will Ukraine extend martial law beyond November 5, 2025? -**URL:** https://www.metaculus.com/questions/40285 - -**Question Contents:** - -> Will Ukraine extend martial law beyond November 5, 2025? - -| Output Resolution | Correct Resolution | -|-------------------|--------------------| -| TRUE | TRUE | - -**Cost:** $0.0489 - -**Key Evidence:** -- Ukraine's parliament voted to extend martial law for 90 days starting November 5, 2025, with 317 votes in favor. -- Parliament subsequently approved another 90-day extension of both martial law and general mobilization, now set to expire May 4, 2026. -- Martial law was originally declared on February 24, 2022, in response to the Russian invasion and has been extended systematically every 90 days. -- Sources include Ukrinform, Anadolu Agency, and NV Ukraine (English) confirming the extensions. -- Ukraine's constitution prohibits national elections while martial law is in effect, providing additional context for the ongoing extensions. - ----