From f1a3fb58bc8ca125df100d4f1f43109ca343da2e Mon Sep 17 00:00:00 2001 From: Debug Agent Date: Thu, 2 Apr 2026 11:51:21 -0300 Subject: [PATCH] fix: enable LiteLLM virtual key cost tracking for default agent The evaluation orchestrator already creates per-instance LiteLLM virtual keys, but only ACP agents injected them into their LLM client. The default OpenHands agent used the master API key directly, so proxy_cost was always $0.00 for non-ACP runs. Add `apply_virtual_key()` to litellm_proxy.py that returns a copy of the LLM config with the virtual key as api_key (thread-safe via model_copy + threading.local). Update all benchmarks to use it when creating the default Agent. Fixes #603 Co-Authored-By: Claude Opus 4.6 --- benchmarks/commit0/run_infer.py | 3 ++- benchmarks/gaia/run_infer.py | 3 ++- benchmarks/multiswebench/run_infer.py | 3 ++- benchmarks/openagentsafety/run_infer.py | 3 ++- benchmarks/swebench/run_infer.py | 5 ++--- benchmarks/swebenchmultilingual/run_infer.py | 3 ++- benchmarks/swebenchmultimodal/run_infer.py | 3 ++- benchmarks/swefficiency/run_infer.py | 3 ++- benchmarks/swtbench/run_infer.py | 3 ++- benchmarks/utils/litellm_proxy.py | 19 +++++++++++++++++++ 10 files changed, 37 insertions(+), 11 deletions(-) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 4c65fe954..a7b6b5061 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -33,6 +33,7 @@ get_default_on_result_writer, ) from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -393,7 +394,7 @@ def evaluate_instance( keep_first=self.metadata.condenser_keep_first, ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index d0c7e676b..b8d88a6d4 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -36,6 +36,7 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.version import IMAGE_TAG_PREFIX @@ -336,7 +337,7 @@ def evaluate_instance( keep_first=self.metadata.condenser_keep_first, ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 481dfa000..ac3713bb0 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -26,6 +26,7 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -294,7 +295,7 @@ def evaluate_instance( ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index 30da0db89..99b5402d9 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -25,6 +25,7 @@ from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from openhands.sdk import Agent, Conversation, Tool, get_logger @@ -455,7 +456,7 @@ def evaluate_instance( tools.append(Tool(name=DelegateTool.name)) # Create agent - agent = Agent(llm=self.metadata.llm, tools=tools) + agent = Agent(llm=apply_virtual_key(self.metadata.llm), tools=tools) # Collect events received_events = [] diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 72e321b69..da05f77b5 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -20,6 +20,7 @@ setup_acp_workspace, workspace_keepalive, ) +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser from benchmarks.utils.build_utils import ensure_local_image from benchmarks.utils.console_logging import summarize_instance @@ -268,12 +269,10 @@ def evaluate_instance( keep_first=self.metadata.condenser_keep_first, ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, - # TODO: we can enable security analyzer later - # security_analyzer=LLMSecurityAnalyzer(), ) assert isinstance(workspace, RemoteWorkspace) diff --git a/benchmarks/swebenchmultilingual/run_infer.py b/benchmarks/swebenchmultilingual/run_infer.py index b037bd08a..65fb495c6 100644 --- a/benchmarks/swebenchmultilingual/run_infer.py +++ b/benchmarks/swebenchmultilingual/run_infer.py @@ -26,6 +26,7 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -250,7 +251,7 @@ def evaluate_instance( if self.metadata.enable_delegation: tools.append(Tool(name=DelegateTool.name)) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, # TODO: we can enable condenser and security analyzer later diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 6be511176..344291103 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -32,6 +32,7 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -254,7 +255,7 @@ def evaluate_instance( keep_first=self.metadata.condenser_keep_first, ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py index 322663d1d..ce3f5f4fd 100644 --- a/benchmarks/swefficiency/run_infer.py +++ b/benchmarks/swefficiency/run_infer.py @@ -22,6 +22,7 @@ ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -308,7 +309,7 @@ def evaluate_instance( """ tools = get_default_tools(enable_browser=False) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, ) diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 8373d04f0..cea1583c2 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -29,6 +29,7 @@ create_docker_workspace, remote_image_exists, ) +from benchmarks.utils.litellm_proxy import apply_virtual_key from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -263,7 +264,7 @@ def evaluate_instance( keep_first=self.metadata.condenser_keep_first, ) agent = Agent( - llm=self.metadata.llm, + llm=apply_virtual_key(self.metadata.llm), tools=tools, system_prompt_kwargs={"cli_mode": True}, condenser=condenser, diff --git a/benchmarks/utils/litellm_proxy.py b/benchmarks/utils/litellm_proxy.py index 966c6e896..9d96082ee 100644 --- a/benchmarks/utils/litellm_proxy.py +++ b/benchmarks/utils/litellm_proxy.py @@ -155,3 +155,22 @@ def set_current_virtual_key(key: str | None) -> None: def get_current_virtual_key() -> str | None: """Return the virtual key for the current worker thread, or None.""" return getattr(_thread_local, "virtual_key", None) + + +def apply_virtual_key(llm): # type: ignore[no-untyped-def] + """Return an LLM config copy with the per-instance virtual key as api_key. + + If no virtual key is active for this thread, returns the original config + unchanged. This is thread-safe: ``model_copy`` creates a new instance + and ``get_current_virtual_key`` reads from ``threading.local``. + + Use this when creating a default (non-ACP) ``Agent`` so that all LLM + calls go through the proxy with the per-instance virtual key, enabling + accurate per-instance cost tracking. + """ + virtual_key = get_current_virtual_key() + if virtual_key is None: + return llm + from pydantic import SecretStr + + return llm.model_copy(update={"api_key": SecretStr(virtual_key)})