From f1a3fb58bc8ca125df100d4f1f43109ca343da2e Mon Sep 17 00:00:00 2001
From: Debug Agent <debug@example.com>
Date: Thu, 2 Apr 2026 11:51:21 -0300
Subject: [PATCH] fix: enable LiteLLM virtual key cost tracking for default
 agent

The evaluation orchestrator already creates per-instance LiteLLM virtual
keys, but only ACP agents injected them into their LLM client. The
default OpenHands agent used the master API key directly, so proxy_cost
was always $0.00 for non-ACP runs.

Add `apply_virtual_key()` to litellm_proxy.py that returns a copy of
the LLM config with the virtual key as api_key (thread-safe via
model_copy + threading.local). Update all benchmarks to use it when
creating the default Agent. Also remove the commented-out
security-analyzer TODO from benchmarks/swebench/run_infer.py.

Fixes #603

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/commit0/run_infer.py              |  3 ++-
 benchmarks/gaia/run_infer.py                 |  3 ++-
 benchmarks/multiswebench/run_infer.py        |  3 ++-
 benchmarks/openagentsafety/run_infer.py      |  3 ++-
 benchmarks/swebench/run_infer.py             |  5 ++---
 benchmarks/swebenchmultilingual/run_infer.py |  3 ++-
 benchmarks/swebenchmultimodal/run_infer.py   |  3 ++-
 benchmarks/swefficiency/run_infer.py         |  3 ++-
 benchmarks/swtbench/run_infer.py             |  3 ++-
 benchmarks/utils/litellm_proxy.py            | 19 +++++++++++++++++++
 10 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 4c65fe954..a7b6b5061 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -33,6 +33,7 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -393,7 +394,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index d0c7e676b..b8d88a6d4 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -36,6 +36,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from benchmarks.utils.version import IMAGE_TAG_PREFIX
@@ -336,7 +337,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 481dfa000..ac3713bb0 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -26,6 +26,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -294,7 +295,7 @@ def evaluate_instance(
             )
 
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
             condenser=condenser,
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 30da0db89..99b5402d9 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -25,6 +25,7 @@
 from benchmarks.utils.evaluation import Evaluation
 from benchmarks.utils.evaluation_utils import construct_eval_output_dir
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from openhands.sdk import Agent, Conversation, Tool, get_logger
@@ -455,7 +456,7 @@ def evaluate_instance(
             tools.append(Tool(name=DelegateTool.name))
 
         # Create agent
-        agent = Agent(llm=self.metadata.llm, tools=tools)
+        agent = Agent(llm=apply_virtual_key(self.metadata.llm), tools=tools)
 
         # Collect events
         received_events = []
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 72e321b69..da05f77b5 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -20,6 +20,7 @@
     setup_acp_workspace,
     workspace_keepalive,
 )
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
 from benchmarks.utils.build_utils import ensure_local_image
 from benchmarks.utils.console_logging import summarize_instance
@@ -268,12 +269,10 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
-                # TODO: we can enable security analyzer later
-                # security_analyzer=LLMSecurityAnalyzer(),
             )
 
         assert isinstance(workspace, RemoteWorkspace)
diff --git a/benchmarks/swebenchmultilingual/run_infer.py b/benchmarks/swebenchmultilingual/run_infer.py
index b037bd08a..65fb495c6 100644
--- a/benchmarks/swebenchmultilingual/run_infer.py
+++ b/benchmarks/swebenchmultilingual/run_infer.py
@@ -26,6 +26,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -250,7 +251,7 @@ def evaluate_instance(
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
             # TODO: we can enable condenser and security analyzer later
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 6be511176..344291103 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -32,6 +32,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -254,7 +255,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py
index 322663d1d..ce3f5f4fd 100644
--- a/benchmarks/swefficiency/run_infer.py
+++ b/benchmarks/swefficiency/run_infer.py
@@ -22,6 +22,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
@@ -308,7 +309,7 @@ def evaluate_instance(
         """
         tools = get_default_tools(enable_browser=False)
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
         )
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 8373d04f0..cea1583c2 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -29,6 +29,7 @@
     create_docker_workspace,
     remote_image_exists,
 )
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -263,7 +264,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
diff --git a/benchmarks/utils/litellm_proxy.py b/benchmarks/utils/litellm_proxy.py
index 966c6e896..9d96082ee 100644
--- a/benchmarks/utils/litellm_proxy.py
+++ b/benchmarks/utils/litellm_proxy.py
@@ -155,3 +155,22 @@ def set_current_virtual_key(key: str | None) -> None:
 def get_current_virtual_key() -> str | None:
     """Return the virtual key for the current worker thread, or None."""
     return getattr(_thread_local, "virtual_key", None)
+
+
+def apply_virtual_key(llm):  # type: ignore[no-untyped-def]
+    """Swap the config's ``api_key`` for this thread's LiteLLM virtual key.
+
+    ``llm`` is returned as-is when ``get_current_virtual_key()`` yields
+    ``None``.  Otherwise the result of ``llm.model_copy(...)`` is returned,
+    with ``api_key`` replaced by the virtual key wrapped in ``SecretStr``;
+    this function never assigns to ``llm`` itself.
+
+    Intended for building a default (non-ACP) ``Agent`` so that its LLM
+    calls carry the per-instance key used for cost tracking.
+    """
+    key = get_current_virtual_key()
+    if key is not None:
+        # Imported lazily: only needed when a virtual key is active.
+        from pydantic import SecretStr
+        return llm.model_copy(update={"api_key": SecretStr(key)})
+    return llm