OpenHands · simonrosenberg · Apr 2, 2026 · all-hands-bot · Apr 2, 2026 · all-hands-bot
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
@@ -33,6 +33,7 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -393,7 +394,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
@@ -36,6 +36,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from benchmarks.utils.version import IMAGE_TAG_PREFIX
@@ -336,7 +337,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,

diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
@@ -26,6 +26,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -294,7 +295,7 @@ def evaluate_instance(
             )
 
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
             condenser=condenser,

diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
@@ -25,6 +25,7 @@
 from benchmarks.utils.evaluation import Evaluation
 from benchmarks.utils.evaluation_utils import construct_eval_output_dir
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from openhands.sdk import Agent, Conversation, Tool, get_logger
@@ -455,7 +456,7 @@ def evaluate_instance(
             tools.append(Tool(name=DelegateTool.name))
 
         # Create agent
-        agent = Agent(llm=self.metadata.llm, tools=tools)
+        agent = Agent(llm=apply_virtual_key(self.metadata.llm), tools=tools)
 
         # Collect events
         received_events = []

diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
@@ -20,6 +20,7 @@
     setup_acp_workspace,
     workspace_keepalive,
 )
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
 from benchmarks.utils.build_utils import ensure_local_image
 from benchmarks.utils.console_logging import summarize_instance
@@ -268,12 +269,10 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,
-                # TODO: we can enable security analyzer later
-                # security_analyzer=LLMSecurityAnalyzer(),
             )
 
         assert isinstance(workspace, RemoteWorkspace)

diff --git a/benchmarks/swebenchmultilingual/run_infer.py b/benchmarks/swebenchmultilingual/run_infer.py
@@ -26,6 +26,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -250,7 +251,7 @@ def evaluate_instance(
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
             # TODO: we can enable condenser and security analyzer later

diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
@@ -32,6 +32,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -254,7 +255,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,

diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py
@@ -22,6 +22,7 @@
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
 from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
@@ -308,7 +309,7 @@ def evaluate_instance(
         """
         tools = get_default_tools(enable_browser=False)
         agent = Agent(
-            llm=self.metadata.llm,
+            llm=apply_virtual_key(self.metadata.llm),
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
         )

diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
@@ -29,6 +29,7 @@
     create_docker_workspace,
     remote_image_exists,
 )
+from benchmarks.utils.litellm_proxy import apply_virtual_key
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -263,7 +264,7 @@ def evaluate_instance(
                     keep_first=self.metadata.condenser_keep_first,
                 )
             agent = Agent(
-                llm=self.metadata.llm,
+                llm=apply_virtual_key(self.metadata.llm),
                 tools=tools,
                 system_prompt_kwargs={"cli_mode": True},
                 condenser=condenser,

diff --git a/benchmarks/utils/litellm_proxy.py b/benchmarks/utils/litellm_proxy.py
@@ -155,3 +155,22 @@ def set_current_virtual_key(key: str | None) -> None:
 def get_current_virtual_key() -> str | None:
     """Return the virtual key for the current worker thread, or None."""
     return getattr(_thread_local, "virtual_key", None)
+
+
+def apply_virtual_key(llm):  # type: ignore[no-untyped-def]
+    """Return an LLM config copy with the per-instance virtual key as api_key.
+
+    If no virtual key is active for this thread, returns the original config
+    unchanged.  This is thread-safe: ``model_copy`` creates a new instance
+    and ``get_current_virtual_key`` reads from ``threading.local``.
+
+    Use this when creating a default (non-ACP) ``Agent`` so that all LLM
+    calls go through the proxy with the per-instance virtual key, enabling
+    accurate per-instance cost tracking.
+    """
+    virtual_key = get_current_virtual_key()
+    if virtual_key is None:
+        return llm
+    from pydantic import SecretStr
+
+    return llm.model_copy(update={"api_key": SecretStr(virtual_key)})