fix(pydantic_ai): strip token metrics from wrapper spans

colinbennettbrain · colinbennettbrain · commit 6a6cb76ac7cb · 2026-04-16T22:20:48.000Z
#312 retyped wrapper spans (agent_run, model_request, streaming wrappers) from LLM to TASK, but they kept logging the same prompt_tokens/completion_tokens/tokens as their nested leaf `chat <model>` span. The server derives estimated_cost per-span from tokens+metadata.model (brainstore estimated_cost.rs), rolls up trace totals with `coalesce_add` over every non-scorer span regardless of type (summary.rs accumulate_metrics), and sums experiment-level token/cost over all non-scorer spans without filtering on span_type='llm' (summary.ts experimentScanSpanSummary). Retyping to TASK therefore did not stop double-counting on any of those three axes. Route every wrapper log site through a new `_wrapper_span_metrics` helper that emits only {start, end, duration, optional time_to_first_token}; the stream-events wrapper still adds its own `event_count` on top of that. Leaf `chat <model>` spans (from _wrap_concrete_model_class and _DirectStreamWrapper when span_type=LLM) keep full _extract_response_metrics. `_DirectStreamWrapper` now branches on span_type since it serves as both leaf and wrapper. Delete now-dead `_extract_usage_metrics` and `_extract_stream_usage_metrics`. Flip existing cassette-backed assertions (test_agent_run_async, test_agent_run_sync, test_agent_run_stream, test_agent_with_tools, test_agent_run_stream_sync) to assert that token metrics are absent from wrapper spans: test_agent_run_async checks prompt_tokens / completion_tokens / tokens / prompt_cached_tokens, the other four check prompt_tokens and completion_tokens. Every test except test_agent_run_stream_sync additionally asserts that the leaf chat span carries positive prompt_tokens and completion_tokens. No cassette re-recording needed -- the change is purely in post-processing.
diff --git a/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py b/py/src/braintrust/integrations/pydantic_ai/test_pydantic_ai_integration.py
@@ -140,23 +140,21 @@ async def test_agent_run_async(memory_logger):
     assert chat_span["metadata"]["provider"] == "openai"
     _assert_metrics_are_valid(chat_span["metrics"], start, end)
 
-    # Agent spans should have token metrics
-    assert "prompt_tokens" in agent_span["metrics"]
-    assert "completion_tokens" in agent_span["metrics"]
-    assert agent_span["metrics"]["prompt_tokens"] > 0
-    assert agent_span["metrics"]["completion_tokens"] > 0
-
-    # Regression: no double-counting of cost/tokens. Experiment-level aggregations
-    # sum metrics across type='llm' spans, so a single agent turn must contribute
-    # its tokens exactly once. The wrapper agent_run span logs the same usage as
-    # the leaf chat span; only the leaf should be type=LLM.
+    # Regression: wrapper agent_run span must NOT log token metrics. The leaf chat
+    # span already logs them, and trace-tree rollup (self + descendants) plus any
+    # unfiltered sum over metrics would otherwise double-count tokens/cost at the
+    # parent regardless of span type.
+    for token_key in ("prompt_tokens", "completion_tokens", "tokens", "prompt_cached_tokens"):
+        assert token_key not in agent_span["metrics"], (
+            f"wrapper span must not log {token_key}; it duplicates the leaf chat span"
+        )
+
+    # Only the leaf chat span should be type=LLM, and it must own the token totals.
     llm_spans = [s for s in spans if s["span_attributes"]["type"] == SpanTypeAttribute.LLM]
     assert len(llm_spans) == 1, f"expected exactly one LLM-typed span, got {len(llm_spans)}"
     assert llm_spans[0]["span_id"] == chat_span["span_id"]
-    llm_prompt_tokens_sum = sum(s["metrics"].get("prompt_tokens", 0) for s in llm_spans)
-    llm_completion_tokens_sum = sum(s["metrics"].get("completion_tokens", 0) for s in llm_spans)
-    assert llm_prompt_tokens_sum == chat_span["metrics"]["prompt_tokens"]
-    assert llm_completion_tokens_sum == chat_span["metrics"]["completion_tokens"]
+    assert chat_span["metrics"]["prompt_tokens"] > 0
+    assert chat_span["metrics"]["completion_tokens"] > 0
 
 
 @pytest.mark.vcr
@@ -245,9 +243,12 @@ def is_descendant(child_span, ancestor_id):
     assert chat_span["metadata"]["provider"] == "openai"
     _assert_metrics_are_valid(chat_span["metrics"], start, end)
 
-    # Agent spans should have token metrics
-    assert "prompt_tokens" in agent_sync_span["metrics"]
-    assert "completion_tokens" in agent_sync_span["metrics"]
+    # Wrapper agent_run_sync span must not log token metrics (would double-count at rollup).
+    assert "prompt_tokens" not in agent_sync_span["metrics"]
+    assert "completion_tokens" not in agent_sync_span["metrics"]
+    # Tokens live on the leaf chat span only.
+    assert chat_span["metrics"]["prompt_tokens"] > 0
+    assert chat_span["metrics"]["completion_tokens"] > 0
 
 
 def test_agent_to_cli_sync(memory_logger, monkeypatch):
@@ -544,9 +545,12 @@ async def test_agent_run_stream(memory_logger):
     print(f"span_parents: {chat_span['span_parents']}")
     print(f"metrics: {chat_span['metrics']}")
 
-    # Agent spans should have token metrics
-    assert "prompt_tokens" in agent_span["metrics"]
-    assert "completion_tokens" in agent_span["metrics"]
+    # Wrapper stream span must not log token metrics (would double-count at rollup).
+    # time_to_first_token is asserted above; it's a non-summable timing metric and stays.
+    assert "prompt_tokens" not in agent_span["metrics"]
+    assert "completion_tokens" not in agent_span["metrics"]
+    assert chat_span["metrics"]["prompt_tokens"] > 0
+    assert chat_span["metrics"]["completion_tokens"] > 0
 
 
 @pytest.mark.vcr
@@ -842,9 +846,11 @@ def is_descendant(child_span, ancestor_id):
     assert chat_span["metadata"]["provider"] == "openai"
     _assert_metrics_are_valid(chat_span["metrics"], start, end)
 
-    # Agent spans should have token metrics
-    assert "prompt_tokens" in agent_span["metrics"]
-    assert "completion_tokens" in agent_span["metrics"]
+    # Wrapper agent_run span must not log token metrics (would double-count at rollup).
+    assert "prompt_tokens" not in agent_span["metrics"]
+    assert "completion_tokens" not in agent_span["metrics"]
+    assert chat_span["metrics"]["prompt_tokens"] > 0
+    assert chat_span["metrics"]["completion_tokens"] > 0
 
 
 @pytest.mark.vcr
@@ -1143,9 +1149,9 @@ def is_descendant(child_span, ancestor_id):
     # Chat span may not have complete metrics since it's an intermediate span
     assert "start" in chat_span["metrics"]
 
-    # Agent spans should have token metrics
-    assert "prompt_tokens" in agent_span["metrics"]
-    assert "completion_tokens" in agent_span["metrics"]
+    # Wrapper agent_run_stream_sync span must not log token metrics.
+    assert "prompt_tokens" not in agent_span["metrics"]
+    assert "completion_tokens" not in agent_span["metrics"]
 
 
 @pytest.mark.vcr
diff --git a/py/src/braintrust/integrations/pydantic_ai/tracing.py b/py/src/braintrust/integrations/pydantic_ai/tracing.py
@@ -83,9 +83,7 @@ async def _agent_run_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any
             _maybe_create_tool_spans_from_messages(result)
 
             output = _serialize_result_output(result)
-            metrics = _extract_usage_metrics(result, start_time, end_time)
-
-            agent_span.log(output=output, metrics=metrics)
+            agent_span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
         finally:
             _reset_tool_trace_capture(tool_trace_token)
@@ -109,9 +107,7 @@ def _agent_run_sync_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any)
             _maybe_create_tool_spans_from_messages(result)
 
             output = _serialize_result_output(result)
-            metrics = _extract_usage_metrics(result, start_time, end_time)
-
-            agent_span.log(output=output, metrics=metrics)
+            agent_span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
         finally:
             _reset_tool_trace_capture(tool_trace_token)
@@ -131,7 +127,7 @@ def _agent_to_cli_sync_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: A
         start_time = time.time()
         result = wrapped(*args, **kwargs)
         end_time = time.time()
-        agent_span.log(metrics={"start": start_time, "end": end_time, "duration": end_time - start_time})
+        agent_span.log(metrics=_wrapper_span_metrics(start_time, end_time))
         return result
 
 
@@ -211,17 +207,13 @@ async def _agent_run_stream_events_wrapper(wrapped: Any, instance: Any, args: An
                 _maybe_create_tool_spans_from_messages(final_result)
 
             output = None
-            metrics = {
-                "start": start_time,
-                "end": end_time,
-                "duration": end_time - start_time,
+            metrics: dict[str, float] = {
+                **_wrapper_span_metrics(start_time, end_time),
                 "event_count": event_count,
             }
 
             if final_result:
                 output = _serialize_result_output(final_result)
-                usage_metrics = _extract_usage_metrics(final_result, start_time, end_time)
-                metrics.update(usage_metrics)
 
             agent_span.log(output=output, metrics=metrics)
         finally:
@@ -245,9 +237,7 @@ async def wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any):
             end_time = time.time()
 
             output = _serialize_model_response(result)
-            metrics = _extract_response_metrics(result, start_time, end_time)
-
-            span.log(output=output, metrics=metrics)
+            span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
 
     return wrapper
@@ -270,9 +260,7 @@ def wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any):
             end_time = time.time()
 
             output = _serialize_model_response(result)
-            metrics = _extract_response_metrics(result, start_time, end_time)
-
-            span.log(output=output, metrics=metrics)
+            span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
 
     return wrapper
@@ -326,9 +314,7 @@ async def wrapper(*args, **kwargs):
             end_time = time.time()
 
             output = _serialize_model_response(result)
-            metrics = _extract_response_metrics(result, start_time, end_time)
-
-            span.log(output=output, metrics=metrics)
+            span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
 
     return wrapper
@@ -349,9 +335,7 @@ def wrapper(*args, **kwargs):
             end_time = time.time()
 
             output = _serialize_model_response(result)
-            metrics = _extract_response_metrics(result, start_time, end_time)
-
-            span.log(output=output, metrics=metrics)
+            span.log(output=output, metrics=_wrapper_span_metrics(start_time, end_time))
             return result
 
     return wrapper
@@ -492,10 +476,10 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
                 _maybe_create_tool_spans_from_messages(self.stream_result)
 
                 output = _serialize_stream_output(self.stream_result)
-                metrics = _extract_stream_usage_metrics(
-                    self.stream_result, self.start_time, end_time, self._first_token_time
+                self.span_cm.log(
+                    output=output,
+                    metrics=_wrapper_span_metrics(self.start_time, end_time, self._first_token_time),
                 )
-                self.span_cm.log(output=output, metrics=metrics)
 
             # Clean up span context
             if self.span_cm:
@@ -593,9 +577,12 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
                 try:
                     final_response = self.stream.get()
                     output = _serialize_model_response(final_response)
-                    metrics = _extract_response_metrics(
-                        final_response, self.start_time, end_time, self._first_token_time
-                    )
+                    if self.span_type == SpanTypeAttribute.LLM:
+                        metrics = _extract_response_metrics(
+                            final_response, self.start_time, end_time, self._first_token_time
+                        )
+                    else:
+                        metrics = _wrapper_span_metrics(self.start_time, end_time, self._first_token_time)
                     self.span_cm.log(output=output, metrics=metrics)
                 except Exception as e:
                     logger.debug(f"Failed to extract stream output/metrics: {e}")
@@ -700,10 +687,10 @@ def _finalize(self):
                 _maybe_create_tool_spans_from_messages(self._stream_result)
 
                 output = _serialize_stream_output(self._stream_result)
-                metrics = _extract_stream_usage_metrics(
-                    self._stream_result, self._start_time, end_time, self._first_token_time
+                self._span.log(
+                    output=output,
+                    metrics=_wrapper_span_metrics(self._start_time, end_time, self._first_token_time),
                 )
-                self._span.log(output=output, metrics=metrics)
                 self._logged = True
             finally:
                 try:
@@ -761,10 +748,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
                 try:
                     final_response = self.stream.get()
                     output = _serialize_model_response(final_response)
-                    metrics = _extract_response_metrics(
-                        final_response, self.start_time, end_time, self._first_token_time
+                    self.span_cm.log(
+                        output=output,
+                        metrics=_wrapper_span_metrics(self.start_time, end_time, self._first_token_time),
                     )
-                    self.span_cm.log(output=output, metrics=metrics)
                 except Exception as e:
                     logger.debug(f"Failed to extract stream output/metrics: {e}")
 
@@ -1155,105 +1142,20 @@ def _parse_model_string(model: Any) -> tuple[str | None, str | None]:
     return model_str, None
 
 
-def _extract_usage_metrics(result: Any, start_time: float, end_time: float) -> dict[str, float] | None:
-    """Extract usage metrics from agent run result."""
-    metrics: dict[str, float] = {}
-
-    metrics["start"] = start_time
-    metrics["end"] = end_time
-    metrics["duration"] = end_time - start_time
-
-    usage = None
-    if hasattr(result, "response"):
-        try:
-            response = result.response
-            if hasattr(response, "usage"):
-                usage = response.usage
-        except (AttributeError, ValueError):
-            pass
-
-    if usage is None and hasattr(result, "usage"):
-        usage = result.usage
-
-    if usage is None:
-        return metrics
-
-    if hasattr(usage, "input_tokens"):
-        input_tokens = usage.input_tokens
-        if input_tokens is not None:
-            metrics["prompt_tokens"] = float(input_tokens)
-
-    if hasattr(usage, "output_tokens"):
-        output_tokens = usage.output_tokens
-        if output_tokens is not None:
-            metrics["completion_tokens"] = float(output_tokens)
-
-    if hasattr(usage, "total_tokens"):
-        total_tokens = usage.total_tokens
-        if total_tokens is not None:
-            metrics["tokens"] = float(total_tokens)
-
-    if hasattr(usage, "cache_read_tokens") and usage.cache_read_tokens is not None:
-        metrics["prompt_cached_tokens"] = float(usage.cache_read_tokens)
-
-    if hasattr(usage, "cache_write_tokens") and usage.cache_write_tokens is not None:
-        metrics["prompt_cache_creation_tokens"] = float(usage.cache_write_tokens)
-
-    if hasattr(usage, "input_audio_tokens") and usage.input_audio_tokens is not None:
-        metrics["prompt_audio_tokens"] = float(usage.input_audio_tokens)
-
-    if hasattr(usage, "output_audio_tokens") and usage.output_audio_tokens is not None:
-        metrics["completion_audio_tokens"] = float(usage.output_audio_tokens)
-
-    if hasattr(usage, "details") and isinstance(usage.details, dict):
-        details = usage.details
-
-        if "reasoning_tokens" in details:
-            metrics["completion_reasoning_tokens"] = float(details["reasoning_tokens"])
-
-        if "cached_tokens" in details:
-            metrics["prompt_cached_tokens"] = float(details["cached_tokens"])
-
-    return metrics if metrics else None
-
-
-def _extract_stream_usage_metrics(
-    stream_result: Any, start_time: float, end_time: float, first_token_time: float | None
-) -> dict[str, float] | None:
-    """Extract usage metrics from stream result."""
-    metrics: dict[str, float] = {}
-
-    metrics["start"] = start_time
-    metrics["end"] = end_time
-    metrics["duration"] = end_time - start_time
-
-    if first_token_time:
+def _wrapper_span_metrics(
+    start_time: float, end_time: float, first_token_time: float | None = None
+) -> dict[str, float]:
+    # Wrapper spans (agent_run, model_request, streaming wrappers) must NOT log token or
+    # cost metrics. The leaf `chat <model>` span already logs them, and trace-tree rollup
+    # (self + descendants) would then double-count tokens/cost at every wrapper ancestor.
+    metrics: dict[str, float] = {
+        "start": start_time,
+        "end": end_time,
+        "duration": end_time - start_time,
+    }
+    if first_token_time is not None:
         metrics["time_to_first_token"] = first_token_time - start_time
-
-    if hasattr(stream_result, "usage"):
-        usage_func = stream_result.usage
-        if callable(usage_func):
-            usage = usage_func()
-        else:
-            usage = usage_func
-
-        if usage:
-            if hasattr(usage, "input_tokens") and usage.input_tokens is not None:
-                metrics["prompt_tokens"] = float(usage.input_tokens)
-
-            if hasattr(usage, "output_tokens") and usage.output_tokens is not None:
-                metrics["completion_tokens"] = float(usage.output_tokens)
-
-            if hasattr(usage, "total_tokens") and usage.total_tokens is not None:
-                metrics["tokens"] = float(usage.total_tokens)
-
-            if hasattr(usage, "cache_read_tokens") and usage.cache_read_tokens is not None:
-                metrics["prompt_cached_tokens"] = float(usage.cache_read_tokens)
-
-            if hasattr(usage, "cache_write_tokens") and usage.cache_write_tokens is not None:
-                metrics["prompt_cache_creation_tokens"] = float(usage.cache_write_tokens)
-
-    return metrics if metrics else None
+    return metrics
 
 
 def _extract_response_metrics(