Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions sentry_sdk/ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from sys import getsizeof
from typing import TYPE_CHECKING

from sentry_sdk._types import BLOB_DATA_SUBSTITUTE

if TYPE_CHECKING:
from typing import Any, Callable, Dict, List, Optional, Tuple

Expand Down Expand Up @@ -141,6 +143,85 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) ->
return 0


def _is_blob_part(part: "Any") -> bool:
    """Return True if *part* is a dict content part whose "type" is "blob"."""
    return isinstance(part, dict) and part.get("type") == "blob"


def _has_blob_part(message: "Any") -> bool:
    """Return True if *message* is a dict whose list "content" holds a blob part."""
    if not isinstance(message, dict):
        return False
    content = message.get("content")
    if not isinstance(content, list):
        return False
    return any(_is_blob_part(part) for part in content)


def redact_blob_message_parts(
    messages: "List[Dict[str, Any]]",
) -> "List[Dict[str, Any]]":
    """
    Redact blob message parts by replacing their content with BLOB_DATA_SUBSTITUTE.

    A part is treated as a blob when it is a dict with ``"type" == "blob"`` that
    sits inside a message dict whose ``"content"`` is a list. Non-dict messages,
    non-list content and non-dict parts are ignored.

    If no message contains a blob part, the input list itself is returned (same
    object) so nothing is copied. Otherwise the *entire* list is deep-copied and
    the blob parts are redacted in the copy; the caller's messages are never
    mutated.

    e.g:
    {
        "role": "user",
        "content": [
            {
                "text": "How many ponies do you see in the image?",
                "type": "text"
            },
            {
                "type": "blob",
                "modality": "image",
                "mime_type": "image/jpeg",
                "content": "data:image/jpeg;base64,..."
            }
        ]
    }
    becomes:
    {
        "role": "user",
        "content": [
            {
                "text": "How many ponies do you see in the image?",
                "type": "text"
            },
            {
                "type": "blob",
                "modality": "image",
                "mime_type": "image/jpeg",
                "content": BLOB_DATA_SUBSTITUTE
            }
        ]
    }

    :param messages: The messages to inspect.
    :returns: ``messages`` itself when it holds no blob parts, otherwise a
        redacted deep copy.
    """
    # Fast path: stop scanning at the first blob; return the input untouched
    # when there is nothing to redact.
    if not any(_has_blob_part(message) for message in messages):
        return messages

    # Work on a deep copy so the caller's message dictionaries stay intact.
    messages_copy = deepcopy(messages)

    for message in messages_copy:
        if not _has_blob_part(message):
            continue
        for part in message["content"]:
            if _is_blob_part(part):
                # NOTE(review): reviewers flagged that BLOB_DATA_SUBSTITUTE must
                # actually be defined in sentry_sdk._types (otherwise this
                # module fails to import) and that the tests still assert
                # SENSITIVE_DATA_SUBSTITUTE - confirm both are resolved.
                part["content"] = BLOB_DATA_SUBSTITUTE

    return messages_copy


def truncate_messages_by_size(
messages: "List[Dict[str, Any]]",
max_bytes: int = MAX_GEN_AI_MESSAGE_BYTES,
Expand Down Expand Up @@ -186,6 +267,8 @@ def truncate_and_annotate_messages(
if not messages:
return None

messages = redact_blob_message_parts(messages)

truncated_messages, removed_count = truncate_messages_by_size(messages, max_bytes)
if removed_count > 0:
scope._gen_ai_original_message_count[span.span_id] = len(messages)
Expand Down
213 changes: 212 additions & 1 deletion tests/test_ai_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

import sentry_sdk
from sentry_sdk._types import AnnotatedValue
from sentry_sdk._types import AnnotatedValue, SENSITIVE_DATA_SUBSTITUTE
from sentry_sdk.ai.monitoring import ai_track
from sentry_sdk.ai.utils import (
MAX_GEN_AI_MESSAGE_BYTES,
Expand All @@ -13,6 +13,7 @@
truncate_and_annotate_messages,
truncate_messages_by_size,
_find_truncation_index,
redact_blob_message_parts,
)
from sentry_sdk.serializer import serialize
from sentry_sdk.utils import safe_serialize
Expand Down Expand Up @@ -425,6 +426,49 @@ def __init__(self):
assert isinstance(result, list)
assert result[0] == large_messages[-len(result)]

def test_preserves_original_messages_with_blobs(self):
    """truncate_and_annotate_messages redacts blobs without mutating its input."""
    # Imported locally: the code under test writes BLOB_DATA_SUBSTITUTE (not
    # SENSITIVE_DATA_SUBSTITUTE) into redacted blob parts, so that is the
    # constant the result has to be compared against.
    from sentry_sdk._types import BLOB_DATA_SUBSTITUTE

    class MockSpan:
        """Minimal span stand-in: a span id plus recorded data."""

        def __init__(self):
            self.span_id = "test_span_id"
            self.data = {}

        def set_data(self, key, value):
            self.data[key] = value

    class MockScope:
        """Minimal scope stand-in holding the per-span original message counts."""

        def __init__(self):
            self._gen_ai_original_message_count = {}

    messages = [
        {
            "role": "user",
            "content": [
                {"text": "What's in this image?", "type": "text"},
                {
                    "type": "blob",
                    "modality": "image",
                    "content": "data:image/jpeg;base64,original_content",
                },
            ],
        }
    ]

    # Remember the blob payload so mutation of the input can be detected.
    original_blob_content = messages[0]["content"][1]["content"]

    span = MockSpan()
    scope = MockScope()

    # This should NOT mutate the original messages
    result = truncate_and_annotate_messages(messages, span, scope)

    # Verify original is unchanged
    assert messages[0]["content"][1]["content"] == original_blob_content

    # Verify result has redacted content
    assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE


class TestClientAnnotation:
def test_client_wraps_truncated_messages_in_annotated_value(self, large_messages):
Expand Down Expand Up @@ -542,3 +586,170 @@ def __init__(self):
assert isinstance(messages_value, AnnotatedValue)
assert messages_value.metadata["len"] == stored_original_length
assert len(messages_value.value) == len(truncated_messages)


class TestRedactBlobMessageParts:
    """Tests for redact_blob_message_parts.

    The redaction tests compare against BLOB_DATA_SUBSTITUTE, the constant the
    function writes into blob parts; it is imported inside each test that needs it.
    """

    def test_redacts_single_blob_content(self):
        """Test that blob content is redacted without mutating original messages"""
        from sentry_sdk._types import BLOB_DATA_SUBSTITUTE

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "text": "How many ponies do you see in the image?",
                        "type": "text",
                    },
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/jpeg",
                        "content": "data:image/jpeg;base64,/9j/4AAQSkZJRg==",
                    },
                ],
            }
        ]

        # Save original blob content for comparison
        original_blob_content = messages[0]["content"][1]["content"]

        result = redact_blob_message_parts(messages)

        # Original messages should be UNCHANGED
        assert messages[0]["content"][1]["content"] == original_blob_content

        # Non-blob parts and blob metadata survive; only the payload is replaced
        text_part, blob_part = result[0]["content"]
        assert text_part["text"] == "How many ponies do you see in the image?"
        assert text_part["type"] == "text"
        assert blob_part["type"] == "blob"
        assert blob_part["modality"] == "image"
        assert blob_part["mime_type"] == "image/jpeg"
        assert blob_part["content"] == BLOB_DATA_SUBSTITUTE

    def test_redacts_multiple_blob_parts(self):
        """Test that multiple blob parts are redacted without mutation"""
        from sentry_sdk._types import BLOB_DATA_SUBSTITUTE

        messages = [
            {
                "role": "user",
                "content": [
                    {"text": "Compare these images", "type": "text"},
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/jpeg",
                        "content": "data:image/jpeg;base64,first_image",
                    },
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/png",
                        "content": "data:image/png;base64,second_image",
                    },
                ],
            }
        ]

        original_first = messages[0]["content"][1]["content"]
        original_second = messages[0]["content"][2]["content"]

        result = redact_blob_message_parts(messages)

        # Original should be unchanged
        assert messages[0]["content"][1]["content"] == original_first
        assert messages[0]["content"][2]["content"] == original_second

        # Result should be redacted
        assert result[0]["content"][0]["text"] == "Compare these images"
        assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
        assert result[0]["content"][2]["content"] == BLOB_DATA_SUBSTITUTE

    def test_redacts_blobs_in_multiple_messages(self):
        """Test that blob parts are redacted across multiple messages without mutation"""
        from sentry_sdk._types import BLOB_DATA_SUBSTITUTE

        messages = [
            {
                "role": "user",
                "content": [
                    {"text": "First message", "type": "text"},
                    {
                        "type": "blob",
                        "modality": "image",
                        "content": "data:image/jpeg;base64,first",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": "I see the image.",
            },
            {
                "role": "user",
                "content": [
                    {"text": "Second message", "type": "text"},
                    {
                        "type": "blob",
                        "modality": "image",
                        "content": "data:image/jpeg;base64,second",
                    },
                ],
            },
        ]

        original_first = messages[0]["content"][1]["content"]
        original_second = messages[2]["content"][1]["content"]

        result = redact_blob_message_parts(messages)

        # Original should be unchanged
        assert messages[0]["content"][1]["content"] == original_first
        assert messages[2]["content"][1]["content"] == original_second

        # Result should be redacted
        assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
        assert result[1]["content"] == "I see the image."  # Unchanged
        assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE

    def test_no_blobs_returns_original_list(self):
        """Test that messages without blobs are returned as-is (performance optimization)"""
        messages = [
            {"role": "user", "content": "Simple text message"},
            {"role": "assistant", "content": "Simple response"},
        ]

        result = redact_blob_message_parts(messages)

        # Should return the same list object when no blobs present
        assert result is messages

    def test_handles_non_dict_messages(self):
        """Test that non-dict messages are handled gracefully"""
        messages = [
            "string message",
            {"role": "user", "content": "text"},
            None,
            123,
        ]

        result = redact_blob_message_parts(messages)

        # Should return same list since no blobs
        assert result is messages

    def test_handles_non_dict_content_items(self):
        """Test that non-dict content items in arrays are handled"""
        messages = [
            {
                "role": "user",
                "content": [
                    "string item",
                    {"text": "text item", "type": "text"},
                    None,
                ],
            }
        ]

        result = redact_blob_message_parts(messages)

        # Should return same list since no blobs
        assert result is messages
Loading