From 1f32952d0066a9dc1ff1482cef48c3cbe0acb663 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 10:45:45 +0100 Subject: [PATCH 1/8] fix(ai): redact message parts content of type blob --- sentry_sdk/ai/utils.py | 51 +++++++++++++++++ tests/test_ai_monitoring.py | 106 +++++++++++++++++++++++++++++++++++- 2 files changed, 156 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 1d2b4483c9..73155b0305 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -5,6 +5,8 @@ from sys import getsizeof from typing import TYPE_CHECKING +from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE + if TYPE_CHECKING: from typing import Any, Callable, Dict, List, Optional, Tuple @@ -141,6 +143,53 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) -> return 0 +def redact_blob_message_parts(messages): + # type: (List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int] + """ + Redact blob message parts from the messages, by removing the "content" key. + e.g: + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text" + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "data:image/jpeg;base64,..." + } + ] + } + becomes: + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text" + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "[Filtered]" + } + ] + } + """ + + for message in messages: + content = message.get("content") + if isinstance(content, list): + for item in content: + if item.get("type") == "blob": + item["content"] = SENSITIVE_DATA_SUBSTITUTE + return messages + + def truncate_messages_by_size( messages: "List[Dict[str, Any]]", max_bytes: int = MAX_GEN_AI_MESSAGE_BYTES, @@ -186,6 +235,8 @@ def truncate_and_annotate_messages( if not messages: return None + messages = redact_blob_message_parts(messages) + truncated_messages, removed_count = truncate_messages_by_size(messages, max_bytes) if removed_count > 0: scope._gen_ai_original_message_count[span.span_id] = len(messages) diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py index 8d3d4ba204..e9f3712cd3 100644 --- a/tests/test_ai_monitoring.py +++ b/tests/test_ai_monitoring.py @@ -4,7 +4,7 @@ import pytest import sentry_sdk -from sentry_sdk._types import AnnotatedValue +from sentry_sdk._types import AnnotatedValue, SENSITIVE_DATA_SUBSTITUTE from sentry_sdk.ai.monitoring import ai_track from sentry_sdk.ai.utils import ( MAX_GEN_AI_MESSAGE_BYTES, @@ -13,6 +13,7 @@ truncate_and_annotate_messages, truncate_messages_by_size, _find_truncation_index, + redact_blob_message_parts, ) from sentry_sdk.serializer import serialize from sentry_sdk.utils import safe_serialize @@ -542,3 +543,106 @@ def __init__(self): assert isinstance(messages_value, AnnotatedValue) assert messages_value.metadata["len"] == stored_original_length assert len(messages_value.value) == len(truncated_messages) + + +class TestRedactBlobMessageParts: + def test_redacts_single_blob_content(self): + """Test that blob content is redacted in a message with single blob part""" + messages = [ + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text", + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "", + }, + ], + } + ] + + result = redact_blob_message_parts(messages) + + assert result == messages # Returns the same list + assert ( + messages[0]["content"][0]["text"] + == "How many ponies do you see in the image?" + ) + assert messages[0]["content"][0]["type"] == "text" + assert messages[0]["content"][1]["type"] == "blob" + assert messages[0]["content"][1]["modality"] == "image" + assert messages[0]["content"][1]["mime_type"] == "image/jpeg" + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + + def test_redacts_multiple_blob_parts(self): + """Test that multiple blob parts in a single message are all redacted""" + messages = [ + { + "role": "user", + "content": [ + {"text": "Compare these images", "type": "text"}, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "_image", + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/png", + "content": "_image", + }, + ], + } + ] + + result = redact_blob_message_parts(messages) + + assert result == messages + assert messages[0]["content"][0]["text"] == "Compare these images" + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert messages[0]["content"][2]["content"] == SENSITIVE_DATA_SUBSTITUTE + + def test_redacts_blobs_in_multiple_messages(self): + """Test that blob parts are redacted across multiple messages""" + messages = [ + { + "role": "user", + "content": [ + {"text": "First message", "type": "text"}, + { + "type": "blob", + "modality": "image", + "content": "", + }, + ], + }, + { + "role": "assistant", + "content": "I see the image.", + }, + { + "role": "user", + "content": [ + {"text": "Second message", "type": "text"}, + { + "type": "blob", + "modality": "image", + "content": "", + }, + ], + }, + ] + + result = redact_blob_message_parts(messages) + + assert result == messages + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert messages[1]["content"] == "I see the image." # Unchanged + assert messages[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE From 795bcea241f7777e646a4da14c870a3049bdbe90 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:05:04 +0100 Subject: [PATCH 2/8] fix(ai): skip non dict messages --- sentry_sdk/ai/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 73155b0305..ae507e898b 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -182,6 +182,9 @@ def redact_blob_message_parts(messages): """ for message in messages: + if not isinstance(message, dict): + continue + content = message.get("content") if isinstance(content, list): for item in content: From a623e137d26e982c0d85258256c0ba013f9ecb24 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:21:43 +0100 Subject: [PATCH 3/8] fix(ai): typing --- sentry_sdk/ai/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index ae507e898b..1b61c7a113 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -143,8 +143,9 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) -> return 0 -def redact_blob_message_parts(messages): - # type: (List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int] +def redact_blob_message_parts( + messages: "List[Dict[str, Any]]", +) -> "List[Dict[str, Any]]": """ Redact blob message parts from the messages, by removing the "content" key. e.g: From 3d3ce5bbdca43f14194edbbbee11d3b6dcd6d8a3 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:37:12 +0100 Subject: [PATCH 4/8] fix(ai): content items may not be dicts --- sentry_sdk/ai/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 1b61c7a113..78a64ab737 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -189,7 +189,7 @@ def redact_blob_message_parts( content = message.get("content") if isinstance(content, list): for item in content: - if item.get("type") == "blob": + if isinstance(item, dict) and item.get("type") == "blob": item["content"] = SENSITIVE_DATA_SUBSTITUTE return messages From 767050c1aca635c90dd25cc0ab27faae50db10fa Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Mon, 12 Jan 2026 09:23:10 +0100 Subject: [PATCH 5/8] fix(ai): enhance blob redaction to preserve original messages and optimize performance --- sentry_sdk/ai/utils.py | 32 +++++++- tests/test_ai_monitoring.py | 143 +++++++++++++++++++++++++++++++----- 2 files changed, 155 insertions(+), 20 deletions(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 78a64ab737..d9aad22709 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -147,7 +147,12 @@ def redact_blob_message_parts( messages: "List[Dict[str, Any]]", ) -> "List[Dict[str, Any]]": """ - Redact blob message parts from the messages, by removing the "content" key. + Redact blob message parts from the messages by replacing blob content with "[Filtered]". + + This function creates a deep copy of messages that contain blob content to avoid + mutating the original message dictionaries. Messages without blob content are + returned as-is to minimize copying overhead. + e.g: { "role": "user", @@ -182,7 +187,29 @@ def redact_blob_message_parts( } """ + # First pass: check if any message contains blob content + has_blobs = False for message in messages: + if not isinstance(message, dict): + continue + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") == "blob": + has_blobs = True + break + if has_blobs: + break + + # If no blobs found, return original messages to avoid unnecessary copying + if not has_blobs: + return messages + + # Deep copy messages to avoid mutating the original + messages_copy = deepcopy(messages) + + # Second pass: redact blob content in the copy + for message in messages_copy: if not isinstance(message, dict): continue @@ -191,7 +218,8 @@ def redact_blob_message_parts( for item in content: if isinstance(item, dict) and item.get("type") == "blob": item["content"] = SENSITIVE_DATA_SUBSTITUTE - return messages + + return messages_copy def truncate_messages_by_size( diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py index e9f3712cd3..5bb395659e 100644 --- a/tests/test_ai_monitoring.py +++ b/tests/test_ai_monitoring.py @@ -426,6 +426,49 @@ def __init__(self): assert isinstance(result, list) assert result[0] == large_messages[-len(result)] + def test_preserves_original_messages_with_blobs(self): + """Test that truncate_and_annotate_messages doesn't mutate the original messages""" + + class MockSpan: + def __init__(self): + self.span_id = "test_span_id" + self.data = {} + + def set_data(self, key, value): + self.data[key] = value + + class MockScope: + def __init__(self): + self._gen_ai_original_message_count = {} + + messages = [ + { + "role": "user", + "content": [ + {"text": "What's in this image?", "type": "text"}, + { + "type": "blob", + "modality": "image", + "content": "_content", + }, + ], + } + ] + + original_blob_content = messages[0]["content"][1]["content"] + + span = MockSpan() + scope = MockScope() + + # This should NOT mutate the original messages + result = truncate_and_annotate_messages(messages, span, scope) + + # Verify original is unchanged + assert messages[0]["content"][1]["content"] == original_blob_content + + # Verify result has redacted content + assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + class TestClientAnnotation: def test_client_wraps_truncated_messages_in_annotated_value(self, large_messages): @@ -547,7 +590,7 @@ def __init__(self): class TestRedactBlobMessageParts: def test_redacts_single_blob_content(self): - """Test that blob content is redacted in a message with single blob part""" + """Test that blob content is redacted without mutating original messages""" messages = [ { "role": "user", @@ -566,21 +609,27 @@ def test_redacts_single_blob_content(self): } ] + # Save original blob content for comparison + original_blob_content = messages[0]["content"][1]["content"] + result = redact_blob_message_parts(messages) - assert result == messages # Returns the same list + # Original messages should be UNCHANGED + assert messages[0]["content"][1]["content"] == original_blob_content + + # Result should have redacted content assert ( - messages[0]["content"][0]["text"] + result[0]["content"][0]["text"] == "How many ponies do you see in the image?" ) - assert messages[0]["content"][0]["type"] == "text" - assert messages[0]["content"][1]["type"] == "blob" - assert messages[0]["content"][1]["modality"] == "image" - assert messages[0]["content"][1]["mime_type"] == "image/jpeg" - assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][0]["type"] == "text" + assert result[0]["content"][1]["type"] == "blob" + assert result[0]["content"][1]["modality"] == "image" + assert result[0]["content"][1]["mime_type"] == "image/jpeg" + assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE def test_redacts_multiple_blob_parts(self): - """Test that multiple blob parts in a single message are all redacted""" + """Test that multiple blob parts are redacted without mutation""" messages = [ { "role": "user", @@ -602,15 +651,22 @@ def test_redacts_multiple_blob_parts(self): } ] + original_first = messages[0]["content"][1]["content"] + original_second = messages[0]["content"][2]["content"] + result = redact_blob_message_parts(messages) - assert result == messages - assert messages[0]["content"][0]["text"] == "Compare these images" - assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE - assert messages[0]["content"][2]["content"] == SENSITIVE_DATA_SUBSTITUTE + # Original should be unchanged + assert messages[0]["content"][1]["content"] == original_first + assert messages[0]["content"][2]["content"] == original_second + + # Result should be redacted + assert result[0]["content"][0]["text"] == "Compare these images" + assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][2]["content"] == SENSITIVE_DATA_SUBSTITUTE def test_redacts_blobs_in_multiple_messages(self): - """Test that blob parts are redacted across multiple messages""" + """Test that blob parts are redacted across multiple messages without mutation""" messages = [ { "role": "user", @@ -640,9 +696,60 @@ def test_redacts_blobs_in_multiple_messages(self): }, ] + original_first = messages[0]["content"][1]["content"] + original_second = messages[2]["content"][1]["content"] + + result = redact_blob_message_parts(messages) + + # Original should be unchanged + assert messages[0]["content"][1]["content"] == original_first + assert messages[2]["content"][1]["content"] == original_second + + # Result should be redacted + assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[1]["content"] == "I see the image." # Unchanged + assert result[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + + def test_no_blobs_returns_original_list(self): + """Test that messages without blobs are returned as-is (performance optimization)""" + messages = [ + {"role": "user", "content": "Simple text message"}, + {"role": "assistant", "content": "Simple response"}, + ] + + result = redact_blob_message_parts(messages) + + # Should return the same list object when no blobs present + assert result is messages + + def test_handles_non_dict_messages(self): + """Test that non-dict messages are handled gracefully""" + messages = [ + "string message", + {"role": "user", "content": "text"}, + None, + 123, + ] + + result = redact_blob_message_parts(messages) + + # Should return same list since no blobs + assert result is messages + + def test_handles_non_dict_content_items(self): + """Test that non-dict content items in arrays are handled""" + messages = [ + { + "role": "user", + "content": [ + "string item", + {"text": "text item", "type": "text"}, + None, + ], + } + ] + result = redact_blob_message_parts(messages) - assert result == messages - assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE - assert messages[1]["content"] == "I see the image." # Unchanged - assert messages[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + # Should return same list since no blobs + assert result is messages From 614d4392d02dd763ce9cf10a13feb1ff6b6b8ef0 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Mon, 12 Jan 2026 19:04:12 +0100 Subject: [PATCH 6/8] Update sentry_sdk/ai/utils.py Co-authored-by: Alex Alderman Webb --- sentry_sdk/ai/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index d9aad22709..05ee085254 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -217,7 +217,7 @@ def redact_blob_message_parts( if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get("type") == "blob": - item["content"] = SENSITIVE_DATA_SUBSTITUTE + item["content"] = BLOB_DATA_SUBSTITUTE return messages_copy From c4a4f49f777970ac25cf25f02369a89708088ba2 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 13 Jan 2026 08:57:11 +0100 Subject: [PATCH 7/8] fix: Replace SENSITIVE_DATA_SUBSTITUTE with BLOB_DATA_SUBSTITUTE --- sentry_sdk/ai/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 05ee085254..dd04473b6a 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -5,7 +5,7 @@ from sys import getsizeof from typing import TYPE_CHECKING -from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk._types import BLOB_DATA_SUBSTITUTE if TYPE_CHECKING: from typing import Any, Callable, Dict, List, Optional, Tuple From b9218753220350cf8f177e8c2d3812a20aacad83 Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Tue, 13 Jan 2026 09:32:51 +0100 Subject: [PATCH 8/8] add blob replacement string and update tests --- sentry_sdk/_types.py | 1 + tests/test_ai_monitoring.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sentry_sdk/_types.py b/sentry_sdk/_types.py index 5497a27a3d..c514a80bf9 100644 --- a/sentry_sdk/_types.py +++ b/sentry_sdk/_types.py @@ -6,6 +6,7 @@ SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" +BLOB_DATA_SUBSTITUTE = "[Blob substitute]" class AnnotatedValue: diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py index 5bb395659e..b11ca9710d 100644 --- a/tests/test_ai_monitoring.py +++ b/tests/test_ai_monitoring.py @@ -4,7 +4,11 @@ import pytest import sentry_sdk -from sentry_sdk._types import AnnotatedValue, SENSITIVE_DATA_SUBSTITUTE +from sentry_sdk._types import ( + AnnotatedValue, + SENSITIVE_DATA_SUBSTITUTE, + BLOB_DATA_SUBSTITUTE, +) from sentry_sdk.ai.monitoring import ai_track from sentry_sdk.ai.utils import ( MAX_GEN_AI_MESSAGE_BYTES, @@ -467,7 +471,7 @@ def __init__(self): assert messages[0]["content"][1]["content"] == original_blob_content # Verify result has redacted content - assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE class TestClientAnnotation: @@ -626,7 +630,7 @@ def test_redacts_single_blob_content(self): assert result[0]["content"][1]["type"] == "blob" assert result[0]["content"][1]["modality"] == "image" assert result[0]["content"][1]["mime_type"] == "image/jpeg" - assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE def test_redacts_multiple_blob_parts(self): """Test that multiple blob parts are redacted without mutation""" @@ -662,8 +666,8 @@ def test_redacts_multiple_blob_parts(self): # Result should be redacted assert result[0]["content"][0]["text"] == "Compare these images" - assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE - assert result[0]["content"][2]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE + assert result[0]["content"][2]["content"] == BLOB_DATA_SUBSTITUTE def test_redacts_blobs_in_multiple_messages(self): """Test that blob parts are redacted across multiple messages without mutation""" @@ -706,9 +710,9 @@ def test_redacts_blobs_in_multiple_messages(self): assert messages[2]["content"][1]["content"] == original_second # Result should be redacted - assert result[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE assert result[1]["content"] == "I see the image." # Unchanged - assert result[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE def test_no_blobs_returns_original_list(self): """Test that messages without blobs are returned as-is (performance optimization)"""