From c71baeff2ae45eff18dbe26af5e7ecf5d2528e24 Mon Sep 17 00:00:00 2001 From: Saraj Manes Date: Tue, 28 Apr 2026 15:20:05 -0400 Subject: [PATCH 1/4] fix(eap): flatten nested attributes_array JSON for dotted keys ClickHouse can emit nested objects for dotted keys under toJSONString(attributes_array). Flatten that shape back to dotted attribute names before decoding tagged array elements. All parsing stays in process_arrays (common.py); bad JSON or structure raises BadSnubaRPCRequestException. fixes EXP-915 --- snuba/web/rpc/common/common.py | 86 +++++++++++++++++-- tests/web/rpc/test_common.py | 33 +++++++ .../v1/test_endpoint_trace_item_details.py | 80 +++++++++++++++++ .../test_endpoint_trace_item_table.py | 19 ++++ 4 files changed, 211 insertions(+), 7 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 8292768b27..0ba5a80b37 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -78,7 +78,12 @@ def _trace_item_filter_key_expression( BUCKET_COUNT = 40 -def transform_array_value(value: dict[str, str]) -> Any: +def transform_array_value(value: Any) -> Any: + """Decode one tagged JSON object from an attributes_array element (e.g. ``{\"String\": \"x\"}``).""" + if not isinstance(value, dict): + raise BadSnubaRPCRequestException( + f"array element must be an object with a String/Int/Double/Bool tag, got {type(value).__name__}" + ) for t, v in value.items(): if t == "Int": return int(v) @@ -87,15 +92,82 @@ def transform_array_value(value: dict[str, str]) -> Any: if t == "Bool": return str(v).lower() == "true" if t == "String": - return v - raise BadSnubaRPCRequestException(f"array value type unknown: {type(v)}") + return str(v) + raise BadSnubaRPCRequestException( + f"array value has no recognized tag, keys={list(value.keys())}" + ) + + +def _flatten_attributes_array_json(node: dict[str, Any], prefix: str = "") -> dict[str, list[Any]]: + """ + Normalize ClickHouse ``attributes_array`` JSON to a flat map ``name -> list``. + + Ingestion stores dotted keys as a single path (e.g. ``resource.process.command_args``). After + ``toJSONString`` on the JSON column, ClickHouse may emit nested objects instead of a single + top-level key; we flatten those back to dotted names so each value remains an array of tagged + objects. + """ + out: dict[str, list[Any]] = {} + for k, v in node.items(): + full_key = f"{prefix}.{k}" if prefix else k + if isinstance(v, list): + if full_key in out: + raise BadSnubaRPCRequestException( + f"duplicate attributes_array key after normalization: {full_key!r}" + ) + out[full_key] = v + elif isinstance(v, dict): + nested = _flatten_attributes_array_json(v, full_key) + for nk, nv in nested.items(): + if nk in out: + raise BadSnubaRPCRequestException( + f"duplicate attributes_array key after normalization: {nk!r}" + ) + out[nk] = nv + else: + raise BadSnubaRPCRequestException( + f"attributes_array value at {full_key!r} must be a list or object, got {type(v).__name__}" + ) + return out def process_arrays(raw: str) -> dict[str, list[Any]]: - parsed = json.loads(raw) or {} - arrays = {} - for key, values in parsed.items(): - arrays[key] = [transform_array_value(v) for v in values] + """ + Parse ``toJSONString(attributes_array)`` into ``attribute_name -> list`` of Python scalars. + """ + if raw is None or (isinstance(raw, str) and raw.strip() == ""): + parsed: Any = {} + else: + try: + parsed = json.loads(raw) + except json.JSONDecodeError as e: + raise BadSnubaRPCRequestException(f"attributes_array is not valid JSON: {e}") from e + + if parsed is None: + parsed = {} + if not isinstance(parsed, dict): + raise BadSnubaRPCRequestException( + f"attributes_array JSON must be an object, got {type(parsed).__name__}" + ) + + collected = _flatten_attributes_array_json(parsed) + arrays: dict[str, list[Any]] = {} + for key, values in collected.items(): + if not isinstance(values, list): + raise BadSnubaRPCRequestException( + f"attributes_array key {key!r} must map to a list, got {type(values).__name__}" + ) + parsed_elements: list[Any] = [] + for i, elem in enumerate(values): + try: + parsed_elements.append(transform_array_value(elem)) + except BadSnubaRPCRequestException: + raise + except Exception as e: + raise BadSnubaRPCRequestException( + f"invalid attributes_array element at {key!r}[{i}]: {e}" + ) from e + arrays[key] = parsed_elements return arrays diff --git a/tests/web/rpc/test_common.py b/tests/web/rpc/test_common.py index fe018e7756..ae18b6a5be 100644 --- a/tests/web/rpc/test_common.py +++ b/tests/web/rpc/test_common.py @@ -37,6 +37,7 @@ attribute_key_to_expression, next_monday, prev_monday, + process_arrays, trace_item_filters_to_expression, use_sampling_factor, ) @@ -51,6 +52,38 @@ from tests.web.rpc.v1.test_utils import gen_item_message +class TestProcessArrays: + def test_flat_json(self) -> None: + raw = '{"tags":[{"String":"a"},{"String":"b"}]}' + assert process_arrays(raw) == {"tags": ["a", "b"]} + + def test_nested_json_from_dotted_key_roundtrip(self) -> None: + """Simulates ClickHouse toJSONString when dotted keys become nested objects.""" + nested = '{"resource":{"process":{"command_args":[{"String":"node"},{"String":"--enable-source-maps"}]}}}' + assert process_arrays(nested) == { + "resource.process.command_args": ["node", "--enable-source-maps"], + } + + def test_mixed_flat_and_nested(self) -> None: + raw = '{"tags":[{"String":"x"}],"resource":{"process":{"command_args":[{"Int":"1"}]}}}' + assert process_arrays(raw) == { + "tags": ["x"], + "resource.process.command_args": [1], + } + + def test_invalid_json_raises(self) -> None: + with pytest.raises(BadSnubaRPCRequestException, match="not valid JSON"): + process_arrays("not json") + + def test_non_object_root_raises(self) -> None: + with pytest.raises(BadSnubaRPCRequestException, match="must be an object"): + process_arrays("[1]") + + def test_scalar_leaf_raises(self) -> None: + with pytest.raises(BadSnubaRPCRequestException, match="must be a list or object"): + process_arrays('{"a":{"b":"bad"}}') + + class TestCommon: def test_timestamp_rounding(self) -> None: start = datetime(2025, 3, 10) diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_details.py b/tests/web/rpc/v1/test_endpoint_trace_item_details.py index ec4d0acc5b..a3258add64 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_details.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_details.py @@ -332,6 +332,86 @@ def test_endpoint_returns_array_attribute(self, eap: None, redis_db: None) -> No assert cols_attr.value.WhichOneof("value") == "val_array" assert [e.val_int for e in cols_attr.value.val_array.values] == [1, 3] + def test_dotted_key_array_attribute_parsed_properly(self, eap: None, redis_db: None) -> None: + trace_id = uuid.uuid4().hex + span_ts = BASE_TIME - timedelta(minutes=1) + storage = get_storage(StorageKey("eap_items")) + write_raw_unprocessed_events( + storage, # type: ignore + [ + gen_item_message( + span_ts, + trace_id=trace_id, + remove_default_attributes=True, + attributes={ + "resource.process.command_args": _str_tags_array( + "node", "--enable-source-maps" + ), + }, + ), + ], + ) + start = Timestamp() + end = Timestamp() + start.FromDatetime(BASE_TIME - timedelta(hours=4)) + end.GetCurrentTime() + + spans = ( + EndpointTraceItemTable() + .execute( + TraceItemTableRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=start, + end_timestamp=end, + request_id=_REQUEST_ID, + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + columns=[ + Column( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.item_id") + ), + Column( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.trace_id") + ), + ], + ) + ) + .column_values + ) + # Do not use results[0]: other rows may sort ahead of our span; find this trace's row. + idx = next( + i + for i, tr in enumerate(spans[1].results) + if tr.val_str.replace("-", "").lower() == trace_id.replace("-", "").lower() + ) + item_id = spans[0].results[idx].val_str + trace_id_for_details = spans[1].results[idx].val_str + + res = EndpointTraceItemDetails().execute( + TraceItemDetailsRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=start, + end_timestamp=end, + request_id=_REQUEST_ID, + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_SPAN, + ), + item_id=item_id, + trace_id=trace_id_for_details, + ) + ) + attr = next((a for a in res.attributes if a.name == "resource.process.command_args"), None) + assert attr is not None + assert attr.value.WhichOneof("value") == "val_array" + assert [e.val_str for e in attr.value.val_array.values] == ["node", "--enable-source-maps"] + def test_endpoint_on_spans(self, setup_spans_in_db: Any) -> None: end = Timestamp() start = Timestamp() diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py index 9fbc620ade..db37dda848 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_table/test_endpoint_trace_item_table.py @@ -3949,6 +3949,14 @@ def test_select_array_column_returns_val_array(self) -> None: "cols": AnyValue( array_value=ArrayValue(values=[AnyValue(int_value=v) for v in [1, 3]]) ), + "resource.process.command_args": AnyValue( + array_value=ArrayValue( + values=[ + AnyValue(string_value="node"), + AnyValue(string_value="--enable-source-maps"), + ] + ) + ), }, ), ], @@ -3967,6 +3975,11 @@ def test_select_array_column_returns_val_array(self) -> None: Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.item_id")), Column(key=AttributeKey(type=AttributeKey.TYPE_ARRAY, name="tags")), Column(key=AttributeKey(type=AttributeKey.TYPE_ARRAY, name="cols")), + Column( + key=AttributeKey( + type=AttributeKey.TYPE_ARRAY, name="resource.process.command_args" + ) + ), ], ) response = EndpointTraceItemTable().execute(message) @@ -3978,6 +3991,12 @@ def test_select_array_column_returns_val_array(self) -> None: ] assert by_name["cols"].results[0].WhichOneof("value") == "val_array" assert [e.val_int for e in by_name["cols"].results[0].val_array.values] == [1, 3] + assert ( + by_name["resource.process.command_args"].results[0].WhichOneof("value") == "val_array" + ) + assert [ + e.val_str for e in by_name["resource.process.command_args"].results[0].val_array.values + ] == ["node", "--enable-source-maps"] class TestUtils: From 01474850465bbd5e89bf76e8d4b7aeb5ec2882fd Mon Sep 17 00:00:00 2001 From: Saraj Manes Date: Tue, 28 Apr 2026 15:25:08 -0400 Subject: [PATCH 2/4] strings .. --- snuba/web/rpc/common/common.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 0ba5a80b37..fdef83b6e7 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -79,7 +79,7 @@ def _trace_item_filter_key_expression( def transform_array_value(value: Any) -> Any: - """Decode one tagged JSON object from an attributes_array element (e.g. ``{\"String\": \"x\"}``).""" + """Decode one array element: a small JSON object with a String, Int, Double, or Bool tag.""" if not isinstance(value, dict): raise BadSnubaRPCRequestException( f"array element must be an object with a String/Int/Double/Bool tag, got {type(value).__name__}" @@ -99,14 +99,7 @@ def transform_array_value(value: Any) -> Any: def _flatten_attributes_array_json(node: dict[str, Any], prefix: str = "") -> dict[str, list[Any]]: - """ - Normalize ClickHouse ``attributes_array`` JSON to a flat map ``name -> list``. - - Ingestion stores dotted keys as a single path (e.g. ``resource.process.command_args``). After - ``toJSONString`` on the JSON column, ClickHouse may emit nested objects instead of a single - top-level key; we flatten those back to dotted names so each value remains an array of tagged - objects. - """ + """Flatten nested dicts into dotted keys (name -> list). ClickHouse often nests dotted paths.""" out: dict[str, list[Any]] = {} for k, v in node.items(): full_key = f"{prefix}.{k}" if prefix else k @@ -132,9 +125,7 @@ def _flatten_attributes_array_json(node: dict[str, Any], prefix: str = "") -> di def process_arrays(raw: str) -> dict[str, list[Any]]: - """ - Parse ``toJSONString(attributes_array)`` into ``attribute_name -> list`` of Python scalars. - """ + """Parse attributes_array JSON into attribute name -> list of decoded values.""" if raw is None or (isinstance(raw, str) and raw.strip() == ""): parsed: Any = {} else: From c8d50c08443df0bc9c125dbd32b342921a1d5d7a Mon Sep 17 00:00:00 2001 From: Saraj Manes Date: Tue, 28 Apr 2026 15:28:15 -0400 Subject: [PATCH 3/4] More linter issues --- snuba/web/rpc/common/common.py | 2 +- tests/web/rpc/v1/test_endpoint_trace_item_details.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index fdef83b6e7..4814047710 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -79,7 +79,7 @@ def _trace_item_filter_key_expression( def transform_array_value(value: Any) -> Any: - """Decode one array element: a small JSON object with a String, Int, Double, or Bool tag.""" + """Decode one array element: with a String, Int, Double, or Bool tag.""" if not isinstance(value, dict): raise BadSnubaRPCRequestException( f"array element must be an object with a String/Int/Double/Bool tag, got {type(value).__name__}" diff --git a/tests/web/rpc/v1/test_endpoint_trace_item_details.py b/tests/web/rpc/v1/test_endpoint_trace_item_details.py index a3258add64..2cc8a0922d 100644 --- a/tests/web/rpc/v1/test_endpoint_trace_item_details.py +++ b/tests/web/rpc/v1/test_endpoint_trace_item_details.py @@ -382,7 +382,6 @@ def test_dotted_key_array_attribute_parsed_properly(self, eap: None, redis_db: N ) .column_values ) - # Do not use results[0]: other rows may sort ahead of our span; find this trace's row. idx = next( i for i, tr in enumerate(spans[1].results) From aac1b6d67ca7e9bedd69e01ca94ecdde103fd51e Mon Sep 17 00:00:00 2001 From: Saraj Manes Date: Wed, 29 Apr 2026 09:17:38 -0400 Subject: [PATCH 4/4] Update the description and documentation --- snuba/web/rpc/common/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/snuba/web/rpc/common/common.py b/snuba/web/rpc/common/common.py index 4814047710..37583c051c 100644 --- a/snuba/web/rpc/common/common.py +++ b/snuba/web/rpc/common/common.py @@ -99,7 +99,10 @@ def transform_array_value(value: Any) -> Any: def _flatten_attributes_array_json(node: dict[str, Any], prefix: str = "") -> dict[str, list[Any]]: - """Flatten nested dicts into dotted keys (name -> list). ClickHouse often nests dotted paths.""" + """Flatten nested dicts into dotted keys (name -> list). + ClickHouse stores all JSON objects as flat and interprets dotted paths in a special way to support nesting + Docs: https://clickhouse.com/docs/sql-reference/data-types/newjson#handling-json-keys-with-dots + """ out: dict[str, list[Any]] = {} for k, v in node.items(): full_key = f"{prefix}.{k}" if prefix else k