From 6917d797684fbf7b14affb0d599a80ee8f7d5abd Mon Sep 17 00:00:00 2001 From: Gordon Murray Date: Fri, 20 Feb 2026 15:32:35 +0000 Subject: [PATCH] fix: decode binary fields as UTF-8 text when possible Binary and large_binary PyArrow columns that contain valid UTF-8 are now displayed as readable text instead of base64. Falls back to base64 for actual binary data. Also handles newer PyArrow versions where .as_py() returns str directly. --- backend/serialize_value.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/backend/serialize_value.py b/backend/serialize_value.py index 755c54d..a54e485 100644 --- a/backend/serialize_value.py +++ b/backend/serialize_value.py @@ -16,8 +16,16 @@ def _serialize_temporal(obj): def _serialize_pyarrow_scalar(obj): """Convert PyArrow scalar types to JSON-serializable format.""" - if pa.types.is_binary(obj.type): - return base64.b64encode(obj.as_py()).decode("utf-8") + if pa.types.is_binary(obj.type) or pa.types.is_large_binary(obj.type): + raw = obj.as_py() + if raw is None: + return None + if isinstance(raw, str): + return raw + try: + return raw.decode("utf-8") + except UnicodeDecodeError: + return base64.b64encode(raw).decode("utf-8") if pa.types.is_temporal(obj.type): return _serialize_temporal(obj.as_py()) @@ -48,8 +56,21 @@ def _serialize_container(obj): def _serialize_basic_types(obj): """Convert basic Python types to JSON-serializable format.""" - if isinstance(obj, (bytes, pa.BinaryScalar)): - return base64.b64encode(obj).decode("utf-8") + if isinstance(obj, bytes): + try: + return obj.decode("utf-8") + except UnicodeDecodeError: + return base64.b64encode(obj).decode("utf-8") + if isinstance(obj, pa.BinaryScalar): + raw = obj.as_py() + if raw is None: + return None + if isinstance(raw, str): + return raw + try: + return raw.decode("utf-8") + except UnicodeDecodeError: + return base64.b64encode(raw).decode("utf-8") if isinstance(obj, (datetime, date, time)): return obj.isoformat() if isinstance(obj, timedelta):