From a7319c0698cfd340f9f6d9144521811524f2b784 Mon Sep 17 00:00:00 2001
From: Andrew Yao
Date: Sun, 28 Sep 2025 11:23:20 -0700
Subject: [PATCH] Add generic serialize_value

---
Review note: backend/serialize_value.py was revised during review (null
scalars, string/struct scalar handling, large/fixed-size binary), so the
post-image blob id on its "index" line below is stale.

 backend/app.py             | 19 +++-----
 backend/serialize_value.py | 94 ++++++++++++++++++++++++++++++++++++++
 docker/Dockerfile          |  4 +-
 3 files changed, 102 insertions(+), 15 deletions(-)
 create mode 100644 backend/serialize_value.py

diff --git a/backend/app.py b/backend/app.py
index 1ab306b..5dc6b57 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -12,6 +12,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 
+from serialize_value import serialize_value
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -54,17 +55,8 @@ def get_lance_connection():
 
 def serialize_arrow_value(value):
     try:
-        if pa.types.is_null(value.type):
-            return None
-        elif pa.types.is_boolean(value.type):
-            return value.as_py()
-        elif pa.types.is_integer(value.type) or pa.types.is_floating(value.type):
-            return value.as_py()
-        elif pa.types.is_string(value.type) or pa.types.is_large_string(value.type):
-            return value.as_py()
-        elif pa.types.is_timestamp(value.type):
-            return value.as_py().isoformat() if value.as_py() else None
-        elif pa.types.is_list(value.type) and pa.types.is_floating(value.value_type):
+        # Handle vector columns with special processing
+        if pa.types.is_list(value.type) and pa.types.is_floating(value.value_type):
             try:
                 vec = value.as_py()
                 if vec is None:
@@ -118,8 +110,9 @@ def serialize_arrow_value(value):
             except Exception as vec_error:
                 logger.warning(f"Error processing vector data: {vec_error}")
                 return {"type": "vector", "error": f"Vector processing failed: {str(vec_error)}"}
-        else:
-            return str(value.as_py())
+
+        # Use the general serialize_value utility for all other types
+        return serialize_value(value)
     except Exception as e:
         logger.warning(f"Error serializing value: {e}")
         return {"error": f"Serialization failed: {str(e)}"}
diff --git a/backend/serialize_value.py b/backend/serialize_value.py
new file mode 100644
index 0000000..755c54d
--- /dev/null
+++ b/backend/serialize_value.py
@@ -0,0 +1,94 @@
+import base64
+from datetime import date, datetime, time, timedelta
+
+import numpy as np
+import pyarrow as pa
+
+
+def _serialize_temporal(obj):
+    """Convert temporal types to string representation."""
+    if isinstance(obj, (datetime, date, time)):
+        return obj.isoformat()
+    if isinstance(obj, timedelta):
+        return obj.total_seconds()
+    return str(obj)
+
+
+def _serialize_pyarrow_scalar(obj):
+    """Convert PyArrow scalar types to JSON-serializable format."""
+    # Null scalars of any type become JSON null; as_py() returns None for
+    # them, which the base64/float conversions below cannot accept.
+    if not obj.is_valid:
+        return None
+
+    arrow_type = obj.type
+
+    if (
+        pa.types.is_binary(arrow_type)
+        or pa.types.is_large_binary(arrow_type)
+        or pa.types.is_fixed_size_binary(arrow_type)
+    ):
+        return base64.b64encode(obj.as_py()).decode("utf-8")
+
+    if pa.types.is_temporal(arrow_type):
+        return _serialize_temporal(obj.as_py())
+
+    if pa.types.is_floating(arrow_type):
+        return float(obj.as_py())
+
+    # Lists, maps, structs and all remaining scalars: as_py() yields native
+    # Python values (list, dict, str, int, ...), which serialize_value()
+    # then converts recursively.
+    return serialize_value(obj.as_py())
+
+
+def _serialize_container(obj):
+    """Convert container types (dict, list, tuple) recursively."""
+    if isinstance(obj, dict):
+        return {key: serialize_value(value) for key, value in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [serialize_value(item) for item in obj]
+    return obj
+
+
+def _serialize_basic_types(obj):
+    """Convert basic Python types to JSON-serializable format."""
+    # PyArrow scalars are intentionally not matched here: StringScalar
+    # subclasses BinaryScalar, and scalars are not bytes-like, so they are
+    # dispatched by Arrow type in _serialize_pyarrow_scalar() instead.
+    if isinstance(obj, bytes):
+        return base64.b64encode(obj).decode("utf-8")
+    if isinstance(obj, (datetime, date, time)):
+        return obj.isoformat()
+    if isinstance(obj, timedelta):
+        return obj.total_seconds()
+    if isinstance(obj, np.number):
+        return obj.item()
+    return obj
+
+
+def serialize_value(obj):
+    """
+    Recursively convert objects to JSON-serializable format.
+
+    Handles:
+    - bytes/PyArrow binary: Base64-encoded string
+    - datetime types: ISO format string
+    - PyArrow types: Python native types
+    - nested types: recursive conversion
+    """
+    # First try basic type conversions
+    result = _serialize_basic_types(obj)
+    if result is not obj:
+        return result
+
+    # Then try container types
+    result = _serialize_container(obj)
+    if result is not obj:
+        return result
+
+    # Finally try PyArrow scalar types
+    if isinstance(obj, pa.Scalar):
+        return _serialize_pyarrow_scalar(obj)
+
+    return obj
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ac4afc7..6289a6c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,7 +21,7 @@ COPY --from=builder /root/.local /home/appuser/.local
 
 WORKDIR /app
 
-COPY backend/app.py .
+COPY backend/*.py .
 COPY web/vanilla/ /web/
 
 RUN chown -R appuser:appuser /app /web
@@ -47,4 +47,4 @@ LABEL org.opencontainers.image.version="0.1.0"
 LABEL org.opencontainers.image.licenses="MIT"
 LABEL com.github.lancedb.version="${LANCEDB_VERSION}"
 
-CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]
\ No newline at end of file
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]