diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx index 334eec6..1e0cde4 100644 --- a/docs/snippets/multimodal.mdx +++ b/docs/snippets/multimodal.mdx @@ -4,6 +4,8 @@ export const PyBlobApiIngest = "import lancedb\n\ndb = lancedb.connect(db_path_f export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n"; +export const PyBlobApiToPandas = "# Default: blob columns come back lazily\ndf_lazy = tbl.to_pandas()\n\n# Materialize blob bytes eagerly\ndf_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n# Return descriptors instead of payloads\ndf_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n# Forward extra kwargs to PyArrow's to_pandas\ndf_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n"; + export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n"; export const PyDefineSchema = "# Define schema explictly to ensure image_blob is treated as binary\nschema = pa.schema([\n pa.field(\"id\", pa.int32()),\n pa.field(\"filename\", pa.string()),\n pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n pa.field(\"label\", pa.string())\n])\n"; @@ -14,6 
+16,8 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n"; +export const PyQueryToPandasKwargs = "df = (\n tbl.search(query_vector)\n .limit(10)\n .to_pandas(split_blocks=True, self_destruct=True)\n)\n"; + export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n"; export const TsBlobApiIngest = "const blobData = lancedb.makeArrowTable(\n [\n { id: 1, video: Buffer.from(\"fake_video_bytes_1\") },\n { id: 2, video: Buffer.from(\"fake_video_bytes_2\") },\n ],\n { schema: blobSchema },\n);\nconst blobTable = await db.createTable(\"videos\", blobData, {\n mode: \"overwrite\",\n});\n"; diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx index ec32440..95670fc 100644 --- a/docs/tables/multimodal.mdx +++ b/docs/tables/multimodal.mdx @@ -31,6 +31,8 @@ import { PyBlobApiIngest as BlobApiIngest, TsBlobApiIngest as TsBlobApiIngest, RsBlobApiIngest as RsBlobApiIngest, + PyBlobApiToPandas as BlobApiToPandas, + PyQueryToPandasKwargs as QueryToPandasKwargs, } from '/snippets/multimodal.mdx'; LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases. @@ -194,6 +196,34 @@ For more advanced usage, including random access and file-like reading of blobs, Lance format's [blob API documentation](https://lance.org/guide/blob/). +### 3. 
Convert blob tables to pandas + +When you call `to_pandas()` on a local LanceDB table that contains Blob API columns, the `blob_mode` argument controls how those columns materialize. `blob_mode` is available in the Python SDK on local tables; remote tables raise `NotImplementedError`. + +`blob_mode` accepts: + +- `"lazy"` (default): returns blob columns without eagerly materializing their payloads. Use this when you want the existing behavior and don't need to inspect blob metadata. For namespace-managed tables and in-memory datasets, lazy mode falls back to the standard PyArrow `to_pandas()` path. +- `"bytes"`: eagerly materializes each blob as `bytes`. Use this when you need the raw payload in the DataFrame, for example to decode an image or audio clip in-process. +- `"descriptions"`: returns blob descriptors (offsets, sizes, and positions) instead of the data itself. Use this when you want to plan I/O without paying the cost of loading every blob. + +`"bytes"` and `"descriptions"` require a filesystem-backed Lance dataset; they are not supported on in-memory tables or namespace-managed tables. + +Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`: + + + {BlobApiToPandas} + + +Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. They materialize Arrow results first, apply LanceDB-specific `flatten` and `timeout` handling, and then forward the remaining kwargs: + + + {QueryToPandasKwargs} + + + ## Other modalities The `pa.binary()` and `pa.large_binary()` types are universal. 
You can use this same pattern for other types of multimodal data: diff --git a/tests/py/test_multimodal.py b/tests/py/test_multimodal.py index 4069a77..7cf5e8e 100644 --- a/tests/py/test_multimodal.py +++ b/tests/py/test_multimodal.py @@ -120,3 +120,75 @@ def test_blob_api_definition(db_path_factory): tbl = db.create_table("videos", data=data, schema=schema) # --8<-- [end:blob_api_ingest] assert len(tbl) == 2 + + +def test_blob_api_to_pandas(db_path_factory): + db = lancedb.connect(db_path_factory("blob_to_pandas_db")) + schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field( + "video", + pa.large_binary(), + metadata={"lance-encoding:blob": "true"}, + ), + ]) + tbl = db.create_table( + "videos", + data=[ + {"id": 1, "video": b"fake_video_bytes_1"}, + {"id": 2, "video": b"fake_video_bytes_2"}, + ], + schema=schema, + mode="overwrite", + ) + + # --8<-- [start:blob_api_to_pandas] + # Default: blob columns come back lazily + df_lazy = tbl.to_pandas() + + # Materialize blob bytes eagerly + df_bytes = tbl.to_pandas(blob_mode="bytes") + + # Return descriptors instead of payloads + df_desc = tbl.to_pandas(blob_mode="descriptions") + + # Forward extra kwargs to PyArrow's to_pandas + df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True) + # --8<-- [end:blob_api_to_pandas] + + assert len(df_lazy) == 2 + assert isinstance(df_bytes["video"].iloc[0], bytes) + assert df_bytes["video"].tolist() == [ + b"fake_video_bytes_1", + b"fake_video_bytes_2", + ] + assert len(df_desc) == 2 + assert len(df_typed) == 2 + + +def test_query_to_pandas_kwargs(db_path_factory): + db = lancedb.connect(db_path_factory("query_to_pandas_db")) + schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("vector", pa.list_(pa.float32(), 128)), + ]) + tbl = db.create_table( + "search_demo", + data=[ + {"id": i, "vector": np.random.rand(128).astype(np.float32)} + for i in range(10) + ], + schema=schema, + mode="overwrite", + ) + query_vector = np.random.rand(128).astype(np.float32) + + 
# --8<-- [start:query_to_pandas_kwargs] + df = ( + tbl.search(query_vector) + .limit(10) + .to_pandas(split_blocks=True, self_destruct=True) + ) + # --8<-- [end:query_to_pandas_kwargs] + + assert len(df) == 10