lancedb · mintlify · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx
@@ -4,6 +4,8 @@ export const PyBlobApiIngest = "import lancedb\n\ndb = lancedb.connect(db_path_f
 
 export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n    pa.field(\"id\", pa.int64()),\n    pa.field(\n        \"video\", \n        pa.large_binary(), \n        metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n    ),\n])\n";
 
+export const PyBlobApiToPandas = "# Default: blob columns come back lazily\ndf_lazy = tbl.to_pandas()\n\n# Materialize blob bytes eagerly\ndf_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n# Return descriptors instead of payloads\ndf_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n# Forward extra kwargs to PyArrow's to_pandas\ndf_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n";
+
 export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n    img = Image.new('RGB', (100, 100), color=color)\n    buf = io.BytesIO()\n    img.save(buf, format='PNG')\n    return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n    {\n        \"id\": 1,\n        \"filename\": \"red_square.png\",\n        \"vector\": np.random.rand(128).astype(np.float32),\n        \"image_blob\": create_dummy_image('red'),\n        \"label\": \"red\"\n    },\n    {\n        \"id\": 2,\n        \"filename\": \"blue_square.png\",\n        \"vector\": np.random.rand(128).astype(np.float32),\n        \"image_blob\": create_dummy_image('blue'),\n        \"label\": \"blue\"\n    }\n]\n";
 
 export const PyDefineSchema = "# Define schema explictly to ensure image_blob is treated as binary\nschema = pa.schema([\n    pa.field(\"id\", pa.int32()),\n    pa.field(\"filename\", pa.string()),\n    pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n    pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n    pa.field(\"label\", pa.string())\n])\n";
@@ -14,6 +16,8 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport
 
 export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n    image_bytes = row['image_blob']\n    image = Image.open(io.BytesIO(image_bytes))\n    print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n    # You can now use 'image' with other libraries or display it\n";
 
+export const PyQueryToPandasKwargs = "df = (\n    tbl.search(query_vector)\n    .limit(10)\n    .to_pandas(split_blocks=True, self_destruct=True)\n)\n";
+
 export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";
 
 export const TsBlobApiIngest = "const blobData = lancedb.makeArrowTable(\n  [\n    { id: 1, video: Buffer.from(\"fake_video_bytes_1\") },\n    { id: 2, video: Buffer.from(\"fake_video_bytes_2\") },\n  ],\n  { schema: blobSchema },\n);\nconst blobTable = await db.createTable(\"videos\", blobData, {\n  mode: \"overwrite\",\n});\n";

diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
@@ -31,6 +31,8 @@ import {
     PyBlobApiIngest as BlobApiIngest,
     TsBlobApiIngest as TsBlobApiIngest,
     RsBlobApiIngest as RsBlobApiIngest,
+    PyBlobApiToPandas as BlobApiToPandas,
+    PyQueryToPandasKwargs as QueryToPandasKwargs,
 } from '/snippets/multimodal.mdx';
 
 LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.
@@ -194,6 +196,34 @@ For more advanced usage, including random access and file-like reading of blobs,
 Lance format's [blob API documentation](https://lance.org/guide/blob/).
 </Card>
 
+### 3. Convert blob tables to pandas
+
+When you call `to_pandas()` on a local LanceDB table that contains Blob API columns, the `blob_mode` argument controls how those columns materialize. This is available in the Python SDK on local tables; remote tables raise `NotImplementedError`.
+
+`blob_mode` accepts:
+
+- `"lazy"` (default): returns blob columns without eagerly materializing their payloads. Use this when you want the existing behavior and don't need to inspect blob metadata. For namespace-managed tables and in-memory datasets, lazy mode falls back to the standard PyArrow `to_pandas()` path.
+- `"bytes"`: eagerly materializes each blob as `bytes`. Use this when you need the raw payload in the DataFrame, for example to decode an image or audio clip in-process.
+- `"descriptions"`: returns blob descriptors (offsets, sizes, and positions) instead of the data itself. Use this when you want to plan I/O without paying the cost of loading every blob.
+
+`"bytes"` and `"descriptions"` require a filesystem-backed Lance dataset; they are not supported on in-memory tables or namespace-managed tables.
+
+Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`:
+
+<CodeGroup>
+    <CodeBlock filename="Python" language="Python" icon="python">
+    {BlobApiToPandas}
+    </CodeBlock>
+</CodeGroup>
+
+Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. They materialize Arrow results first, apply LanceDB-specific `flatten` and `timeout` handling, and then forward the remaining kwargs:
+
+<CodeGroup>
+    <CodeBlock filename="Python" language="Python" icon="python">
+    {QueryToPandasKwargs}
+    </CodeBlock>
+</CodeGroup>
+
 ## Other modalities
 
 The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:

diff --git a/tests/py/test_multimodal.py b/tests/py/test_multimodal.py
@@ -120,3 +120,75 @@ def test_blob_api_definition(db_path_factory):
     tbl = db.create_table("videos", data=data, schema=schema)
     # --8<-- [end:blob_api_ingest]
     assert len(tbl) == 2
+
+
+def test_blob_api_to_pandas(db_path_factory):  # docs-snippet test: to_pandas() on a table with a blob column
+    db = lancedb.connect(db_path_factory("blob_to_pandas_db"))
+    schema = pa.schema([
+        pa.field("id", pa.int64()),
+        pa.field(
+            "video",
+            pa.large_binary(),
+            metadata={"lance-encoding:blob": "true"},  # field metadata that enables the Blob API for "video"
+        ),
+    ])
+    tbl = db.create_table(
+        "videos",
+        data=[
+            {"id": 1, "video": b"fake_video_bytes_1"},
+            {"id": 2, "video": b"fake_video_bytes_2"},
+        ],
+        schema=schema,
+        mode="overwrite",
+    )
+
+    # --8<-- [start:blob_api_to_pandas]
+    # Default: blob columns come back lazily
+    df_lazy = tbl.to_pandas()
+
+    # Materialize blob bytes eagerly
+    df_bytes = tbl.to_pandas(blob_mode="bytes")
+
+    # Return descriptors instead of payloads
+    df_desc = tbl.to_pandas(blob_mode="descriptions")
+
+    # Forward extra kwargs to PyArrow's to_pandas
+    df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
+    # --8<-- [end:blob_api_to_pandas]
+
+    assert len(df_lazy) == 2  # default call: one row per inserted record
+    assert isinstance(df_bytes["video"].iloc[0], bytes)  # "bytes" mode yields Python bytes values
+    assert df_bytes["video"].tolist() == [  # payloads equal the inserted values, in insertion order
+        b"fake_video_bytes_1",
+        b"fake_video_bytes_2",
+    ]
+    assert len(df_desc) == 2  # "descriptions" mode: only the row count is checked here
+    assert len(df_typed) == 2  # passing the extra kwargs still returns both rows
+
+
+def test_query_to_pandas_kwargs(db_path_factory):  # docs-snippet test: extra kwargs on a search query's to_pandas()
+    db = lancedb.connect(db_path_factory("query_to_pandas_db"))
+    schema = pa.schema([
+        pa.field("id", pa.int64()),
+        pa.field("vector", pa.list_(pa.float32(), 128)),  # fixed-size list of 128 float32 values
+    ])
+    tbl = db.create_table(
+        "search_demo",
+        data=[
+            {"id": i, "vector": np.random.rand(128).astype(np.float32)}
+            for i in range(10)  # ten rows with ids 0..9
+        ],
+        schema=schema,
+        mode="overwrite",
+    )
+    query_vector = np.random.rand(128).astype(np.float32)  # random query with the same length as the stored vectors
+
+    # --8<-- [start:query_to_pandas_kwargs]
+    df = (
+        tbl.search(query_vector)
+        .limit(10)
+        .to_pandas(split_blocks=True, self_destruct=True)
+    )
+    # --8<-- [end:query_to_pandas_kwargs]
+
+    assert len(df) == 10  # limit(10) equals the table's row count, so every row is expected back