From aa05976cce0b4ac23ae59c69db9fa7823367963a Mon Sep 17 00:00:00 2001
From: "mintlify[bot]" <109931778+mintlify[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 11:10:07 +0000
Subject: [PATCH 1/6] docs: document to_pandas blob_mode and pandas kwargs
---
docs/tables/multimodal.mdx | 40 ++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
index ec32440..5375d02 100644
--- a/docs/tables/multimodal.mdx
+++ b/docs/tables/multimodal.mdx
@@ -194,6 +194,46 @@ For more advanced usage, including random access and file-like reading of blobs,
Lance format's [blob API documentation](https://lance.org/guide/blob/).
+### 3. Convert blob tables to pandas
+
+When you call `to_pandas()` on a local LanceDB table that contains Blob API columns, the `blob_mode` argument controls how those columns materialize. This is available in the Python SDK on local tables; remote tables raise `NotImplementedError`.
+
+`blob_mode` accepts:
+
+- `"lazy"` (default): returns blob columns as standard PyArrow binary values. Use this when you want existing behavior and don't need to inspect blob metadata.
+- `"bytes"`: eagerly materializes each blob as `bytes`. Use this when you need the raw payload in the DataFrame, for example to decode an image or audio clip in-process.
+- `"descriptions"`: returns blob descriptors (offsets, sizes, and positions) instead of the data itself. Use this when you want to plan I/O without paying the cost of loading every blob.
+
+Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`:
+
+```python
+import lancedb
+
+tbl = lancedb.connect("data/sample-lancedb").open_table("videos")
+
+# Default: blob columns come back as PyArrow-style binary
+df = tbl.to_pandas()
+
+# Materialize blob bytes eagerly
+df_bytes = tbl.to_pandas(blob_mode="bytes")
+
+# Return descriptors instead of payloads
+df_desc = tbl.to_pandas(blob_mode="descriptions")
+
+# Forward extra kwargs to PyArrow's to_pandas
+df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
+```
+
+Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. They materialize Arrow results first, apply LanceDB-specific `flatten` and `timeout` handling, and then forward the remaining kwargs:
+
+```python
+df = (
+ tbl.search(query_vector)
+ .limit(10)
+ .to_pandas(split_blocks=True, self_destruct=True)
+)
+```
+
## Other modalities
The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:
From 8982a778b7301bf61ad9f6dd6b58840217fcebde Mon Sep 17 00:00:00 2001
From: "mintlify[bot]" <109931778+mintlify[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 12:07:15 +0000
Subject: [PATCH 2/6] docs: clarify to_pandas lazy fallback for namespace and
memory tables
---
docs/tables/multimodal.mdx | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
index 5375d02..993d3bb 100644
--- a/docs/tables/multimodal.mdx
+++ b/docs/tables/multimodal.mdx
@@ -200,10 +200,12 @@ When you call `to_pandas()` on a local LanceDB table that contains Blob API colu
`blob_mode` accepts:
-- `"lazy"` (default): returns blob columns as standard PyArrow binary values. Use this when you want existing behavior and don't need to inspect blob metadata.
+- `"lazy"` (default): returns blob columns without eagerly materializing their payloads. Use this when you want to keep the behavior `to_pandas()` had before `blob_mode` was added and don't need to inspect blob metadata. For namespace-managed tables and in-memory datasets, lazy mode falls back to the standard PyArrow `to_pandas()` path.
- `"bytes"`: eagerly materializes each blob as `bytes`. Use this when you need the raw payload in the DataFrame, for example to decode an image or audio clip in-process.
- `"descriptions"`: returns blob descriptors (offsets, sizes, and positions) instead of the data itself. Use this when you want to plan I/O without paying the cost of loading every blob.
+`"bytes"` and `"descriptions"` require a filesystem-backed Lance dataset; they are not supported on in-memory tables or namespace-managed tables.
+
Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`:
```python
From 29414c47d4c98b2fffbd7e163657c34ed60670bd Mon Sep 17 00:00:00 2001
From: "mintlify[bot]" <109931778+mintlify[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 14:32:06 +0000
Subject: [PATCH 3/6] docs: convert blob-to-pandas snippets to testable
CodeGroup blocks
---
docs/tables/multimodal.mdx | 66 +++++++++++++++++++++++++++++++-------
1 file changed, 55 insertions(+), 11 deletions(-)
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
index 993d3bb..75461bd 100644
--- a/docs/tables/multimodal.mdx
+++ b/docs/tables/multimodal.mdx
@@ -208,33 +208,77 @@ When you call `to_pandas()` on a local LanceDB table that contains Blob API colu
Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`:
-```python
-import lancedb
-
-tbl = lancedb.connect("data/sample-lancedb").open_table("videos")
+
+
+ {`import lancedb
+import pyarrow as pa
+
+# Create a filesystem-backed blob table
+db = lancedb.connect("data/sample-lancedb")
+schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field(
+ "video",
+ pa.large_binary(),
+ metadata={"lance-encoding:blob": "true"},
+ ),
+])
+tbl = db.create_table(
+ "videos",
+ data=[
+ {"id": 1, "video": b"fake_video_bytes_1"},
+ {"id": 2, "video": b"fake_video_bytes_2"},
+ ],
+ schema=schema,
+ mode="overwrite",
+)
-# Default: blob columns come back as PyArrow-style binary
-df = tbl.to_pandas()
+# Default: blob columns come back lazily
+df_lazy = tbl.to_pandas()
# Materialize blob bytes eagerly
df_bytes = tbl.to_pandas(blob_mode="bytes")
+assert isinstance(df_bytes["video"].iloc[0], bytes)
# Return descriptors instead of payloads
df_desc = tbl.to_pandas(blob_mode="descriptions")
# Forward extra kwargs to PyArrow's to_pandas
-df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
-```
+df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)`}
+
+
Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. They materialize Arrow results first, apply LanceDB-specific `flatten` and `timeout` handling, and then forward the remaining kwargs:
-```python
+
+
+ {`import lancedb
+import numpy as np
+import pyarrow as pa
+
+db = lancedb.connect("data/sample-lancedb")
+schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field("vector", pa.list_(pa.float32(), 128)),
+])
+tbl = db.create_table(
+ "search_demo",
+ data=[
+ {"id": i, "vector": np.random.rand(128).astype(np.float32)}
+ for i in range(10)
+ ],
+ schema=schema,
+ mode="overwrite",
+)
+
+query_vector = np.random.rand(128).astype(np.float32)
df = (
tbl.search(query_vector)
.limit(10)
.to_pandas(split_blocks=True, self_destruct=True)
-)
-```
+)`}
+
+
## Other modalities
From 5e48807d4b891180aee743a9a2feaf8ad024eaa0 Mon Sep 17 00:00:00 2001
From: "mintlify[bot]" <109931778+mintlify[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 14:33:01 +0000
Subject: [PATCH 4/6] docs: convert blob-to-pandas snippets to testable
CodeGroup blocks
---
docs/tables/multimodal.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
index 75461bd..5e91e7b 100644
--- a/docs/tables/multimodal.mdx
+++ b/docs/tables/multimodal.mdx
@@ -213,7 +213,7 @@ Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas c
{`import lancedb
import pyarrow as pa
-# Create a filesystem-backed blob table
+# Create a filesystem-backed blob table.
db = lancedb.connect("data/sample-lancedb")
schema = pa.schema([
pa.field("id", pa.int64()),
From a03c756f47a42e905f9ebf4a0b161ee6a2ddb0d2 Mon Sep 17 00:00:00 2001
From: prrao87 <35005448+prrao87@users.noreply.github.com>
Date: Tue, 19 May 2026 10:38:34 -0400
Subject: [PATCH 5/6] Update multimodal pandas blob examples
---
docs/snippets/multimodal.mdx | 4 ++
docs/tables/multimodal.mdx | 64 ++------------------------
tests/py/test_multimodal.py | 88 ++++++++++++++++++++++++++++++++++++
3 files changed, 96 insertions(+), 60 deletions(-)
diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx
index 334eec6..af57c06 100644
--- a/docs/snippets/multimodal.mdx
+++ b/docs/snippets/multimodal.mdx
@@ -4,6 +4,8 @@ export const PyBlobApiIngest = "import lancedb\n\ndb = lancedb.connect(db_path_f
export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";
+export const PyBlobApiToPandas = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pyarrow as pa\n\nwith tempfile.TemporaryDirectory() as tmpdir:\n # Create a filesystem-backed blob table.\n db = lancedb.connect(str(Path(tmpdir) / \"sample-lancedb\"))\n schema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\",\n pa.large_binary(),\n metadata={\"lance-encoding:blob\": \"true\"},\n ),\n ])\n tbl = db.create_table(\n \"videos\",\n data=[\n {\"id\": 1, \"video\": b\"fake_video_bytes_1\"},\n {\"id\": 2, \"video\": b\"fake_video_bytes_2\"},\n ],\n schema=schema,\n mode=\"overwrite\",\n )\n\n # Default: blob columns come back lazily\n df_lazy = tbl.to_pandas()\n\n # Materialize blob bytes eagerly\n df_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n # Return descriptors instead of payloads\n df_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n # Forward extra kwargs to PyArrow's to_pandas\n df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n";
+
export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";
export const PyDefineSchema = "# Define schema explictly to ensure image_blob is treated as binary\nschema = pa.schema([\n pa.field(\"id\", pa.int32()),\n pa.field(\"filename\", pa.string()),\n pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n pa.field(\"label\", pa.string())\n])\n";
@@ -14,6 +16,8 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport
export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";
+export const PyQueryToPandasKwargs = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport numpy as np\nimport pyarrow as pa\n\nwith tempfile.TemporaryDirectory() as tmpdir:\n db = lancedb.connect(str(Path(tmpdir) / \"sample-lancedb\"))\n schema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n ])\n tbl = db.create_table(\n \"search_demo\",\n data=[\n {\"id\": i, \"vector\": np.random.rand(128).astype(np.float32)}\n for i in range(10)\n ],\n schema=schema,\n mode=\"overwrite\",\n )\n\n query_vector = np.random.rand(128).astype(np.float32)\n df = (\n tbl.search(query_vector)\n .limit(10)\n .to_pandas(split_blocks=True, self_destruct=True)\n )\n";
+
export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";
export const TsBlobApiIngest = "const blobData = lancedb.makeArrowTable(\n [\n { id: 1, video: Buffer.from(\"fake_video_bytes_1\") },\n { id: 2, video: Buffer.from(\"fake_video_bytes_2\") },\n ],\n { schema: blobSchema },\n);\nconst blobTable = await db.createTable(\"videos\", blobData, {\n mode: \"overwrite\",\n});\n";
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx
index 5e91e7b..95670fc 100644
--- a/docs/tables/multimodal.mdx
+++ b/docs/tables/multimodal.mdx
@@ -31,6 +31,8 @@ import {
PyBlobApiIngest as BlobApiIngest,
TsBlobApiIngest as TsBlobApiIngest,
RsBlobApiIngest as RsBlobApiIngest,
+ PyBlobApiToPandas as BlobApiToPandas,
+ PyQueryToPandasKwargs as QueryToPandasKwargs,
} from '/snippets/multimodal.mdx';
LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.
@@ -210,41 +212,7 @@ Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas c
- {`import lancedb
-import pyarrow as pa
-
-# Create a filesystem-backed blob table.
-db = lancedb.connect("data/sample-lancedb")
-schema = pa.schema([
- pa.field("id", pa.int64()),
- pa.field(
- "video",
- pa.large_binary(),
- metadata={"lance-encoding:blob": "true"},
- ),
-])
-tbl = db.create_table(
- "videos",
- data=[
- {"id": 1, "video": b"fake_video_bytes_1"},
- {"id": 2, "video": b"fake_video_bytes_2"},
- ],
- schema=schema,
- mode="overwrite",
-)
-
-# Default: blob columns come back lazily
-df_lazy = tbl.to_pandas()
-
-# Materialize blob bytes eagerly
-df_bytes = tbl.to_pandas(blob_mode="bytes")
-assert isinstance(df_bytes["video"].iloc[0], bytes)
-
-# Return descriptors instead of payloads
-df_desc = tbl.to_pandas(blob_mode="descriptions")
-
-# Forward extra kwargs to PyArrow's to_pandas
-df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)`}
+ {BlobApiToPandas}
@@ -252,31 +220,7 @@ Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`.
- {`import lancedb
-import numpy as np
-import pyarrow as pa
-
-db = lancedb.connect("data/sample-lancedb")
-schema = pa.schema([
- pa.field("id", pa.int64()),
- pa.field("vector", pa.list_(pa.float32(), 128)),
-])
-tbl = db.create_table(
- "search_demo",
- data=[
- {"id": i, "vector": np.random.rand(128).astype(np.float32)}
- for i in range(10)
- ],
- schema=schema,
- mode="overwrite",
-)
-
-query_vector = np.random.rand(128).astype(np.float32)
-df = (
- tbl.search(query_vector)
- .limit(10)
- .to_pandas(split_blocks=True, self_destruct=True)
-)`}
+ {QueryToPandasKwargs}
diff --git a/tests/py/test_multimodal.py b/tests/py/test_multimodal.py
index 4069a77..9719a3e 100644
--- a/tests/py/test_multimodal.py
+++ b/tests/py/test_multimodal.py
@@ -120,3 +120,91 @@ def test_blob_api_definition(db_path_factory):
tbl = db.create_table("videos", data=data, schema=schema)
# --8<-- [end:blob_api_ingest]
assert len(tbl) == 2
+
+
+def test_blob_api_to_pandas():
+ # --8<-- [start:blob_api_to_pandas]
+ import tempfile
+ from pathlib import Path
+
+ import lancedb
+ import pyarrow as pa
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # Create a filesystem-backed blob table.
+ db = lancedb.connect(str(Path(tmpdir) / "sample-lancedb"))
+ schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field(
+ "video",
+ pa.large_binary(),
+ metadata={"lance-encoding:blob": "true"},
+ ),
+ ])
+ tbl = db.create_table(
+ "videos",
+ data=[
+ {"id": 1, "video": b"fake_video_bytes_1"},
+ {"id": 2, "video": b"fake_video_bytes_2"},
+ ],
+ schema=schema,
+ mode="overwrite",
+ )
+
+ # Default: blob columns come back lazily
+ df_lazy = tbl.to_pandas()
+
+ # Materialize blob bytes eagerly
+ df_bytes = tbl.to_pandas(blob_mode="bytes")
+
+ # Return descriptors instead of payloads
+ df_desc = tbl.to_pandas(blob_mode="descriptions")
+
+ # Forward extra kwargs to PyArrow's to_pandas
+ df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
+ # --8<-- [end:blob_api_to_pandas]
+
+ assert len(df_lazy) == 2
+ assert isinstance(df_bytes["video"].iloc[0], bytes)
+ assert df_bytes["video"].tolist() == [
+ b"fake_video_bytes_1",
+ b"fake_video_bytes_2",
+ ]
+ assert len(df_desc) == 2
+ assert len(df_typed) == 2
+
+
+def test_query_to_pandas_kwargs():
+ # --8<-- [start:query_to_pandas_kwargs]
+ import tempfile
+ from pathlib import Path
+
+ import lancedb
+ import numpy as np
+ import pyarrow as pa
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ db = lancedb.connect(str(Path(tmpdir) / "sample-lancedb"))
+ schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field("vector", pa.list_(pa.float32(), 128)),
+ ])
+ tbl = db.create_table(
+ "search_demo",
+ data=[
+ {"id": i, "vector": np.random.rand(128).astype(np.float32)}
+ for i in range(10)
+ ],
+ schema=schema,
+ mode="overwrite",
+ )
+
+ query_vector = np.random.rand(128).astype(np.float32)
+ df = (
+ tbl.search(query_vector)
+ .limit(10)
+ .to_pandas(split_blocks=True, self_destruct=True)
+ )
+ # --8<-- [end:query_to_pandas_kwargs]
+
+ assert len(df) == 10
From f58fa0c851a6ac977df0c0b1846fc51ce9f01315 Mon Sep 17 00:00:00 2001
From: prrao87 <35005448+prrao87@users.noreply.github.com>
Date: Tue, 19 May 2026 10:42:33 -0400
Subject: [PATCH 6/6] Fix snippets
---
docs/snippets/multimodal.mdx | 4 +-
tests/py/test_multimodal.py | 118 +++++++++++++++--------------------
2 files changed, 53 insertions(+), 69 deletions(-)
diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx
index af57c06..1e0cde4 100644
--- a/docs/snippets/multimodal.mdx
+++ b/docs/snippets/multimodal.mdx
@@ -4,7 +4,7 @@ export const PyBlobApiIngest = "import lancedb\n\ndb = lancedb.connect(db_path_f
export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";
-export const PyBlobApiToPandas = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport pyarrow as pa\n\nwith tempfile.TemporaryDirectory() as tmpdir:\n # Create a filesystem-backed blob table.\n db = lancedb.connect(str(Path(tmpdir) / \"sample-lancedb\"))\n schema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\",\n pa.large_binary(),\n metadata={\"lance-encoding:blob\": \"true\"},\n ),\n ])\n tbl = db.create_table(\n \"videos\",\n data=[\n {\"id\": 1, \"video\": b\"fake_video_bytes_1\"},\n {\"id\": 2, \"video\": b\"fake_video_bytes_2\"},\n ],\n schema=schema,\n mode=\"overwrite\",\n )\n\n # Default: blob columns come back lazily\n df_lazy = tbl.to_pandas()\n\n # Materialize blob bytes eagerly\n df_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n # Return descriptors instead of payloads\n df_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n # Forward extra kwargs to PyArrow's to_pandas\n df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n";
+export const PyBlobApiToPandas = "# Default: blob columns come back lazily\ndf_lazy = tbl.to_pandas()\n\n# Materialize blob bytes eagerly\ndf_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n# Return descriptors instead of payloads\ndf_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n# Forward extra kwargs to PyArrow's to_pandas\ndf_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n";
export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";
@@ -16,7 +16,7 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport
export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";
-export const PyQueryToPandasKwargs = "import tempfile\nfrom pathlib import Path\n\nimport lancedb\nimport numpy as np\nimport pyarrow as pa\n\nwith tempfile.TemporaryDirectory() as tmpdir:\n db = lancedb.connect(str(Path(tmpdir) / \"sample-lancedb\"))\n schema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n ])\n tbl = db.create_table(\n \"search_demo\",\n data=[\n {\"id\": i, \"vector\": np.random.rand(128).astype(np.float32)}\n for i in range(10)\n ],\n schema=schema,\n mode=\"overwrite\",\n )\n\n query_vector = np.random.rand(128).astype(np.float32)\n df = (\n tbl.search(query_vector)\n .limit(10)\n .to_pandas(split_blocks=True, self_destruct=True)\n )\n";
+export const PyQueryToPandasKwargs = "df = (\n tbl.search(query_vector)\n .limit(10)\n .to_pandas(split_blocks=True, self_destruct=True)\n)\n";
export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";
diff --git a/tests/py/test_multimodal.py b/tests/py/test_multimodal.py
index 9719a3e..7cf5e8e 100644
--- a/tests/py/test_multimodal.py
+++ b/tests/py/test_multimodal.py
@@ -122,46 +122,38 @@ def test_blob_api_definition(db_path_factory):
assert len(tbl) == 2
-def test_blob_api_to_pandas():
+def test_blob_api_to_pandas(db_path_factory):
+ db = lancedb.connect(db_path_factory("blob_to_pandas_db"))
+ schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field(
+ "video",
+ pa.large_binary(),
+ metadata={"lance-encoding:blob": "true"},
+ ),
+ ])
+ tbl = db.create_table(
+ "videos",
+ data=[
+ {"id": 1, "video": b"fake_video_bytes_1"},
+ {"id": 2, "video": b"fake_video_bytes_2"},
+ ],
+ schema=schema,
+ mode="overwrite",
+ )
+
# --8<-- [start:blob_api_to_pandas]
- import tempfile
- from pathlib import Path
+ # Default: blob columns come back lazily
+ df_lazy = tbl.to_pandas()
- import lancedb
- import pyarrow as pa
+ # Materialize blob bytes eagerly
+ df_bytes = tbl.to_pandas(blob_mode="bytes")
- with tempfile.TemporaryDirectory() as tmpdir:
- # Create a filesystem-backed blob table.
- db = lancedb.connect(str(Path(tmpdir) / "sample-lancedb"))
- schema = pa.schema([
- pa.field("id", pa.int64()),
- pa.field(
- "video",
- pa.large_binary(),
- metadata={"lance-encoding:blob": "true"},
- ),
- ])
- tbl = db.create_table(
- "videos",
- data=[
- {"id": 1, "video": b"fake_video_bytes_1"},
- {"id": 2, "video": b"fake_video_bytes_2"},
- ],
- schema=schema,
- mode="overwrite",
- )
-
- # Default: blob columns come back lazily
- df_lazy = tbl.to_pandas()
-
- # Materialize blob bytes eagerly
- df_bytes = tbl.to_pandas(blob_mode="bytes")
-
- # Return descriptors instead of payloads
- df_desc = tbl.to_pandas(blob_mode="descriptions")
-
- # Forward extra kwargs to PyArrow's to_pandas
- df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
+ # Return descriptors instead of payloads
+ df_desc = tbl.to_pandas(blob_mode="descriptions")
+
+ # Forward extra kwargs to PyArrow's to_pandas
+ df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
# --8<-- [end:blob_api_to_pandas]
assert len(df_lazy) == 2
@@ -174,37 +166,29 @@ def test_blob_api_to_pandas():
assert len(df_typed) == 2
-def test_query_to_pandas_kwargs():
- # --8<-- [start:query_to_pandas_kwargs]
- import tempfile
- from pathlib import Path
-
- import lancedb
- import numpy as np
- import pyarrow as pa
+def test_query_to_pandas_kwargs(db_path_factory):
+ db = lancedb.connect(db_path_factory("query_to_pandas_db"))
+ schema = pa.schema([
+ pa.field("id", pa.int64()),
+ pa.field("vector", pa.list_(pa.float32(), 128)),
+ ])
+ tbl = db.create_table(
+ "search_demo",
+ data=[
+ {"id": i, "vector": np.random.rand(128).astype(np.float32)}
+ for i in range(10)
+ ],
+ schema=schema,
+ mode="overwrite",
+ )
+ query_vector = np.random.rand(128).astype(np.float32)
- with tempfile.TemporaryDirectory() as tmpdir:
- db = lancedb.connect(str(Path(tmpdir) / "sample-lancedb"))
- schema = pa.schema([
- pa.field("id", pa.int64()),
- pa.field("vector", pa.list_(pa.float32(), 128)),
- ])
- tbl = db.create_table(
- "search_demo",
- data=[
- {"id": i, "vector": np.random.rand(128).astype(np.float32)}
- for i in range(10)
- ],
- schema=schema,
- mode="overwrite",
- )
-
- query_vector = np.random.rand(128).astype(np.float32)
- df = (
- tbl.search(query_vector)
- .limit(10)
- .to_pandas(split_blocks=True, self_destruct=True)
- )
+ # --8<-- [start:query_to_pandas_kwargs]
+ df = (
+ tbl.search(query_vector)
+ .limit(10)
+ .to_pandas(split_blocks=True, self_destruct=True)
+ )
# --8<-- [end:query_to_pandas_kwargs]
assert len(df) == 10