Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/snippets/multimodal.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ export const PyBlobApiIngest = "import lancedb\n\ndb = lancedb.connect(db_path_f

export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";

export const PyBlobApiToPandas = "# Default: blob columns come back lazily\ndf_lazy = tbl.to_pandas()\n\n# Materialize blob bytes eagerly\ndf_bytes = tbl.to_pandas(blob_mode=\"bytes\")\n\n# Return descriptors instead of payloads\ndf_desc = tbl.to_pandas(blob_mode=\"descriptions\")\n\n# Forward extra kwargs to PyArrow's to_pandas\ndf_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)\n";

export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";

export const PyDefineSchema = "# Define schema explicitly to ensure image_blob is treated as binary\nschema = pa.schema([\n pa.field(\"id\", pa.int32()),\n pa.field(\"filename\", pa.string()),\n pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n pa.field(\"label\", pa.string())\n])\n";
Expand All @@ -14,6 +16,8 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport

export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";

export const PyQueryToPandasKwargs = "df = (\n tbl.search(query_vector)\n .limit(10)\n .to_pandas(split_blocks=True, self_destruct=True)\n)\n";

export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";

export const TsBlobApiIngest = "const blobData = lancedb.makeArrowTable(\n [\n { id: 1, video: Buffer.from(\"fake_video_bytes_1\") },\n { id: 2, video: Buffer.from(\"fake_video_bytes_2\") },\n ],\n { schema: blobSchema },\n);\nconst blobTable = await db.createTable(\"videos\", blobData, {\n mode: \"overwrite\",\n});\n";
Expand Down
30 changes: 30 additions & 0 deletions docs/tables/multimodal.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import {
PyBlobApiIngest as BlobApiIngest,
TsBlobApiIngest as TsBlobApiIngest,
RsBlobApiIngest as RsBlobApiIngest,
PyBlobApiToPandas as BlobApiToPandas,
PyQueryToPandasKwargs as QueryToPandasKwargs,
} from '/snippets/multimodal.mdx';

LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.
Expand Down Expand Up @@ -194,6 +196,34 @@ For more advanced usage, including random access and file-like reading of blobs,
Lance format's [blob API documentation](https://lance.org/guide/blob/).
</Card>

### 3. Convert blob tables to pandas

When you call `to_pandas()` on a local LanceDB table that contains Blob API columns, the `blob_mode` argument controls how those columns materialize. This is available in the Python SDK on local tables; remote tables raise `NotImplementedError`.

`blob_mode` accepts:

- `"lazy"` (default): returns blob columns without eagerly materializing their payloads. Use this when you want the existing behavior and don't need to inspect blob metadata. For namespace-managed tables and in-memory datasets, lazy mode falls back to the standard PyArrow `to_pandas()` path.
- `"bytes"`: eagerly materializes each blob as `bytes`. Use this when you need the raw payload in the DataFrame, for example to decode an image or audio clip in-process.
- `"descriptions"`: returns blob descriptors (offsets, sizes, and positions) instead of the data itself. Use this when you want to plan I/O without paying the cost of loading every blob.

`"bytes"` and `"descriptions"` require a filesystem-backed Lance dataset; they are not supported on in-memory tables or namespace-managed tables.

Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas conversion, so you can also pass options like `split_blocks` or `self_destruct`:

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{BlobApiToPandas}
</CodeBlock>
</CodeGroup>

Query builders accept the same PyArrow `to_pandas` kwargs, but not `blob_mode`. They materialize Arrow results first, apply LanceDB-specific `flatten` and `timeout` handling, and then forward the remaining kwargs:

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{QueryToPandasKwargs}
</CodeBlock>
</CodeGroup>

## Other modalities

The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:
Expand Down
72 changes: 72 additions & 0 deletions tests/py/test_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,75 @@ def test_blob_api_definition(db_path_factory):
tbl = db.create_table("videos", data=data, schema=schema)
# --8<-- [end:blob_api_ingest]
assert len(tbl) == 2


def test_blob_api_to_pandas(db_path_factory):
    """Check `to_pandas()` on a table whose `video` column is blob-encoded.

    Builds a two-row table, converts it to pandas with the default mode,
    with `blob_mode="bytes"`, with `blob_mode="descriptions"`, and with
    extra keyword arguments, then verifies row counts and the byte payloads.

    Args:
        db_path_factory: fixture called with a database name; its result is
            passed to `lancedb.connect`.
    """
    connection = lancedb.connect(db_path_factory("blob_to_pandas_db"))

    # The "lance-encoding:blob" metadata entry is what flags the column for
    # the Blob API.
    video_field = pa.field(
        "video",
        pa.large_binary(),
        metadata={"lance-encoding:blob": "true"},
    )
    blob_schema = pa.schema([pa.field("id", pa.int64()), video_field])
    rows = [
        {"id": 1, "video": b"fake_video_bytes_1"},
        {"id": 2, "video": b"fake_video_bytes_2"},
    ]
    tbl = connection.create_table(
        "videos",
        data=rows,
        schema=blob_schema,
        mode="overwrite",
    )

    # NOTE(review): the region between the --8<-- markers looks like it is
    # extracted verbatim into the docs snippets; keep its text and the names
    # it uses (`tbl`, `df_*`) unchanged -- confirm with the snippet tooling.
    # --8<-- [start:blob_api_to_pandas]
    # Default: blob columns come back lazily
    df_lazy = tbl.to_pandas()

    # Materialize blob bytes eagerly
    df_bytes = tbl.to_pandas(blob_mode="bytes")

    # Return descriptors instead of payloads
    df_desc = tbl.to_pandas(blob_mode="descriptions")

    # Forward extra kwargs to PyArrow's to_pandas
    df_typed = tbl.to_pandas(split_blocks=True, self_destruct=True)
    # --8<-- [end:blob_api_to_pandas]

    assert len(df_lazy) == 2

    # Eager mode must hand back real `bytes` objects with the ingested payloads.
    video_column = df_bytes["video"]
    assert isinstance(video_column.iloc[0], bytes)
    assert video_column.tolist() == [
        b"fake_video_bytes_1",
        b"fake_video_bytes_2",
    ]

    assert len(df_desc) == 2
    assert len(df_typed) == 2


def test_query_to_pandas_kwargs(db_path_factory):
    """Check that a search query's `to_pandas()` accepts extra keyword arguments.

    Creates a ten-row table of random 128-dimensional float32 vectors, runs a
    vector search limited to ten results, converts it with `split_blocks` and
    `self_destruct`, and verifies that all ten rows come back.

    Args:
        db_path_factory: fixture called with a database name; its result is
            passed to `lancedb.connect`.
    """
    connection = lancedb.connect(db_path_factory("query_to_pandas_db"))

    vector_schema = pa.schema([
        pa.field("id", pa.int64()),
        pa.field("vector", pa.list_(pa.float32(), 128)),
    ])

    # Row vectors are drawn before the query vector, same as a comprehension
    # placed ahead of the query-vector assignment would do.
    rows = []
    for row_id in range(10):
        rows.append(
            {"id": row_id, "vector": np.random.rand(128).astype(np.float32)}
        )

    tbl = connection.create_table(
        "search_demo",
        data=rows,
        schema=vector_schema,
        mode="overwrite",
    )
    query_vector = np.random.rand(128).astype(np.float32)

    # NOTE(review): the region between the --8<-- markers looks like it is
    # extracted verbatim into the docs snippets; keep its text and the names
    # it uses (`tbl`, `query_vector`, `df`) unchanged -- confirm with the
    # snippet tooling.
    # --8<-- [start:query_to_pandas_kwargs]
    df = (
        tbl.search(query_vector)
        .limit(10)
        .to_pandas(split_blocks=True, self_destruct=True)
    )
    # --8<-- [end:query_to_pandas_kwargs]

    # The table holds exactly ten rows and the limit is ten, so every row
    # should be returned.
    assert len(df) == 10
Loading