Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion docs/indexing/fts-index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: "Create and tune BM25-based full-text search indexes in LanceDB."
icon: "book"
mode: "wide"
---
import { PyFtsIndexAsync as FtsIndexAsync, PyFtsIndexCreate as FtsIndexCreate, PyFtsIndexWait as FtsIndexWait } from '/snippets/indexing.mdx';
import { PyFtsIndexAsync as FtsIndexAsync, PyFtsIndexCreate as FtsIndexCreate, PyFtsIndexNested as FtsIndexNested, PyFtsIndexWait as FtsIndexWait } from '/snippets/indexing.mdx';

LanceDB provides performant full-text search based on BM25, allowing you to incorporate keyword-based search in your retrieval solutions. This page shows
examples of how to create and configure FTS indexes in LanceDB OSS and Enterprise, using the synchronous and asynchronous APIs.
Expand Down Expand Up @@ -48,6 +48,32 @@ When using async connections (`connect_async`), use `create_index` with the `FTS
The `create_fts_index` method is not available on `AsyncTable`. Use `create_index` with `FTS` config instead.
</Note>

## Nested field paths

To build an FTS index on a text leaf inside a struct column, pass a dotted path to that leaf (for example, `payload.text`). The same path works in [`MatchQuery`](/search/full-text-search) and [`PhraseQuery`](/search/full-text-search), and in the `columns` argument of async `nearest_to_text` queries.

You can point an index at any string leaf nested in a struct, regardless of depth. The struct container itself isn't indexable: you have to name a specific text field.

<CodeGroup>
<CodeBlock filename="Python" language="Python" icon="python">
{FtsIndexNested}
</CodeBlock>
</CodeGroup>

LanceDB rejects paths that don't resolve to a text leaf:

- A struct container (for example, `payload`): raises `ValueError: FTS index cannot be created ...`.
- A non-text leaf such as an integer or float (for example, `payload.count`): raises the same error.
- A path that doesn't exist in the schema (for example, `payload.missing`): raises `ValueError: Field path ... not found`.

The async API accepts the same dotted paths through `create_index`:

```python Python icon="python"
from lancedb.index import FTS

await async_table.create_index("payload.text", config=FTS(with_position=True))
```

## Configuration Options

### FTS Parameters
Expand Down
2 changes: 2 additions & 0 deletions docs/snippets/indexing.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ export const PyFtsIndexAsync = "import asyncio\n\nimport lancedb\nimport polars

export const PyFtsIndexCreate = "table_name = \"fts-index-create\"\ntable = db.open_table(table_name)\ntable.create_fts_index(\"text\")\n";

export const PyFtsIndexNested = "from lancedb.query import MatchQuery, PhraseQuery\n\ntable = db.open_table(\"fts-index-nested\")\n\n# Index a text leaf inside a struct column using a dotted path.\ntable.create_fts_index(\"payload.text\", with_position=True)\n\n# The same dotted path works in MatchQuery and PhraseQuery.\nmatches = (\n table.search(MatchQuery(\"puppy\", \"payload.text\")).limit(5).to_list()\n)\nphrases = (\n table.search(PhraseQuery(\"puppy runs\", \"payload.text\"))\n .limit(5)\n .to_list()\n)\n";

export const PyFtsIndexWait = "table_name = \"fts-index-wait\"\n\ntable = db.open_table(table_name)\ntable.create_fts_index(\"text\")\n\nindex_name = \"text_idx\"\ntable.wait_for_index([index_name])\n";

export const PyGpuIndexCuda = "table.create_index(\n num_partitions=256,\n num_sub_vectors=96,\n accelerator=\"cuda\",\n)\n";
Expand Down
53 changes: 53 additions & 0 deletions tests/py/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,59 @@ def test_fts_index_wait(tmp_db):
assert table.list_indices()


def test_fts_index_nested_field(tmp_db):
    """Index a string leaf of a struct column by dotted path and query it.

    Seeds a two-row ``fts-index-nested`` table whose ``payload`` struct
    column carries a ``text`` string leaf and a ``count`` int32 leaf, builds
    an FTS index on ``payload.text``, and then checks that a match query and
    a phrase query against that same path each return at least one row and
    only rows whose text contains the searched term.
    """
    payload_type = pa.struct(
        [
            pa.field("text", pa.string()),
            pa.field("count", pa.int32()),
        ]
    )
    table_schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field("payload", payload_type),
        ]
    )

    payload_rows = [
        {"text": "Frodo was a happy puppy", "count": 1},
        {"text": "puppy runs through the meadow", "count": 2},
    ]
    seed_columns = {
        "id": pa.array([1, 2], pa.int64()),
        "payload": pa.array(payload_rows, type=payload_type),
    }
    seed_data = pa.table(seed_columns, schema=table_schema)
    tmp_db.create_table("fts-index-nested", seed_data, mode="overwrite")

    # The snippet region refers to the connection as `db`, so alias the
    # fixture here, outside the markers. The code between the markers is
    # mirrored as the PyFtsIndexNested docs snippet; keep it verbatim.
    db = tmp_db
    # --8<-- [start:fts_index_nested]
    from lancedb.query import MatchQuery, PhraseQuery

    table = db.open_table("fts-index-nested")

    # Index a text leaf inside a struct column using a dotted path.
    table.create_fts_index("payload.text", with_position=True)

    # The same dotted path works in MatchQuery and PhraseQuery.
    matches = (
        table.search(MatchQuery("puppy", "payload.text")).limit(5).to_list()
    )
    phrases = (
        table.search(PhraseQuery("puppy runs", "payload.text"))
        .limit(5)
        .to_list()
    )
    # --8<-- [end:fts_index_nested]

    # Each query must return something, and nothing unrelated.
    assert matches
    for hit in matches:
        assert "puppy" in hit["payload"]["text"]
    assert phrases
    for hit in phrases:
        assert "puppy runs" in hit["payload"]["text"]


@pytest.mark.asyncio
async def test_fts_index_async(tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)
Expand Down
Loading