Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public static final class Builder {
private Integer maxNgramLength;
private Boolean prefixOnly;
private Boolean skipMerge;
private Integer formatVersion;

/**
* Configure the base tokenizer.
Expand Down Expand Up @@ -236,6 +237,23 @@ public Builder skipMerge(boolean skipMerge) {
return this;
}

/**
* Configure the on-disk FTS format version to write when creating a new index.
*
* <p>If unset, Lance chooses the current default format.
*
* @param formatVersion FTS format version, must be 1 or 2
* @return this builder
* @throws IllegalArgumentException
*/
public Builder formatVersion(int formatVersion) {
if (formatVersion != 1 && formatVersion != 2) {
throw new IllegalArgumentException("formatVersion must be 1 or 2");
}
this.formatVersion = formatVersion;
return this;
}

/** Build a {@link ScalarIndexParams} instance for an inverted index. */
public ScalarIndexParams build() {
Map<String, Object> params = new HashMap<>();
Expand Down Expand Up @@ -283,6 +301,9 @@ public ScalarIndexParams build() {
if (skipMerge != null) {
params.put("skip_merge", skipMerge);
}
if (formatVersion != null) {
params.put("format_version", formatVersion);
}

String json = JsonUtils.toJson(params);
return ScalarIndexParams.create(INDEX_TYPE, json);
Expand Down
8 changes: 8 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3134,6 +3134,7 @@ def create_scalar_index(
fragment_ids: Optional[List[int]] = None,
index_uuid: Optional[str] = None,
progress_callback: Optional[Callable[[IndexProgress], None]] = None,
format_version: Optional[Union[int, str]] = None,
**kwargs,
):
"""Create a scalar index on a column.
Expand Down Expand Up @@ -3239,6 +3240,11 @@ def create_scalar_index(
progress_callback : callable, optional
A callback that receives :class:`lance.progress.IndexProgress` events while
the index is being built.
format_version: int or str, optional
This is for the ``INVERTED`` / ``FTS`` index. Explicit on-disk FTS
format version to write when creating a new index. Accepts ``1``,
``2``, ``"v1"``, or ``"v2"``. If unset, Lance chooses the current
default format.

with_position: bool, default False
This is for the ``INVERTED`` index. If True, the index will store the
Expand Down Expand Up @@ -3344,6 +3350,8 @@ def create_scalar_index(
kwargs["index_uuid"] = index_uuid
if progress_callback is not None:
kwargs["progress_callback"] = progress_callback
if format_version is not None:
kwargs["format_version"] = format_version

self._ds.create_index([column], index_type, name, replace, train, None, kwargs)

Expand Down
21 changes: 13 additions & 8 deletions python/python/tests/compat/compat_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
"ignore the index" mode to diff against, so its oracle reconstructs ground truth from a
full scan: tokenize every live row, then require an FTS search for a spread of sampled
terms to return exactly the rows that contain them. The FTS scenarios run under both
on-disk format versions (LANCE_FTS_FORMAT_VERSION 1 and 2), which take different merge
paths.
on-disk format versions (1 and 2), which take different merge paths. New Lance versions
pin this with the create-index `format_version` parameter; old Lance versions still use
`LANCE_FTS_FORMAT_VERSION`.

The op vocabulary and bounds are deliberately small so the search is runnable; this is
exhaustive over the maintenance-lifecycle grammar up to the configured lengths, not over
Expand Down Expand Up @@ -63,11 +64,12 @@ def describe(kind, from_ref, to_ref, setup_ops, exercise_ops, fts_version=None):
class IndexScenario:
"""A picklable, kind-parameterized scenario run across a version split."""

def __init__(self, kind, path, setup_ops, exercise_ops):
def __init__(self, kind, path, setup_ops, exercise_ops, fts_version=None):
self.kind = kind
self.path = str(path)
self.setup_ops = list(setup_ops)
self.exercise_ops = list(exercise_ops)
self.fts_version = fts_version
self.next_idx = 0

# --- in-venv helpers (only lance + pyarrow available) ---
Expand Down Expand Up @@ -123,6 +125,8 @@ def _op_W(self):

def _op_I(self):
kwargs = {"with_position": True} if self.kind == "INVERTED" else {}
if self.kind == "INVERTED" and self.fts_version is not None:
kwargs["format_version"] = int(self.fts_version)
self._open().create_scalar_index("key", self._index_type(), **kwargs)

def _op_D(self):
Expand Down Expand Up @@ -234,9 +238,10 @@ def search(
"""Search index-maintenance sequences up to `max_length` ops for one `kind`, across
(from_ref -> to_ref). Runs only scenarios in this shard (i % num_shards == shard) so
the space can be split across parallel workers. For INVERTED, `fts_version` ("1" or
"2") pins the on-disk FTS format (LANCE_FTS_FORMAT_VERSION) on both sides; both are
Fst token sets and exercise distinct merge paths. Returns failures; stops on the
first when `stop_on_first`."""
"2") pins the on-disk FTS format on both sides. New Lance versions receive this
through the create-index parameter and old Lance versions receive it through
LANCE_FTS_FORMAT_VERSION. Both are Fst token sets and exercise distinct merge paths.
Returns failures; stops on the first when `stop_on_first`."""
from_venv = venv_factory.get_venv(from_ref)
to_venv = venv_factory.get_venv(to_ref)
env = {}
Expand All @@ -256,7 +261,7 @@ def search(
if key not in snapshots:
snap = base / f"snap_{kind}_{len(snapshots)}"
shutil.rmtree(snap, ignore_errors=True)
builder = IndexScenario(kind, snap, setup_tail, [])
builder = IndexScenario(kind, snap, setup_tail, [], fts_version)
try:
next_idx = from_venv.execute_method(builder, "setup", env)
snapshots[key] = (snap, next_idx)
Expand All @@ -277,7 +282,7 @@ def search(
ex_path = base / f"ex_{kind}_{i}"
shutil.rmtree(ex_path, ignore_errors=True)
shutil.copytree(snap, ex_path)
scenario = IndexScenario(kind, ex_path, setup_tail, exercise)
scenario = IndexScenario(kind, ex_path, setup_tail, exercise, fts_version)
scenario.next_idx = next_idx
label = describe(kind, from_ref, to_ref, setup_tail, exercise, fts_version)
try:
Expand Down
10 changes: 5 additions & 5 deletions python/python/tests/compat/test_scalar_indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,9 @@ def create(self):
max_rows_per_file=100,
data_storage_version=safe_data_storage_version(self.compat_version),
)
dataset.create_scalar_index("text", "INVERTED", with_position=True)
dataset.create_scalar_index(
"text", "INVERTED", with_position=True, format_version=1
)

def check_read(self):
"""Verify FTS index can be queried."""
Expand Down Expand Up @@ -349,9 +351,7 @@ def check_write(self):
def skip_downgrade(self, version: str) -> bool:
return version.startswith("0.")

def current_env(self, method_name: str) -> dict[str, str]:
if method_name == "create":
def compat_env(self, version: str, method_name: str) -> dict[str, str]:
if method_name in {"create", "check_write"}:
return {"LANCE_FTS_FORMAT_VERSION": "1"}
if method_name == "check_write":
return {"LANCE_FTS_FORMAT_VERSION": "2"}
return {}
31 changes: 26 additions & 5 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4802,9 +4802,11 @@ def test_json_inverted_match_query(tmp_path):
assert results.num_rows == 1


@pytest.mark.parametrize("fts_format_version", ["1", "2"])
def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
monkeypatch.setenv("LANCE_FTS_FORMAT_VERSION", fts_format_version)
@pytest.mark.parametrize(
("format_version", "expected_format_version"),
[(1, 1), (2, 2), ("v1", 1), ("v2", 2)],
)
def test_describe_indices(tmp_path, format_version, expected_format_version):
data = pa.table(
{
"id": range(100),
Expand All @@ -4820,7 +4822,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
}
)
ds = lance.write_dataset(data, tmp_path)
ds.create_scalar_index("text", index_type="INVERTED")
ds.create_scalar_index("text", index_type="INVERTED", format_version=format_version)
indices = ds.describe_indices()
assert len(indices) == 1

Expand All @@ -4834,7 +4836,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
assert indices[0].segments[0].uuid is not None
assert indices[0].segments[0].fragment_ids == {0}
assert indices[0].segments[0].dataset_version_at_last_update == 1
assert indices[0].segments[0].index_version == int(fts_format_version)
assert indices[0].segments[0].index_version == expected_format_version
assert indices[0].segments[0].created_at is not None
assert isinstance(indices[0].segments[0].created_at, datetime)
assert indices[0].segments[0].size_bytes is not None
Expand Down Expand Up @@ -4933,6 +4935,25 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
assert index.num_rows_indexed == 50


def test_create_inverted_index_defaults_to_v2_and_ignores_env(tmp_path, monkeypatch):
monkeypatch.setenv("LANCE_FTS_FORMAT_VERSION", "1")
data = pa.table({"text": ["document about lance database"]})
ds = lance.write_dataset(data, tmp_path)

ds.create_scalar_index("text", index_type="INVERTED")

indices = ds.describe_indices()
assert indices[0].segments[0].index_version == 2


def test_create_inverted_index_rejects_invalid_format_version(tmp_path):
data = pa.table({"text": ["document about lance database"]})
ds = lance.write_dataset(data, tmp_path)

with pytest.raises(ValueError, match="unsupported FTS format version"):
ds.create_scalar_index("text", index_type="INVERTED", format_version="v3")


def test_vector_filter_fts_search(tmp_path):
# Create test data
ids = list(range(1, 301))
Expand Down
18 changes: 18 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ use lance_index::{
FtsPrewarmOptions, IndexParams, IndexType, PrewarmOptions,
optimize::OptimizeOptions,
progress::{IndexBuildProgress, NoopIndexBuildProgress},
scalar::inverted::InvertedListFormatVersion,
scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams},
vector::{
ApproxMode, DEFAULT_QUERY_PARALLELISM, Query as VectorQuery,
Expand Down Expand Up @@ -2298,6 +2299,23 @@ impl Dataset {
if let Some(num_workers) = kwargs.get_item("num_workers")? {
params = params.num_workers(num_workers.extract()?);
}
if let Some(format_version) = kwargs.get_item("format_version")?
&& !format_version.is_none()
{
let value = if let Ok(value) = format_version.cast::<PyString>() {
value.to_string_lossy().to_string()
} else if let Ok(value) = format_version.extract::<u32>() {
value.to_string()
} else {
return Err(PyValueError::new_err(
"format_version must be 1, 2, 'v1', or 'v2'",
));
};
let format_version = value
.parse::<InvertedListFormatVersion>()
.map_err(|err| PyValueError::new_err(err.to_string()))?;
params = params.format_version(format_version);
}
}
Box::new(params)
}
Expand Down
3 changes: 2 additions & 1 deletion rust/lance-index/src/scalar/inverted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,15 @@ impl InvertedIndexPlugin {
}
});

let format_version = params.resolved_format_version();
let details = pbold::InvertedIndexDetails::try_from(&params)?;
let mut inverted_index =
InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask)
.with_progress(progress);
let files = inverted_index.update(data, index_store, None).await?;
Ok(CreatedIndex {
index_details: prost_types::Any::from_msg(&details).unwrap(),
index_version: current_fts_format_version().index_version(),
index_version: format_version.index_version(),
files,
})
}
Expand Down
16 changes: 13 additions & 3 deletions rust/lance-index/src/scalar/inverted/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,15 +246,16 @@ impl InvertedIndexBuilder {
fragment_mask: Option<u64>,
deleted_fragments: RoaringBitmap,
) -> Self {
let format_version = params.resolved_format_version();
Self {
params,
partitions,
new_partitions: Vec::new(),
src_store: store,
token_set_format,
fragment_mask,
format_version: current_fts_format_version(),
posting_tail_codec: current_fts_format_version().posting_tail_codec(),
format_version,
posting_tail_codec: format_version.posting_tail_codec(),
progress: noop_progress(),
deleted_fragments,
}
Expand Down Expand Up @@ -3217,7 +3218,8 @@ mod tests {
token_set_format,
None,
RoaringBitmap::new(),
);
)
.with_format_version(InvertedListFormatVersion::V1);
builder.write(dest_store.as_ref()).await?;

let metadata_reader = dest_store.open_index_file(METADATA_FILE).await?;
Expand Down Expand Up @@ -3281,6 +3283,14 @@ mod tests {
.await?;

let index = InvertedIndex::load(src_store, None, &LanceCache::no_cache()).await?;
let derived_params = index.derive_index_params()?;
let derived_params: InvertedIndexParams =
serde_json::from_str(derived_params.params.as_deref().unwrap())?;
assert_eq!(
derived_params.format_version,
Some(InvertedListFormatVersion::V1)
);

let schema = Arc::new(Schema::new(vec![
Field::new("doc", DataType::Utf8, true),
Field::new(ROW_ID, DataType::UInt64, false),
Expand Down
Loading
Loading