diff --git a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
index ca0a7a46c70..d58cd71ece2 100755
--- a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
+++ b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
@@ -54,6 +54,7 @@ public static final class Builder {
private Integer maxNgramLength;
private Boolean prefixOnly;
private Boolean skipMerge;
+ private Integer formatVersion;
/**
* Configure the base tokenizer.
@@ -236,6 +237,23 @@ public Builder skipMerge(boolean skipMerge) {
return this;
}
+ /**
+ * Configure the on-disk FTS format version to write when creating a new index.
+ *
+ *
If unset, Lance chooses the current default format.
+ *
+ * @param formatVersion FTS format version, must be 1 or 2
+ * @return this builder
+ * @throws IllegalArgumentException
+ */
+ public Builder formatVersion(int formatVersion) {
+ if (formatVersion != 1 && formatVersion != 2) {
+ throw new IllegalArgumentException("formatVersion must be 1 or 2");
+ }
+ this.formatVersion = formatVersion;
+ return this;
+ }
+
/** Build a {@link ScalarIndexParams} instance for an inverted index. */
public ScalarIndexParams build() {
Map params = new HashMap<>();
@@ -283,6 +301,9 @@ public ScalarIndexParams build() {
if (skipMerge != null) {
params.put("skip_merge", skipMerge);
}
+ if (formatVersion != null) {
+ params.put("format_version", formatVersion);
+ }
String json = JsonUtils.toJson(params);
return ScalarIndexParams.create(INDEX_TYPE, json);
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 45dc1b253d3..369b3648655 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -3134,6 +3134,7 @@ def create_scalar_index(
fragment_ids: Optional[List[int]] = None,
index_uuid: Optional[str] = None,
progress_callback: Optional[Callable[[IndexProgress], None]] = None,
+ format_version: Optional[Union[int, str]] = None,
**kwargs,
):
"""Create a scalar index on a column.
@@ -3239,6 +3240,11 @@ def create_scalar_index(
progress_callback : callable, optional
A callback that receives :class:`lance.progress.IndexProgress` events while
the index is being built.
+ format_version: int or str, optional
+ This is for the ``INVERTED`` / ``FTS`` index. Explicit on-disk FTS
+ format version to write when creating a new index. Accepts ``1``,
+ ``2``, ``"v1"``, or ``"v2"``. If unset, Lance chooses the current
+ default format.
with_position: bool, default False
This is for the ``INVERTED`` index. If True, the index will store the
@@ -3344,6 +3350,8 @@ def create_scalar_index(
kwargs["index_uuid"] = index_uuid
if progress_callback is not None:
kwargs["progress_callback"] = progress_callback
+ if format_version is not None:
+ kwargs["format_version"] = format_version
self._ds.create_index([column], index_type, name, replace, train, None, kwargs)
diff --git a/python/python/tests/compat/compat_sequence.py b/python/python/tests/compat/compat_sequence.py
index bec216d6a6f..48df46500ca 100644
--- a/python/python/tests/compat/compat_sequence.py
+++ b/python/python/tests/compat/compat_sequence.py
@@ -18,8 +18,9 @@
"ignore the index" mode to diff against, so its oracle reconstructs ground truth from a
full scan: tokenize every live row, then require an FTS search for a spread of sampled
terms to return exactly the rows that contain them. The FTS scenarios run under both
-on-disk format versions (LANCE_FTS_FORMAT_VERSION 1 and 2), which take different merge
-paths.
+on-disk format versions (1 and 2), which take different merge paths. New Lance versions
+pin this with the create-index `format_version` parameter; old Lance versions still use
+`LANCE_FTS_FORMAT_VERSION`.
The op vocabulary and bounds are deliberately small so the search is runnable; this is
exhaustive over the maintenance-lifecycle grammar up to the configured lengths, not over
@@ -63,11 +64,12 @@ def describe(kind, from_ref, to_ref, setup_ops, exercise_ops, fts_version=None):
class IndexScenario:
"""A picklable, kind-parameterized scenario run across a version split."""
- def __init__(self, kind, path, setup_ops, exercise_ops):
+ def __init__(self, kind, path, setup_ops, exercise_ops, fts_version=None):
self.kind = kind
self.path = str(path)
self.setup_ops = list(setup_ops)
self.exercise_ops = list(exercise_ops)
+ self.fts_version = fts_version
self.next_idx = 0
# --- in-venv helpers (only lance + pyarrow available) ---
@@ -123,6 +125,8 @@ def _op_W(self):
def _op_I(self):
kwargs = {"with_position": True} if self.kind == "INVERTED" else {}
+ if self.kind == "INVERTED" and self.fts_version is not None:
+ kwargs["format_version"] = int(self.fts_version)
self._open().create_scalar_index("key", self._index_type(), **kwargs)
def _op_D(self):
@@ -234,9 +238,10 @@ def search(
"""Search index-maintenance sequences up to `max_length` ops for one `kind`, across
(from_ref -> to_ref). Runs only scenarios in this shard (i % num_shards == shard) so
the space can be split across parallel workers. For INVERTED, `fts_version` ("1" or
- "2") pins the on-disk FTS format (LANCE_FTS_FORMAT_VERSION) on both sides; both are
- Fst token sets and exercise distinct merge paths. Returns failures; stops on the
- first when `stop_on_first`."""
+ "2") pins the on-disk FTS format on both sides. New Lance versions receive this
+ through the create-index parameter and old Lance versions receive it through
+ LANCE_FTS_FORMAT_VERSION. Both are Fst token sets and exercise distinct merge paths.
+ Returns failures; stops on the first when `stop_on_first`."""
from_venv = venv_factory.get_venv(from_ref)
to_venv = venv_factory.get_venv(to_ref)
env = {}
@@ -256,7 +261,7 @@ def search(
if key not in snapshots:
snap = base / f"snap_{kind}_{len(snapshots)}"
shutil.rmtree(snap, ignore_errors=True)
- builder = IndexScenario(kind, snap, setup_tail, [])
+ builder = IndexScenario(kind, snap, setup_tail, [], fts_version)
try:
next_idx = from_venv.execute_method(builder, "setup", env)
snapshots[key] = (snap, next_idx)
@@ -277,7 +282,7 @@ def search(
ex_path = base / f"ex_{kind}_{i}"
shutil.rmtree(ex_path, ignore_errors=True)
shutil.copytree(snap, ex_path)
- scenario = IndexScenario(kind, ex_path, setup_tail, exercise)
+ scenario = IndexScenario(kind, ex_path, setup_tail, exercise, fts_version)
scenario.next_idx = next_idx
label = describe(kind, from_ref, to_ref, setup_tail, exercise, fts_version)
try:
diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py
index 35022df3b12..c3bf301eee0 100644
--- a/python/python/tests/compat/test_scalar_indices.py
+++ b/python/python/tests/compat/test_scalar_indices.py
@@ -320,7 +320,9 @@ def create(self):
max_rows_per_file=100,
data_storage_version=safe_data_storage_version(self.compat_version),
)
- dataset.create_scalar_index("text", "INVERTED", with_position=True)
+ dataset.create_scalar_index(
+ "text", "INVERTED", with_position=True, format_version=1
+ )
def check_read(self):
"""Verify FTS index can be queried."""
@@ -349,9 +351,7 @@ def check_write(self):
def skip_downgrade(self, version: str) -> bool:
return version.startswith("0.")
- def current_env(self, method_name: str) -> dict[str, str]:
- if method_name == "create":
+ def compat_env(self, version: str, method_name: str) -> dict[str, str]:
+ if method_name in {"create", "check_write"}:
return {"LANCE_FTS_FORMAT_VERSION": "1"}
- if method_name == "check_write":
- return {"LANCE_FTS_FORMAT_VERSION": "2"}
return {}
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index b6e882633f5..348d615d3da 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -4802,9 +4802,11 @@ def test_json_inverted_match_query(tmp_path):
assert results.num_rows == 1
-@pytest.mark.parametrize("fts_format_version", ["1", "2"])
-def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
- monkeypatch.setenv("LANCE_FTS_FORMAT_VERSION", fts_format_version)
+@pytest.mark.parametrize(
+ ("format_version", "expected_format_version"),
+ [(1, 1), (2, 2), ("v1", 1), ("v2", 2)],
+)
+def test_describe_indices(tmp_path, format_version, expected_format_version):
data = pa.table(
{
"id": range(100),
@@ -4820,7 +4822,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
}
)
ds = lance.write_dataset(data, tmp_path)
- ds.create_scalar_index("text", index_type="INVERTED")
+ ds.create_scalar_index("text", index_type="INVERTED", format_version=format_version)
indices = ds.describe_indices()
assert len(indices) == 1
@@ -4834,7 +4836,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
assert indices[0].segments[0].uuid is not None
assert indices[0].segments[0].fragment_ids == {0}
assert indices[0].segments[0].dataset_version_at_last_update == 1
- assert indices[0].segments[0].index_version == int(fts_format_version)
+ assert indices[0].segments[0].index_version == expected_format_version
assert indices[0].segments[0].created_at is not None
assert isinstance(indices[0].segments[0].created_at, datetime)
assert indices[0].segments[0].size_bytes is not None
@@ -4933,6 +4935,25 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
assert index.num_rows_indexed == 50
+def test_create_inverted_index_defaults_to_v2_and_ignores_env(tmp_path, monkeypatch):
+ monkeypatch.setenv("LANCE_FTS_FORMAT_VERSION", "1")
+ data = pa.table({"text": ["document about lance database"]})
+ ds = lance.write_dataset(data, tmp_path)
+
+ ds.create_scalar_index("text", index_type="INVERTED")
+
+ indices = ds.describe_indices()
+ assert indices[0].segments[0].index_version == 2
+
+
+def test_create_inverted_index_rejects_invalid_format_version(tmp_path):
+ data = pa.table({"text": ["document about lance database"]})
+ ds = lance.write_dataset(data, tmp_path)
+
+ with pytest.raises(ValueError, match="unsupported FTS format version"):
+ ds.create_scalar_index("text", index_type="INVERTED", format_version="v3")
+
+
def test_vector_filter_fts_search(tmp_path):
# Create test data
ids = list(range(1, 301))
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
index 31eaa96a654..f16ae198d72 100644
--- a/python/src/dataset.rs
+++ b/python/src/dataset.rs
@@ -76,6 +76,7 @@ use lance_index::{
FtsPrewarmOptions, IndexParams, IndexType, PrewarmOptions,
optimize::OptimizeOptions,
progress::{IndexBuildProgress, NoopIndexBuildProgress},
+ scalar::inverted::InvertedListFormatVersion,
scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams},
vector::{
ApproxMode, DEFAULT_QUERY_PARALLELISM, Query as VectorQuery,
@@ -2298,6 +2299,23 @@ impl Dataset {
if let Some(num_workers) = kwargs.get_item("num_workers")? {
params = params.num_workers(num_workers.extract()?);
}
+ if let Some(format_version) = kwargs.get_item("format_version")?
+ && !format_version.is_none()
+ {
+ let value = if let Ok(value) = format_version.cast::() {
+ value.to_string_lossy().to_string()
+ } else if let Ok(value) = format_version.extract::() {
+ value.to_string()
+ } else {
+ return Err(PyValueError::new_err(
+ "format_version must be 1, 2, 'v1', or 'v2'",
+ ));
+ };
+ let format_version = value
+ .parse::()
+ .map_err(|err| PyValueError::new_err(err.to_string()))?;
+ params = params.format_version(format_version);
+ }
}
Box::new(params)
}
diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs
index d0bb0e40d3a..d4e3bfdd400 100644
--- a/rust/lance-index/src/scalar/inverted.rs
+++ b/rust/lance-index/src/scalar/inverted.rs
@@ -147,6 +147,7 @@ impl InvertedIndexPlugin {
}
});
+ let format_version = params.resolved_format_version();
let details = pbold::InvertedIndexDetails::try_from(¶ms)?;
let mut inverted_index =
InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask)
@@ -154,7 +155,7 @@ impl InvertedIndexPlugin {
let files = inverted_index.update(data, index_store, None).await?;
Ok(CreatedIndex {
index_details: prost_types::Any::from_msg(&details).unwrap(),
- index_version: current_fts_format_version().index_version(),
+ index_version: format_version.index_version(),
files,
})
}
diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs
index 17cb18c5e96..332aae2e8da 100644
--- a/rust/lance-index/src/scalar/inverted/builder.rs
+++ b/rust/lance-index/src/scalar/inverted/builder.rs
@@ -246,6 +246,7 @@ impl InvertedIndexBuilder {
fragment_mask: Option,
deleted_fragments: RoaringBitmap,
) -> Self {
+ let format_version = params.resolved_format_version();
Self {
params,
partitions,
@@ -253,8 +254,8 @@ impl InvertedIndexBuilder {
src_store: store,
token_set_format,
fragment_mask,
- format_version: current_fts_format_version(),
- posting_tail_codec: current_fts_format_version().posting_tail_codec(),
+ format_version,
+ posting_tail_codec: format_version.posting_tail_codec(),
progress: noop_progress(),
deleted_fragments,
}
@@ -3217,7 +3218,8 @@ mod tests {
token_set_format,
None,
RoaringBitmap::new(),
- );
+ )
+ .with_format_version(InvertedListFormatVersion::V1);
builder.write(dest_store.as_ref()).await?;
let metadata_reader = dest_store.open_index_file(METADATA_FILE).await?;
@@ -3281,6 +3283,14 @@ mod tests {
.await?;
let index = InvertedIndex::load(src_store, None, &LanceCache::no_cache()).await?;
+ let derived_params = index.derive_index_params()?;
+ let derived_params: InvertedIndexParams =
+ serde_json::from_str(derived_params.params.as_deref().unwrap())?;
+ assert_eq!(
+ derived_params.format_version,
+ Some(InvertedListFormatVersion::V1)
+ );
+
let schema = Arc::new(Schema::new(vec![
Field::new("doc", DataType::Utf8, true),
Field::new(ROW_ID, DataType::UInt64, false),
diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs
index c23dc1c4e78..4dcfee03587 100644
--- a/rust/lance-index/src/scalar/inverted/index.rs
+++ b/rust/lance-index/src/scalar/inverted/index.rs
@@ -133,15 +133,21 @@ pub static FTS_SCHEMA: LazyLock =
static ROW_ID_SCHEMA: LazyLock =
LazyLock::new(|| Arc::new(Schema::new(vec![ROW_ID_FIELD.clone()])));
-fn resolve_fts_format_version(
+pub fn resolve_fts_format_version(
value: Option<&str>,
) -> std::result::Result {
- value.unwrap_or("1").parse()
+ match value {
+ Some(value) => value.parse(),
+ None => Ok(default_fts_format_version()),
+ }
+}
+
+pub fn default_fts_format_version() -> InvertedListFormatVersion {
+ InvertedListFormatVersion::V2
}
pub fn current_fts_format_version() -> InvertedListFormatVersion {
- resolve_fts_format_version(std::env::var("LANCE_FTS_FORMAT_VERSION").ok().as_deref())
- .expect("failed to parse LANCE_FTS_FORMAT_VERSION")
+ default_fts_format_version()
}
pub fn max_supported_fts_format_version() -> InvertedListFormatVersion {
@@ -150,8 +156,8 @@ pub fn max_supported_fts_format_version() -> InvertedListFormatVersion {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum InvertedListFormatVersion {
- #[default]
V1,
+ #[default]
V2,
}
@@ -418,6 +424,7 @@ pub struct InvertedIndex {
store: Arc,
tokenizer: Box,
token_set_format: TokenSetFormat,
+ format_version: InvertedListFormatVersion,
pub(crate) partitions: Vec>,
corpus_stats: Arc>,
// Fragments which are contained in the index, but no longer in the dataset.
@@ -430,6 +437,7 @@ impl Debug for InvertedIndex {
f.debug_struct("InvertedIndex")
.field("params", &self.params)
.field("token_set_format", &self.token_set_format)
+ .field("format_version", &self.format_version)
.field("partitions", &self.partitions)
.field("deleted_fragments", &self.deleted_fragments)
.finish()
@@ -473,14 +481,7 @@ async fn resolve_deferred_candidates(
impl InvertedIndex {
fn format_version(&self) -> InvertedListFormatVersion {
- self.partitions
- .first()
- .map(|partition| {
- InvertedListFormatVersion::from_posting_tail_codec(
- partition.inverted_list.posting_tail_codec(),
- )
- })
- .unwrap_or_else(current_fts_format_version)
+ self.format_version
}
fn index_version(&self) -> u32 {
@@ -995,6 +996,7 @@ impl InvertedIndex {
store: store.clone(),
tokenizer,
token_set_format: TokenSetFormat::Arrow,
+ format_version: InvertedListFormatVersion::V1,
partitions: vec![Arc::new(InvertedPartition {
id: 0,
store,
@@ -1045,6 +1047,7 @@ impl InvertedIndex {
.map(|name| TokenSetFormat::from_str(name))
.transpose()?
.unwrap_or(TokenSetFormat::Arrow);
+ let format_version = parse_format_version_from_metadata(&reader.schema().metadata)?;
// Load deleted_fragments if present (optional for backward compatibility)
let deleted_fragments = if reader.num_rows() > 0 {
@@ -1090,6 +1093,7 @@ impl InvertedIndex {
store,
tokenizer,
token_set_format,
+ format_version,
partitions,
corpus_stats: Arc::new(OnceCell::new()),
deleted_fragments,
@@ -1324,8 +1328,9 @@ impl ScalarIndex for InvertedIndex {
// Empty tokenizer metadata only appears in legacy simple-tokenizer indexes.
params.base_tokenizer = "simple".to_string();
}
+ params = params.format_version(self.format_version());
- let params_json = serde_json::to_string(¶ms)?;
+ let params_json = params.to_training_json()?.to_string();
Ok(ScalarIndexParams {
index_type: BuiltinIndexType::Inverted.as_str().to_string(),
@@ -5964,10 +5969,10 @@ mod tests {
}
#[test]
- fn test_resolve_fts_format_version_defaults_to_v1() {
+ fn test_resolve_fts_format_version_defaults_to_v2() {
assert_eq!(
resolve_fts_format_version(None).unwrap(),
- InvertedListFormatVersion::V1
+ InvertedListFormatVersion::V2
);
assert_eq!(
resolve_fts_format_version(Some("2")).unwrap(),
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 5a2a701dc73..c4abb1ce816 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -22,6 +22,9 @@ use crate::pbold;
use crate::scalar::inverted::tokenizer::document_tokenizer::{
JsonTokenizer, LanceTokenizer, TextTokenizer,
};
+use crate::scalar::inverted::{
+ InvertedListFormatVersion, default_fts_format_version, resolve_fts_format_version,
+};
pub use lance_tokenizer::Language;
use lance_tokenizer::{
AsciiFoldingFilter, IcuTokenizer, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
@@ -119,6 +122,18 @@ pub struct InvertedIndexParams {
/// The effective worker count is clamped to `[1, num_cpus - 2]`.
#[serde(rename = "num_workers", skip_serializing, default)]
pub(crate) num_workers: Option,
+
+ /// On-disk FTS format version to write when creating a new index.
+ ///
+ /// This is a build-time only parameter and is not persisted with the index.
+ /// If unset, Lance writes the current default FTS format.
+ #[serde(
+ rename = "format_version",
+ skip_serializing,
+ default,
+ deserialize_with = "deserialize_format_version"
+ )]
+ pub(crate) format_version: Option,
}
impl TryFrom<&InvertedIndexParams> for pbold::InvertedIndexDetails {
@@ -166,6 +181,7 @@ impl TryFrom<&pbold::InvertedIndexDetails> for InvertedIndexParams {
prefix_only: details.prefix_only,
memory_limit_mb: defaults.memory_limit_mb,
num_workers: defaults.num_workers,
+ format_version: defaults.format_version,
})
}
}
@@ -182,6 +198,37 @@ fn default_max_ngram_length() -> u32 {
3
}
+fn deserialize_format_version<'de, D>(
+ deserializer: D,
+) -> std::result::Result