From 42bdf6a4f3edfb4b76197fc1af90d5e799b94b93 Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Thu, 23 Apr 2026 15:29:46 -0400
Subject: [PATCH 1/6] Raise a clearer error before BM25 retrieval without
 build_index

---
 src/raghilda/_duckdb_store.py | 20 ++++++++++++++++++++
 tests/test_store.py           |  8 ++++++++
 2 files changed, 28 insertions(+)

diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index 7a6aabd..69b3ebc 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -1011,6 +1011,7 @@ def retrieve_bm25(
         """
 
         with self._db_lock:
+            self._require_bm25_index()
             result = self.con.execute(
                 sql,
                 {
@@ -1048,6 +1049,25 @@ def retrieve_bm25(
 
         return output
 
+    def _require_bm25_index(self) -> None:
+        row = self.con.execute(
+            """
+            SELECT EXISTS (
+                SELECT 1
+                FROM duckdb_functions()
+                WHERE schema_name = 'fts_main_chunks'
+                    AND function_name = 'match_bm25'
+            )
+            """
+        ).fetchone()
+        assert row is not None
+        if not row[0]:
+            raise RuntimeError(
+                "DuckDBStore retrieval requires a BM25 index. "
+                'Call `store.build_index("bm25")` or `store.build_index()` '
+                "before calling `retrieve_bm25()` or `retrieve()`."
+            )
+
     def build_index(
         self,
         type: Optional[IndexType | str | list[IndexType | str]] = None,
diff --git a/tests/test_store.py b/tests/test_store.py
index 77301b2..2b71d5b 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -603,6 +603,10 @@ def test_retrieve_bm25(self, store_with_docs):
             assert isinstance(chunk, RetrievedDuckDBMarkdownChunk)
             assert chunk.text is not None
 
+    def test_retrieve_bm25_requires_build_index(self, store_with_docs):
+        with pytest.raises(RuntimeError, match="build_index"):
+            store_with_docs.retrieve_bm25("document", top_k=3)
+
     def test_retrieve_bm25_returns_document_slice_for_non_zero_start(self, store):
         # Guard against 0-based/1-based off-by-one slicing errors for non-zero starts.
         doc = MarkdownDocument(origin="bm25-text-source", content="alphabetagamma")
@@ -641,6 +645,10 @@ def test_retrieve(self, store_with_docs):
             assert isinstance(chunk, RetrievedDuckDBMarkdownChunk)
             assert chunk.text is not None
 
+    def test_retrieve_requires_build_index(self, store_with_docs):
+        with pytest.raises(RuntimeError, match="build_index"):
+            store_with_docs.retrieve("document", top_k=3, deoverlap=False)
+
     def test_retrieve_with_deoverlap(self, store):
         # Create a document with overlapping chunks
         # "hello world test document" = 24 chars

From 82204d4bc7f983b9e95b7c7f4b91e52a316c3195 Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Thu, 23 Apr 2026 15:30:39 -0400
Subject: [PATCH 2/6] Add the missing build_index call to DuckDB examples

---
 README.md                     | 3 +++
 src/raghilda/_duckdb_store.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 7941f69..ffc6f64 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,9 @@ for link in links:
     chunked_document = chunker.chunk(document)
     store.upsert(chunked_document)
 
+# Build indexes before retrieval
+store.build_index()
+
 # Retrieve relevant chunks
 chunks = store.retrieve("How do I stream a response?", top_k=5)
 for chunk in chunks:
diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index 69b3ebc..fc53c5b 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -179,6 +179,9 @@ class DuckDBStore(BaseStore):
     )
     store.upsert(MarkdownChunker().chunk(doc))
 
+    # Build indexes before retrieval
+    store.build_index()
+
     # Retrieve similar chunks
     chunks = store.retrieve("How do I use this?", top_k=5)
     ```

From 57939dd26aab01485f5c82a539469adaebe762bc Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Thu, 23 Apr 2026 15:57:10 -0400
Subject: [PATCH 3/6] Cache BM25 index state off the retrieval hot path

---
 src/raghilda/_duckdb_store.py | 30 ++++++++++++++++++------------
 tests/test_store.py           | 22 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index fc53c5b..854fbc0 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -404,6 +404,7 @@ def __init__(
             require_embedding=self.metadata.embed is not None,
         )
         self._db_lock = threading.Lock()
+        self._has_bm25_index = _has_bm25_index(self.con)
 
     def upsert(
         self,
@@ -1053,18 +1054,7 @@ def retrieve_bm25(
         return output
 
     def _require_bm25_index(self) -> None:
-        row = self.con.execute(
-            """
-            SELECT EXISTS (
-                SELECT 1
-                FROM duckdb_functions()
-                WHERE schema_name = 'fts_main_chunks'
-                    AND function_name = 'match_bm25'
-            )
-            """
-        ).fetchone()
-        assert row is not None
-        if not row[0]:
+        if not self._has_bm25_index:
             raise RuntimeError(
                 "DuckDBStore retrieval requires a BM25 index. "
                 'Call `store.build_index("bm25")` or `store.build_index()` '
@@ -1098,6 +1088,7 @@ def build_index(
                 self.con.begin()
                 self._create_fts_index()
                 self.con.commit()
+                self._has_bm25_index = True
             except Exception as e:
                 self.con.rollback()
                 raise e
@@ -1230,6 +1221,21 @@ def _load_extensions_for_existing_indexes(con: duckdb.DuckDBPyConnection) -> Non
         con.execute("INSTALL vss; LOAD vss;")
 
 
+def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool:
+    row = con.execute(
+        """
+        SELECT EXISTS (
+            SELECT 1
+            FROM duckdb_functions()
+            WHERE schema_name = 'fts_main_chunks'
+                AND function_name = 'match_bm25'
+        )
+        """
+    ).fetchone()
+    assert row is not None
+    return bool(row[0])
+
+
 def _validate_required_schema(
     con: duckdb.DuckDBPyConnection,
     *,
diff --git a/tests/test_store.py b/tests/test_store.py
index 2b71d5b..46aa67d 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -2977,6 +2977,28 @@ def test_connect(tmp_path):
     assert results[0].text == "hello"
 
 
+def test_connect_restores_bm25_index_state(tmp_path):
+    db_path = tmp_path / "test_bm25.db"
+
+    store = DuckDBStore.create(
+        location=str(db_path),
+        embed=None,
+        name="connect_bm25_test",
+        title="Connect BM25 Test Store",
+    )
+    doc = MarkdownDocument(origin="test", content="hello world")
+    doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)])
+    store.upsert(doc)
+    store.build_index("bm25")
+    store.con.close()
+
+    store2 = DuckDBStore.connect(str(db_path))
+    results = store2.retrieve_bm25("hello", top_k=1)
+
+    assert len(results) == 1
+    assert results[0].text == "hello"
+
+
 def test_upsert_after_hnsw_index_on_reconnect(tmp_path):
     """Upserting after reconnecting to a DB with HNSW indexes must work.
 

From 5281f9c622be9a4ecfa77745e8b669603f9ca74e Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Thu, 23 Apr 2026 16:32:05 -0400
Subject: [PATCH 4/6] Invalidate cached BM25 index state after writes

---
 src/raghilda/_duckdb_store.py |  8 ++++++--
 tests/test_store.py           | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index 854fbc0..250ee10 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -513,6 +513,9 @@ def upsert(
                     _duckdb_append(self.con, "documents", [doc_row])
                 _duckdb_append(self.con, "embeddings", chunk_rows)
                 self.con.commit()
+                # DuckDB FTS materializes BM25 state in side tables and does not
+                # refresh it after writes, while HNSW indexes are maintained.
+                self._has_bm25_index = False
             except Exception:
                 try:
                     self.con.rollback()
@@ -1056,9 +1059,10 @@ def retrieve_bm25(
     def _require_bm25_index(self) -> None:
         if not self._has_bm25_index:
             raise RuntimeError(
-                "DuckDBStore retrieval requires a BM25 index. "
+                "DuckDBStore retrieval requires a current BM25 index. "
                 'Call `store.build_index("bm25")` or `store.build_index()` '
-                "before calling `retrieve_bm25()` or `retrieve()`."
+                "after inserting or updating documents and before calling "
+                "`retrieve_bm25()` or `retrieve()`."
             )
 
     def build_index(
diff --git a/tests/test_store.py b/tests/test_store.py
index 46aa67d..934f7b3 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -607,6 +607,23 @@ def test_retrieve_bm25_requires_build_index(self, store_with_docs):
         with pytest.raises(RuntimeError, match="build_index"):
             store_with_docs.retrieve_bm25("document", top_k=3)
 
+    def test_retrieve_bm25_requires_rebuild_after_upsert(self, store):
+        doc1 = MarkdownDocument(origin="bm25-doc-1", content="alpha beta")
+        doc1 = doc1.to_chunked(
+            [_get_markdown_chunk(doc1, start=0, end=len(doc1.content))]
+        )
+        store.upsert(doc1)
+        store.build_index("bm25")
+
+        doc2 = MarkdownDocument(origin="bm25-doc-2", content="gamma delta")
+        doc2 = doc2.to_chunked(
+            [_get_markdown_chunk(doc2, start=0, end=len(doc2.content))]
+        )
+        store.upsert(doc2)
+
+        with pytest.raises(RuntimeError, match="build_index"):
+            store.retrieve_bm25("gamma", top_k=3)
+
     def test_retrieve_bm25_returns_document_slice_for_non_zero_start(self, store):
         # Guard against 0-based/1-based off-by-one slicing errors for non-zero starts.
         doc = MarkdownDocument(origin="bm25-text-source", content="alphabetagamma")

From 91821d5d3788f042842cfa2fabdca6500b1d7b2f Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Fri, 24 Apr 2026 07:44:06 -0400
Subject: [PATCH 5/6] Track BM25 freshness for DuckDB stores

---
 src/raghilda/_duckdb_store.py | 104 ++++++++++-----
 tests/test_store.py           | 238 +++++++++++++++++++++++++++++++---
 2 files changed, 295 insertions(+), 47 deletions(-)

diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index 250ee10..46d9ce6 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -57,7 +57,6 @@
     "context",
 }
 
-
 @dataclass(repr=False)
 class DuckDBMarkdownChunk(MarkdownChunk):
     """MarkdownChunk with DuckDB-specific fields for database storage"""
@@ -329,7 +328,8 @@ def create(
             name VARCHAR,
             title VARCHAR,
             embed_config VARCHAR,
-            attributes_schema_json VARCHAR
+            attributes_schema_json VARCHAR,
+            bm25_index_is_current BOOLEAN DEFAULT FALSE
         );
 
         CREATE OR REPLACE TABLE documents (
@@ -370,14 +370,16 @@ def create(
                 name,
                 title,
                 embed_config,
-                attributes_schema_json
-            ) VALUES (?, ?, ?, ?)
+                attributes_schema_json,
+                bm25_index_is_current
+            ) VALUES (?, ?, ?, ?, ?)
             """,
             [
                 name,
                 title,
                 embed_config_json,
                 attributes_schema_json,
+                False,
             ],
         )
 
@@ -403,8 +405,11 @@ def __init__(
             attributes_schema=self.metadata.attributes_schema,
             require_embedding=self.metadata.embed is not None,
         )
-        self._db_lock = threading.Lock()
-        self._has_bm25_index = _has_bm25_index(self.con)
+        self._db_lock = threading.RLock()
+        # Best-effort BM25 state for this handle. We intentionally avoid a
+        # metadata read on every retrieval; multiple live stores per DB file
+        # are unsupported and are not kept in sync.
+        self._has_bm25_index = _read_bm25_index_state(self.con)
 
     def upsert(
         self,
@@ -512,6 +517,7 @@ def upsert(
                 else:
                     _duckdb_append(self.con, "documents", [doc_row])
                 _duckdb_append(self.con, "embeddings", chunk_rows)
+                _set_bm25_index_state(self.con, False)
                 self.con.commit()
                 # DuckDB FTS materializes BM25 state in side tables and does not
                 # refresh it after writes, while HNSW indexes are maintained.
@@ -996,12 +1002,12 @@ def retrieve_bm25(
         sql = f"""
         WITH ranked AS (
             SELECT
-                e.chunk_id, 
+                e.chunk_id,
                 doc.origin AS origin,
-                e.start_index, 
-                e.end_index, 
+                e.start_index,
+                e.end_index,
                 e.char_count,
-                e.context, 
+                e.context,
                 {attribute_select}
                 doc.text[e.start_index + 1:e.end_index] AS text,
                 'bm25' AS metric_name,
@@ -1058,9 +1064,12 @@ def retrieve_bm25(
 
     def _require_bm25_index(self) -> None:
         if not self._has_bm25_index:
+            rebuild_hint = 'Call `store.build_index("bm25")`'
+            if self.metadata.embed is not None:
+                rebuild_hint += " or `store.build_index()`"
             raise RuntimeError(
                 "DuckDBStore retrieval requires a current BM25 index. "
-                'Call `store.build_index("bm25")` or `store.build_index()` '
+                f"{rebuild_hint} "
                 "after inserting or updating documents and before calling "
                 "`retrieve_bm25()` or `retrieve()`."
             )
@@ -1086,26 +1095,28 @@ def build_index(
         else:
             index_types = [_coerce_index_type(item) for item in type]
 
-        if IndexType.BM25 in index_types:
-            self.con.execute("INSTALL FTS; LOAD FTS;")
-            try:
-                self.con.begin()
-                self._create_fts_index()
-                self.con.commit()
-                self._has_bm25_index = True
-            except Exception as e:
-                self.con.rollback()
-                raise e
+        with self._db_lock:
+            if IndexType.BM25 in index_types:
+                self.con.execute("INSTALL FTS; LOAD FTS;")
+                try:
+                    self.con.begin()
+                    self._create_fts_index()
+                    _set_bm25_index_state(self.con, True)
+                    self.con.commit()
+                    self._has_bm25_index = True
+                except Exception as e:
+                    self.con.rollback()
+                    raise e
 
-        if IndexType.HNSW in index_types:
-            self.con.execute("INSTALL vss; LOAD vss;")
-            try:
-                self.con.begin()
-                self._create_hnsw_index()
-                self.con.commit()
-            except Exception as e:
-                self.con.rollback()
-                raise e
+            if IndexType.HNSW in index_types:
+                self.con.execute("INSTALL vss; LOAD vss;")
+                try:
+                    self.con.begin()
+                    self._create_hnsw_index()
+                    self.con.commit()
+                except Exception as e:
+                    self.con.rollback()
+                    raise e
 
     def _create_fts_index(self):
         self.con.execute(
@@ -1225,7 +1236,30 @@ def _load_extensions_for_existing_indexes(con: duckdb.DuckDBPyConnection) -> Non
         con.execute("INSTALL vss; LOAD vss;")
 
 
-def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool:
+def _ensure_bm25_index_state_column(
+    con: duckdb.DuckDBPyConnection,
+) -> None:
+    if "bm25_index_is_current" in _table_columns(con, table="metadata"):
+        return
+    con.execute(
+        "ALTER TABLE metadata ADD COLUMN bm25_index_is_current BOOLEAN DEFAULT FALSE"
+    )
+
+
+def _read_bm25_index_state(con: duckdb.DuckDBPyConnection) -> bool:
+    if "bm25_index_is_current" not in _table_columns(con, table="metadata"):
+        # NOTE: legacy stores predate explicit BM25 freshness tracking, so
+        # the column is missing. For backward compatibility we trust any
+        # existing FTS index until the first upsert() or BM25 build_index()
+        # adds the column via _set_bm25_index_state(). TODO: treat a missing
+        # column as "stale until rebuilt" in a future breaking release.
+        return _has_legacy_bm25_index(con)
+    row = con.execute("SELECT bm25_index_is_current FROM metadata").fetchone()
+    assert row is not None
+    return bool(row[0])
+
+
+def _has_legacy_bm25_index(con: duckdb.DuckDBPyConnection) -> bool:
     row = con.execute(
         """
         SELECT EXISTS (
@@ -1240,6 +1274,14 @@ def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool:
     return bool(row[0])
 
 
+def _set_bm25_index_state(con: duckdb.DuckDBPyConnection, is_current: bool) -> None:
+    _ensure_bm25_index_state_column(con)
+    con.execute(
+        "UPDATE metadata SET bm25_index_is_current = ?",
+        [is_current],
+    )
+
+
 def _validate_required_schema(
     con: duckdb.DuckDBPyConnection,
     *,
diff --git a/tests/test_store.py b/tests/test_store.py
index 934f7b3..d197628 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -4,6 +4,7 @@
 import subprocess
 import sys
 import textwrap
+import threading
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Annotated, Any, cast
@@ -607,6 +608,12 @@ def test_retrieve_bm25_requires_build_index(self, store_with_docs):
         with pytest.raises(RuntimeError, match="build_index"):
             store_with_docs.retrieve_bm25("document", top_k=3)
 
+    def test_retrieve_bm25_requires_build_index_without_embeddings(self, store):
+        with pytest.raises(RuntimeError, match=r'store\.build_index\("bm25"\)') as exc:
+            store.retrieve_bm25("document", top_k=3)
+        # Without embeddings the hint must omit the bare `build_index()` call.
+        assert "store.build_index()" not in str(exc.value)
+
     def test_retrieve_bm25_requires_rebuild_after_upsert(self, store):
         doc1 = MarkdownDocument(origin="bm25-doc-1", content="alpha beta")
         doc1 = doc1.to_chunked(
@@ -1298,7 +1305,7 @@ def test_insert_result_document_preserves_defaulted_attributes(self):
             "priority": 0,
         }
 
-    def test_insert_snapshot_reads_are_serialized_under_db_lock(self, monkeypatch):
+    def test_upsert_replaces_document_and_preserves_attribute_defaults(self):
         store = DuckDBStore.create(
             location=":memory:",
             embed=None,
@@ -1306,18 +1313,6 @@ def test_insert_snapshot_reads_are_serialized_under_db_lock(self, monkeypatch):
             attributes={"tenant": str, "priority": (int, 0)},
         )
 
-        observed_lock_states: list[bool] = []
-        original_snapshot = store._load_document_snapshot
-
-        def wrapped_snapshot(*, origin: str, text: str):
-            observed_lock_states.append(store._db_lock.locked())
-            return original_snapshot(
-                origin=origin,
-                text=text,
-            )
-
-        monkeypatch.setattr(store, "_load_document_snapshot", wrapped_snapshot)
-
         first = MarkdownDocument(
             origin="lock-snapshot-test",
             content="alpha",
@@ -1350,10 +1345,11 @@ def wrapped_snapshot(*, origin: str, text: str):
                 )
             ]
         )
-        store.upsert(second, skip_if_unchanged=False)
+        replaced = store.upsert(second, skip_if_unchanged=False)
 
-        assert observed_lock_states
-        assert all(observed_lock_states)
+        assert replaced.action == "replaced"
+        assert replaced.document.content == "alpha beta"
+        assert replaced.document.attributes == {"tenant": "docs", "priority": 0}
 
     def test_insert_snapshot_preserves_nullable_none_attributes(self):
         store = DuckDBStore.create(
@@ -3016,6 +3012,216 @@ def test_connect_restores_bm25_index_state(tmp_path):
     assert results[0].text == "hello"
 
 
+def test_connect_requires_bm25_rebuild_after_upsert(tmp_path):
+    db_path = tmp_path / "test_bm25_reconnect_stale.db"
+
+    store = DuckDBStore.create(
+        location=str(db_path),
+        embed=None,
+        name="connect_bm25_stale_test",
+        title="Connect BM25 Stale Test Store",
+    )
+    doc1 = MarkdownDocument(origin="doc-1", content="alpha beta")
+    doc1 = doc1.to_chunked([_get_markdown_chunk(doc1, start=0, end=len(doc1.content))])
+    store.upsert(doc1)
+    store.build_index("bm25")
+
+    doc2 = MarkdownDocument(origin="doc-2", content="gamma delta")
+    doc2 = doc2.to_chunked([_get_markdown_chunk(doc2, start=0, end=len(doc2.content))])
+    store.upsert(doc2)
+    store.con.close()
+
+    reconnected = DuckDBStore.connect(str(db_path))
+
+    with pytest.raises(RuntimeError, match="build_index"):
+        reconnected.retrieve_bm25("gamma", top_k=1)
+
+
+def test_connect_migrates_legacy_bm25_index_as_current(tmp_path):
+    db_path = tmp_path / "test_bm25_legacy_migrate.db"
+
+    store = DuckDBStore.create(
+        location=str(db_path),
+        embed=None,
+        name="connect_bm25_legacy_test",
+        title="Connect BM25 Legacy Test Store",
+    )
+    doc = MarkdownDocument(origin="test", content="hello world")
+    doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)])
+    store.upsert(doc)
+    store.build_index("bm25")
+    store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current")
+    store.con.close()
+
+    reconnected = DuckDBStore.connect(str(db_path))
+
+    results = reconnected.retrieve_bm25("hello", top_k=1)
+    assert len(results) == 1
+    assert results[0].text == "hello"
+
+    combined = reconnected.retrieve("hello", top_k=1)
+    assert len(combined) == 1
+    assert combined[0].text == "hello"
+
+
+def test_connect_read_only_detects_legacy_bm25_index(tmp_path):
+    db_path = tmp_path / "test_bm25_legacy_read_only.db"
+
+    store = DuckDBStore.create(
+        location=str(db_path),
+        embed=None,
+        name="connect_bm25_legacy_read_only_test",
+        title="Connect BM25 Legacy Read Only Test Store",
+    )
+    doc = MarkdownDocument(origin="test", content="hello world")
+    doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)])
+    store.upsert(doc)
+    store.build_index("bm25")
+    store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current")
+    store.con.close()
+
+    reconnected = DuckDBStore.connect(str(db_path), read_only=True)
+
+    results = reconnected.retrieve_bm25("hello", top_k=1)
+    assert len(results) == 1
+    assert results[0].text == "hello"
+
+    combined = reconnected.retrieve("hello", top_k=1)
+    assert len(combined) == 1
+    assert combined[0].text == "hello"
+
+
+def test_upsert_on_legacy_store_starts_tracking_bm25_freshness(tmp_path):
+    db_path = tmp_path / "test_bm25_legacy_upgrade_write.db"
+
+    store = DuckDBStore.create(
+        location=str(db_path),
+        embed=None,
+        name="connect_bm25_legacy_upgrade_write_test",
+        title="Connect BM25 Legacy Upgrade Write Test Store",
+    )
+    doc1 = MarkdownDocument(origin="doc-1", content="alpha beta")
+    doc1 = doc1.to_chunked([_get_markdown_chunk(doc1, start=0, end=len(doc1.content))])
+    store.upsert(doc1)
+    store.build_index("bm25")
+    store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current")
+    store.con.close()
+
+    upgraded = DuckDBStore.connect(str(db_path))
+    doc2 = MarkdownDocument(origin="doc-2", content="gamma delta")
+    doc2 = doc2.to_chunked([_get_markdown_chunk(doc2, start=0, end=len(doc2.content))])
+    upgraded.upsert(doc2)
+    upgraded.con.close()
+
+    reconnected = DuckDBStore.connect(str(db_path))
+
+    with pytest.raises(RuntimeError, match="build_index"):
+        reconnected.retrieve_bm25("gamma", top_k=1)
+
+
+def test_build_index_waits_for_db_lock():
+    """build_index() must block while another thread holds the store lock."""
+    store = DuckDBStore.create(
+        location=":memory:",
+        embed=None,
+        overwrite=True,
+        name="build_index_lock_test",
+    )
+    doc = MarkdownDocument(origin="test", content="hello world")
+    doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)])
+    store.upsert(doc)
+
+    attempted = threading.Event()
+    thread_error: list[BaseException] = []
+
+    def build_index():
+        attempted.set()
+        try:
+            store.build_index("bm25")
+        except BaseException as exc:
+            thread_error.append(exc)
+
+    thread = threading.Thread(target=build_index)
+    # `with` guarantees release if an assert fails, so the worker cannot hang.
+    with store._db_lock:
+        thread.start()
+        assert attempted.wait(timeout=1)
+        thread.join(timeout=0.1)
+        assert thread.is_alive()
+
+    thread.join(timeout=5)
+
+    assert not thread.is_alive()
+    assert thread_error == []
+
+
+def test_retrieve_waits_for_in_progress_bm25_build(monkeypatch):
+    store = DuckDBStore.create(
+        location=":memory:",
+        embed=None,
+        overwrite=True,
+        name="retrieve_waits_for_bm25_build_test",
+    )
+    doc = MarkdownDocument(origin="test", content="hello world")
+    doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)])
+    store.upsert(doc)
+
+    build_started = threading.Event()
+    allow_build_to_finish = threading.Event()
+
+    original_create_fts_index = store._create_fts_index
+
+    def blocking_create_fts_index():
+        build_started.set()
+        assert allow_build_to_finish.wait(timeout=5)
+        original_create_fts_index()
+
+    monkeypatch.setattr(store, "_create_fts_index", blocking_create_fts_index)
+
+    thread_error: list[BaseException] = []
+
+    def build_index():
+        try:
+            store.build_index("bm25")
+        except BaseException as exc:
+            thread_error.append(exc)
+
+    thread = threading.Thread(target=build_index)
+    thread.start()
+    assert build_started.wait(timeout=1)
+
+    retrieve_started = threading.Event()
+    retrieve_finished = threading.Event()
+    retrieved_results: list[RetrievedDuckDBMarkdownChunk] = []
+    retrieve_error: list[BaseException] = []
+
+    def retrieve():
+        retrieve_started.set()
+        try:
+            results = store.retrieve("hello", top_k=1)
+            retrieved_results.extend(results)
+        except BaseException as exc:
+            retrieve_error.append(exc)
+        finally:
+            retrieve_finished.set()
+
+    retrieve_thread = threading.Thread(target=retrieve)
+    retrieve_thread.start()
+    assert retrieve_started.wait(timeout=1)
+    retrieve_thread.join(timeout=0.1)
+    assert retrieve_thread.is_alive()
+
+    allow_build_to_finish.set()
+    thread.join(timeout=5)
+    retrieve_thread.join(timeout=5)
+
+    assert thread_error == []
+    assert retrieve_error == []
+    assert retrieve_finished.is_set()
+    assert len(retrieved_results) == 1
+    assert retrieved_results[0].text == "hello"
+
+
 def test_upsert_after_hnsw_index_on_reconnect(tmp_path):
     """Upserting after reconnecting to a DB with HNSW indexes must work.
 

From 15220a2e0271266d6ec5563e3e3b1a1204e5f937 Mon Sep 17 00:00:00 2001
From: Tomasz Kalinowski <kalinowskit@gmail.com>
Date: Fri, 24 Apr 2026 08:03:47 -0400
Subject: [PATCH 6/6] Format DuckDB store and document formatter step

---
 AGENTS.md                     | 1 +
 src/raghilda/_duckdb_store.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index eaaa211..775006f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -15,6 +15,7 @@
 - Docs are built with Quarto, so use it's syntax in docstrings.
 
 ## Must-Run Commands Before Hand-off
+- Before committing or pushing Python changes, run `./.venv/bin/task format`; CI runs `format_check` and will fail on unformatted files.
 - `./.venv/bin/task check`
 - `./.venv/bin/task tests`
 - For docs changes, also `./.venv/bin/task docs_build`
diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py
index 46d9ce6..dc4bce5 100644
--- a/src/raghilda/_duckdb_store.py
+++ b/src/raghilda/_duckdb_store.py
@@ -57,6 +57,7 @@
     "context",
 }
 
+
 @dataclass(repr=False)
 class DuckDBMarkdownChunk(MarkdownChunk):
     """MarkdownChunk with DuckDB-specific fields for database storage"""