From 42bdf6a4f3edfb4b76197fc1af90d5e799b94b93 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 23 Apr 2026 15:29:46 -0400 Subject: [PATCH 1/6] Raise a clearer error before BM25 retrieval without build_index --- src/raghilda/_duckdb_store.py | 20 ++++++++++++++++++++ tests/test_store.py | 8 ++++++++ 2 files changed, 28 insertions(+) diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index 7a6aabd..69b3ebc 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -1011,6 +1011,7 @@ def retrieve_bm25( """ with self._db_lock: + self._require_bm25_index() result = self.con.execute( sql, { @@ -1048,6 +1049,25 @@ def retrieve_bm25( return output + def _require_bm25_index(self) -> None: + row = self.con.execute( + """ + SELECT EXISTS ( + SELECT 1 + FROM duckdb_functions() + WHERE schema_name = 'fts_main_chunks' + AND function_name = 'match_bm25' + ) + """ + ).fetchone() + assert row is not None + if not row[0]: + raise RuntimeError( + "DuckDBStore retrieval requires a BM25 index. " + 'Call `store.build_index("bm25")` or `store.build_index()` ' + "before calling `retrieve_bm25()` or `retrieve()`." + ) + def build_index( self, type: Optional[IndexType | str | list[IndexType | str]] = None, diff --git a/tests/test_store.py b/tests/test_store.py index 77301b2..2b71d5b 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -603,6 +603,10 @@ def test_retrieve_bm25(self, store_with_docs): assert isinstance(chunk, RetrievedDuckDBMarkdownChunk) assert chunk.text is not None + def test_retrieve_bm25_requires_build_index(self, store_with_docs): + with pytest.raises(RuntimeError, match="build_index"): + store_with_docs.retrieve_bm25("document", top_k=3) + def test_retrieve_bm25_returns_document_slice_for_non_zero_start(self, store): # Guard against 0-based/1-based off-by-one slicing errors for non-zero starts. doc = MarkdownDocument(origin="bm25-text-source", content="alphabetagamma") @@ -641,6 +645,10 @@ def test_retrieve(self, store_with_docs): assert isinstance(chunk, RetrievedDuckDBMarkdownChunk) assert chunk.text is not None + def test_retrieve_requires_build_index(self, store_with_docs): + with pytest.raises(RuntimeError, match="build_index"): + store_with_docs.retrieve("document", top_k=3, deoverlap=False) + def test_retrieve_with_deoverlap(self, store): # Create a document with overlapping chunks # "hello world test document" = 24 chars From 82204d4bc7f983b9e95b7c7f4b91e52a316c3195 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 23 Apr 2026 15:30:39 -0400 Subject: [PATCH 2/6] Add the missing build_index call to DuckDB examples --- README.md | 3 +++ src/raghilda/_duckdb_store.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/README.md b/README.md index 7941f69..ffc6f64 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,9 @@ for link in links: chunked_document = chunker.chunk(document) store.upsert(chunked_document) +# Build indexes before retrieval +store.build_index() + # Retrieve relevant chunks chunks = store.retrieve("How do I stream a response?", top_k=5) for chunk in chunks: diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index 69b3ebc..fc53c5b 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -179,6 +179,9 @@ class DuckDBStore(BaseStore): ) store.upsert(MarkdownChunker().chunk(doc)) + # Build indexes before retrieval + store.build_index() + # Retrieve similar chunks chunks = store.retrieve("How do I use this?", top_k=5) ``` From 57939dd26aab01485f5c82a539469adaebe762bc Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 23 Apr 2026 15:57:10 -0400 Subject: [PATCH 3/6] Cache BM25 index state off the retrieval hot path --- src/raghilda/_duckdb_store.py | 30 ++++++++++++++++++------------ tests/test_store.py | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index fc53c5b..854fbc0 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -404,6 +404,7 @@ def __init__( require_embedding=self.metadata.embed is not None, ) self._db_lock = threading.Lock() + self._has_bm25_index = _has_bm25_index(self.con) def upsert( self, @@ -1053,18 +1054,7 @@ def retrieve_bm25( return output def _require_bm25_index(self) -> None: - row = self.con.execute( - """ - SELECT EXISTS ( - SELECT 1 - FROM duckdb_functions() - WHERE schema_name = 'fts_main_chunks' - AND function_name = 'match_bm25' - ) - """ - ).fetchone() - assert row is not None - if not row[0]: + if not self._has_bm25_index: raise RuntimeError( "DuckDBStore retrieval requires a BM25 index. " 'Call `store.build_index("bm25")` or `store.build_index()` ' @@ -1098,6 +1088,7 @@ def build_index( self.con.begin() self._create_fts_index() self.con.commit() + self._has_bm25_index = True except Exception as e: self.con.rollback() raise e @@ -1230,6 +1221,21 @@ def _load_extensions_for_existing_indexes(con: duckdb.DuckDBPyConnection) -> Non con.execute("INSTALL vss; LOAD vss;") +def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool: + row = con.execute( + """ + SELECT EXISTS ( + SELECT 1 + FROM duckdb_functions() + WHERE schema_name = 'fts_main_chunks' + AND function_name = 'match_bm25' + ) + """ + ).fetchone() + assert row is not None + return bool(row[0]) + + def _validate_required_schema( con: duckdb.DuckDBPyConnection, *, diff --git a/tests/test_store.py b/tests/test_store.py index 2b71d5b..46aa67d 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -2977,6 +2977,28 @@ def test_connect(tmp_path): assert results[0].text == "hello" +def test_connect_restores_bm25_index_state(tmp_path): + db_path = tmp_path / "test_bm25.db" + + store = DuckDBStore.create( + location=str(db_path), + embed=None, + name="connect_bm25_test", + title="Connect BM25 Test Store", + ) + doc = MarkdownDocument(origin="test", content="hello world") + doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)]) + store.upsert(doc) + store.build_index("bm25") + store.con.close() + + store2 = DuckDBStore.connect(str(db_path)) + results = store2.retrieve_bm25("hello", top_k=1) + + assert len(results) == 1 + assert results[0].text == "hello" + + def test_upsert_after_hnsw_index_on_reconnect(tmp_path): """Upserting after reconnecting to a DB with HNSW indexes must work. From 5281f9c622be9a4ecfa77745e8b669603f9ca74e Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 23 Apr 2026 16:32:05 -0400 Subject: [PATCH 4/6] Invalidate cached BM25 index state after writes --- src/raghilda/_duckdb_store.py | 8 ++++++-- tests/test_store.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index 854fbc0..250ee10 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -513,6 +513,9 @@ def upsert( _duckdb_append(self.con, "documents", [doc_row]) _duckdb_append(self.con, "embeddings", chunk_rows) self.con.commit() + # DuckDB FTS materializes BM25 state in side tables and does not + # refresh it after writes, while HNSW indexes are maintained. + self._has_bm25_index = False except Exception: try: self.con.rollback() @@ -1056,9 +1059,10 @@ def retrieve_bm25( def _require_bm25_index(self) -> None: if not self._has_bm25_index: raise RuntimeError( - "DuckDBStore retrieval requires a BM25 index. " + "DuckDBStore retrieval requires a current BM25 index. " 'Call `store.build_index("bm25")` or `store.build_index()` ' - "before calling `retrieve_bm25()` or `retrieve()`." + "after inserting or updating documents and before calling " + "`retrieve_bm25()` or `retrieve()`." ) def build_index( diff --git a/tests/test_store.py b/tests/test_store.py index 46aa67d..934f7b3 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -607,6 +607,23 @@ def test_retrieve_bm25_requires_build_index(self, store_with_docs): with pytest.raises(RuntimeError, match="build_index"): store_with_docs.retrieve_bm25("document", top_k=3) + def test_retrieve_bm25_requires_rebuild_after_upsert(self, store): + doc1 = MarkdownDocument(origin="bm25-doc-1", content="alpha beta") + doc1 = doc1.to_chunked( + [_get_markdown_chunk(doc1, start=0, end=len(doc1.content))] + ) + store.upsert(doc1) + store.build_index("bm25") + + doc2 = MarkdownDocument(origin="bm25-doc-2", content="gamma delta") + doc2 = doc2.to_chunked( + [_get_markdown_chunk(doc2, start=0, end=len(doc2.content))] + ) + store.upsert(doc2) + + with pytest.raises(RuntimeError, match="build_index"): + store.retrieve_bm25("gamma", top_k=3) + def test_retrieve_bm25_returns_document_slice_for_non_zero_start(self, store): # Guard against 0-based/1-based off-by-one slicing errors for non-zero starts. doc = MarkdownDocument(origin="bm25-text-source", content="alphabetagamma") From 91821d5d3788f042842cfa2fabdca6500b1d7b2f Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Fri, 24 Apr 2026 07:44:06 -0400 Subject: [PATCH 5/6] Track BM25 freshness for DuckDB stores --- src/raghilda/_duckdb_store.py | 104 ++++++++++----- tests/test_store.py | 238 +++++++++++++++++++++++++++++++--- 2 files changed, 295 insertions(+), 47 deletions(-) diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index 250ee10..46d9ce6 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -57,7 +57,6 @@ "context", } - @dataclass(repr=False) class DuckDBMarkdownChunk(MarkdownChunk): """MarkdownChunk with DuckDB-specific fields for database storage""" @@ -329,7 +328,8 @@ def create( name VARCHAR, title VARCHAR, embed_config VARCHAR, - attributes_schema_json VARCHAR + attributes_schema_json VARCHAR, + bm25_index_is_current BOOLEAN DEFAULT FALSE ); CREATE OR REPLACE TABLE documents ( @@ -370,14 +370,16 @@ def create( name, title, embed_config, - attributes_schema_json - ) VALUES (?, ?, ?, ?) + attributes_schema_json, + bm25_index_is_current + ) VALUES (?, ?, ?, ?, ?) """, [ name, title, embed_config_json, attributes_schema_json, + False, ], ) @@ -403,8 +405,11 @@ def __init__( attributes_schema=self.metadata.attributes_schema, require_embedding=self.metadata.embed is not None, ) - self._db_lock = threading.Lock() - self._has_bm25_index = _has_bm25_index(self.con) + self._db_lock = threading.RLock() + # Best-effort BM25 state for this handle. We intentionally avoid a + # metadata read on every retrieval; multiple live stores per DB file + # are unsupported and are not kept in sync. + self._has_bm25_index = _read_bm25_index_state(self.con) def upsert( self, @@ -512,6 +517,7 @@ def upsert( else: _duckdb_append(self.con, "documents", [doc_row]) _duckdb_append(self.con, "embeddings", chunk_rows) + _set_bm25_index_state(self.con, False) self.con.commit() # DuckDB FTS materializes BM25 state in side tables and does not # refresh it after writes, while HNSW indexes are maintained. @@ -996,12 +1002,12 @@ def retrieve_bm25( sql = f""" WITH ranked AS ( SELECT - e.chunk_id, + e.chunk_id, doc.origin AS origin, - e.start_index, - e.end_index, + e.start_index, + e.end_index, e.char_count, - e.context, + e.context, {attribute_select} doc.text[e.start_index + 1:e.end_index] AS text, 'bm25' AS metric_name, @@ -1058,9 +1064,12 @@ def retrieve_bm25( def _require_bm25_index(self) -> None: if not self._has_bm25_index: + rebuild_hint = 'Call `store.build_index("bm25")`' + if self.metadata.embed is not None: + rebuild_hint += " or `store.build_index()`" raise RuntimeError( "DuckDBStore retrieval requires a current BM25 index. " - 'Call `store.build_index("bm25")` or `store.build_index()` ' + f"{rebuild_hint} " "after inserting or updating documents and before calling " "`retrieve_bm25()` or `retrieve()`." ) @@ -1086,26 +1095,28 @@ def build_index( else: index_types = [_coerce_index_type(item) for item in type] - if IndexType.BM25 in index_types: - self.con.execute("INSTALL FTS; LOAD FTS;") - try: - self.con.begin() - self._create_fts_index() - self.con.commit() - self._has_bm25_index = True - except Exception as e: - self.con.rollback() - raise e + with self._db_lock: + if IndexType.BM25 in index_types: + self.con.execute("INSTALL FTS; LOAD FTS;") + try: + self.con.begin() + self._create_fts_index() + _set_bm25_index_state(self.con, True) + self.con.commit() + self._has_bm25_index = True + except Exception as e: + self.con.rollback() + raise e - if IndexType.HNSW in index_types: - self.con.execute("INSTALL vss; LOAD vss;") - try: - self.con.begin() - self._create_hnsw_index() - self.con.commit() - except Exception as e: - self.con.rollback() - raise e + if IndexType.HNSW in index_types: + self.con.execute("INSTALL vss; LOAD vss;") + try: + self.con.begin() + self._create_hnsw_index() + self.con.commit() + except Exception as e: + self.con.rollback() + raise e def _create_fts_index(self): self.con.execute( @@ -1225,7 +1236,30 @@ def _load_extensions_for_existing_indexes(con: duckdb.DuckDBPyConnection) -> Non con.execute("INSTALL vss; LOAD vss;") -def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool: +def _ensure_bm25_index_state_column( + con: duckdb.DuckDBPyConnection, +) -> None: + if "bm25_index_is_current" in _table_columns(con, table="metadata"): + return + con.execute( + "ALTER TABLE metadata ADD COLUMN bm25_index_is_current BOOLEAN DEFAULT FALSE" + ) + + +def _read_bm25_index_state(con: duckdb.DuckDBPyConnection) -> bool: + if "bm25_index_is_current" not in _table_columns(con, table="metadata"): + # NOTE: legacy stores predate explicit BM25 freshness tracking. + # For backward compatibility we keep trusting any existing FTS index + # until this release writes the new metadata field. TODO: switch the + # missing-column case to conservative "stale until rebuilt" behavior + # in a future breaking release. + return _has_legacy_bm25_index(con) + row = con.execute("SELECT bm25_index_is_current FROM metadata").fetchone() + assert row is not None + return bool(row[0]) + + +def _has_legacy_bm25_index(con: duckdb.DuckDBPyConnection) -> bool: row = con.execute( """ SELECT EXISTS ( @@ -1240,6 +1274,14 @@ def _has_bm25_index(con: duckdb.DuckDBPyConnection) -> bool: return bool(row[0]) +def _set_bm25_index_state(con: duckdb.DuckDBPyConnection, is_current: bool) -> None: + _ensure_bm25_index_state_column(con) + con.execute( + "UPDATE metadata SET bm25_index_is_current = ?", + [is_current], + ) + + def _validate_required_schema( con: duckdb.DuckDBPyConnection, *, diff --git a/tests/test_store.py b/tests/test_store.py index 934f7b3..d197628 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -4,6 +4,7 @@ import subprocess import sys import textwrap +import threading from pathlib import Path from types import SimpleNamespace from typing import Annotated, Any, cast @@ -607,6 +608,12 @@ def test_retrieve_bm25_requires_build_index(self, store_with_docs): with pytest.raises(RuntimeError, match="build_index"): store_with_docs.retrieve_bm25("document", top_k=3) + def test_retrieve_bm25_requires_build_index_without_embeddings(self, store): + with pytest.raises(RuntimeError, match='store.build_index\\("bm25"\\)') as exc: + store.retrieve_bm25("document", top_k=3) + + assert "store.build_index()" not in str(exc.value) + def test_retrieve_bm25_requires_rebuild_after_upsert(self, store): doc1 = MarkdownDocument(origin="bm25-doc-1", content="alpha beta") doc1 = doc1.to_chunked( @@ -1298,7 +1305,7 @@ def test_insert_result_document_preserves_defaulted_attributes(self): "priority": 0, } - def test_insert_snapshot_reads_are_serialized_under_db_lock(self, monkeypatch): + def test_upsert_replaces_document_and_preserves_attribute_defaults(self): store = DuckDBStore.create( location=":memory:", embed=None, @@ -1306,18 +1313,6 @@ def test_insert_snapshot_reads_are_serialized_under_db_lock(self, monkeypatch): attributes={"tenant": str, "priority": (int, 0)}, ) - observed_lock_states: list[bool] = [] - original_snapshot = store._load_document_snapshot - - def wrapped_snapshot(*, origin: str, text: str): - observed_lock_states.append(store._db_lock.locked()) - return original_snapshot( - origin=origin, - text=text, - ) - - monkeypatch.setattr(store, "_load_document_snapshot", wrapped_snapshot) - first = MarkdownDocument( origin="lock-snapshot-test", content="alpha", @@ -1350,10 +1345,11 @@ def wrapped_snapshot(*, origin: str, text: str): ) ] ) - store.upsert(second, skip_if_unchanged=False) + replaced = store.upsert(second, skip_if_unchanged=False) - assert observed_lock_states - assert all(observed_lock_states) + assert replaced.action == "replaced" + assert replaced.document.content == "alpha beta" + assert replaced.document.attributes == {"tenant": "docs", "priority": 0} def test_insert_snapshot_preserves_nullable_none_attributes(self): store = DuckDBStore.create( @@ -3016,6 +3012,216 @@ def test_connect_restores_bm25_index_state(tmp_path): assert results[0].text == "hello" +def test_connect_requires_bm25_rebuild_after_upsert(tmp_path): + db_path = tmp_path / "test_bm25_reconnect_stale.db" + + store = DuckDBStore.create( + location=str(db_path), + embed=None, + name="connect_bm25_stale_test", + title="Connect BM25 Stale Test Store", + ) + doc1 = MarkdownDocument(origin="doc-1", content="alpha beta") + doc1 = doc1.to_chunked([_get_markdown_chunk(doc1, start=0, end=len(doc1.content))]) + store.upsert(doc1) + store.build_index("bm25") + + doc2 = MarkdownDocument(origin="doc-2", content="gamma delta") + doc2 = doc2.to_chunked([_get_markdown_chunk(doc2, start=0, end=len(doc2.content))]) + store.upsert(doc2) + store.con.close() + + reconnected = DuckDBStore.connect(str(db_path)) + + with pytest.raises(RuntimeError, match="build_index"): + reconnected.retrieve_bm25("gamma", top_k=1) + + +def test_connect_migrates_legacy_bm25_index_as_current(tmp_path): + db_path = tmp_path / "test_bm25_legacy_migrate.db" + + store = DuckDBStore.create( + location=str(db_path), + embed=None, + name="connect_bm25_legacy_test", + title="Connect BM25 Legacy Test Store", + ) + doc = MarkdownDocument(origin="test", content="hello world") + doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)]) + store.upsert(doc) + store.build_index("bm25") + store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current") + store.con.close() + + reconnected = DuckDBStore.connect(str(db_path)) + + results = reconnected.retrieve_bm25("hello", top_k=1) + assert len(results) == 1 + assert results[0].text == "hello" + + combined = reconnected.retrieve("hello", top_k=1) + assert len(combined) == 1 + assert combined[0].text == "hello" + + +def test_connect_read_only_detects_legacy_bm25_index(tmp_path): + db_path = tmp_path / "test_bm25_legacy_read_only.db" + + store = DuckDBStore.create( + location=str(db_path), + embed=None, + name="connect_bm25_legacy_read_only_test", + title="Connect BM25 Legacy Read Only Test Store", + ) + doc = MarkdownDocument(origin="test", content="hello world") + doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)]) + store.upsert(doc) + store.build_index("bm25") + store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current") + store.con.close() + + reconnected = DuckDBStore.connect(str(db_path), read_only=True) + + results = reconnected.retrieve_bm25("hello", top_k=1) + assert len(results) == 1 + assert results[0].text == "hello" + + combined = reconnected.retrieve("hello", top_k=1) + assert len(combined) == 1 + assert combined[0].text == "hello" + + +def test_upsert_on_legacy_store_starts_tracking_bm25_freshness(tmp_path): + db_path = tmp_path / "test_bm25_legacy_upgrade_write.db" + + store = DuckDBStore.create( + location=str(db_path), + embed=None, + name="connect_bm25_legacy_upgrade_write_test", + title="Connect BM25 Legacy Upgrade Write Test Store", + ) + doc1 = MarkdownDocument(origin="doc-1", content="alpha beta") + doc1 = doc1.to_chunked([_get_markdown_chunk(doc1, start=0, end=len(doc1.content))]) + store.upsert(doc1) + store.build_index("bm25") + store.con.execute("ALTER TABLE metadata DROP COLUMN bm25_index_is_current") + store.con.close() + + upgraded = DuckDBStore.connect(str(db_path)) + doc2 = MarkdownDocument(origin="doc-2", content="gamma delta") + doc2 = doc2.to_chunked([_get_markdown_chunk(doc2, start=0, end=len(doc2.content))]) + upgraded.upsert(doc2) + upgraded.con.close() + + reconnected = DuckDBStore.connect(str(db_path)) + + with pytest.raises(RuntimeError, match="build_index"): + reconnected.retrieve_bm25("gamma", top_k=1) + + +def test_build_index_waits_for_db_lock(): + store = DuckDBStore.create( + location=":memory:", + embed=None, + overwrite=True, + name="build_index_lock_test", + ) + doc = MarkdownDocument(origin="test", content="hello world") + doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)]) + store.upsert(doc) + + attempted = threading.Event() + thread_error: list[BaseException] = [] + + def build_index(): + attempted.set() + try: + store.build_index("bm25") + except BaseException as exc: + thread_error.append(exc) + + store._db_lock.acquire() + thread = threading.Thread(target=build_index) + thread.start() + assert attempted.wait(timeout=1) + + thread.join(timeout=0.1) + assert thread.is_alive() + + store._db_lock.release() + thread.join(timeout=5) + + assert not thread.is_alive() + assert thread_error == [] + + +def test_retrieve_waits_for_in_progress_bm25_build(monkeypatch): + store = DuckDBStore.create( + location=":memory:", + embed=None, + overwrite=True, + name="retrieve_waits_for_bm25_build_test", + ) + doc = MarkdownDocument(origin="test", content="hello world") + doc = doc.to_chunked([_get_markdown_chunk(doc, start=0, end=5)]) + store.upsert(doc) + + build_started = threading.Event() + allow_build_to_finish = threading.Event() + + original_create_fts_index = store._create_fts_index + + def blocking_create_fts_index(): + build_started.set() + assert allow_build_to_finish.wait(timeout=5) + original_create_fts_index() + + monkeypatch.setattr(store, "_create_fts_index", blocking_create_fts_index) + + thread_error: list[BaseException] = [] + + def build_index(): + try: + store.build_index("bm25") + except BaseException as exc: + thread_error.append(exc) + + thread = threading.Thread(target=build_index) + thread.start() + assert build_started.wait(timeout=1) + + retrieve_started = threading.Event() + retrieve_finished = threading.Event() + retrieved_results: list[RetrievedDuckDBMarkdownChunk] = [] + retrieve_error: list[BaseException] = [] + + def retrieve(): + retrieve_started.set() + try: + results = store.retrieve("hello", top_k=1) + retrieved_results.extend(results) + except BaseException as exc: + retrieve_error.append(exc) + finally: + retrieve_finished.set() + + retrieve_thread = threading.Thread(target=retrieve) + retrieve_thread.start() + assert retrieve_started.wait(timeout=1) + retrieve_thread.join(timeout=0.1) + assert retrieve_thread.is_alive() + + allow_build_to_finish.set() + thread.join(timeout=5) + retrieve_thread.join(timeout=5) + + assert thread_error == [] + assert retrieve_error == [] + assert retrieve_finished.is_set() + assert len(retrieved_results) == 1 + assert retrieved_results[0].text == "hello" + + def test_upsert_after_hnsw_index_on_reconnect(tmp_path): """Upserting after reconnecting to a DB with HNSW indexes must work. From 15220a2e0271266d6ec5563e3e3b1a1204e5f937 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Fri, 24 Apr 2026 08:03:47 -0400 Subject: [PATCH 6/6] Format DuckDB store and document formatter step --- AGENTS.md | 1 + src/raghilda/_duckdb_store.py | 1 + 2 files changed, 2 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index eaaa211..775006f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -15,6 +15,7 @@ - Docs are built with Quarto, so use it's syntax in docstrings. ## Must-Run Commands Before Hand-off +- Before committing or pushing Python changes, run `./.venv/bin/task format`; CI runs `format_check` and will fail on unformatted files. - `./.venv/bin/task check` - `./.venv/bin/task tests` - For docs changes, also `./.venv/bin/task docs_build` diff --git a/src/raghilda/_duckdb_store.py b/src/raghilda/_duckdb_store.py index 46d9ce6..dc4bce5 100644 --- a/src/raghilda/_duckdb_store.py +++ b/src/raghilda/_duckdb_store.py @@ -57,6 +57,7 @@ "context", } + @dataclass(repr=False) class DuckDBMarkdownChunk(MarkdownChunk): """MarkdownChunk with DuckDB-specific fields for database storage"""