From f081f21cc99c3fc11d998d62d4d52cae9952df34 Mon Sep 17 00:00:00 2001 From: Antawari Date: Mon, 29 Jun 2026 12:31:01 -0600 Subject: [PATCH 1/2] Add a persistent sqlite knowledge-vault backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A third VaultBackend implementation backed by the standard-library sqlite3 module — no extra dependencies, so it runs in CI (unlike the embedded-vector backend, whose optional deps are absent there). It conforms to the existing VaultBackend protocol and the frozen vault entry shape, and mirrors the in-memory backend's keyword retrieval byte-for-byte (LIKE prefilter, ranking done in Python) so results are deterministic and identical across environments. Storage is a single table with a forward-only versioned schema recorded in a small meta table; all SQL is parameterized. Keyword retrieval only, no embeddings. Wired into the backend factory behind a new "sqlite" option; the existing options are untouched. Tests mirror the in-memory contract plus persistence across reopening the same file. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/bonfire/knowledge/__init__.py | 7 + src/bonfire/knowledge/sqlite_backend.py | 233 ++++++++++++++++++++++ tests/unit/test_sqlite_vault.py | 247 ++++++++++++++++++++++++ 3 files changed, 487 insertions(+) create mode 100644 src/bonfire/knowledge/sqlite_backend.py create mode 100644 tests/unit/test_sqlite_vault.py diff --git a/src/bonfire/knowledge/__init__.py b/src/bonfire/knowledge/__init__.py index 8ab4799e..125c6a21 100644 --- a/src/bonfire/knowledge/__init__.py +++ b/src/bonfire/knowledge/__init__.py @@ -31,6 +31,8 @@ def get_vault_backend( - ``enabled=False`` → :class:`InMemoryVaultBackend` - ``backend="memory"`` → :class:`InMemoryVaultBackend` + - ``backend="sqlite"`` → :class:`SqliteVaultBackend` (persistent, stdlib + only; ``vault_path`` is the database file, ``":memory:"`` for ephemeral) - ``backend="lancedb"`` → :class:`LanceDBBackend` - anything else → :class:`InMemoryVaultBackend` (safe fallback) """ @@ -39,6 +41,11 @@ def get_vault_backend( return InMemoryVaultBackend() + if backend == "sqlite": + from bonfire.knowledge.sqlite_backend import SqliteVaultBackend + + return SqliteVaultBackend(db_path=vault_path) + if backend == "lancedb": from bonfire.knowledge.backend import LanceDBBackend from bonfire.knowledge.embeddings import get_embedder diff --git a/src/bonfire/knowledge/sqlite_backend.py b/src/bonfire/knowledge/sqlite_backend.py new file mode 100644 index 00000000..3ddeca64 --- /dev/null +++ b/src/bonfire/knowledge/sqlite_backend.py @@ -0,0 +1,233 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2026 BonfireAI + +"""Persistent vault backend backed by the stdlib ``sqlite3`` module. + +This is the *portable* persistent backend: it needs no third-party +dependencies (unlike the LanceDB backend, whose vector deps are absent in +CI), so it runs everywhere CPython does. A single plain table holds one row +per :class:`~bonfire.protocols.VaultEntry`; list/dict fields are stored as +JSON text. 
+ +Retrieval is **honest keyword search**, not semantic search. ``query`` does +exactly what :class:`~bonfire.knowledge.memory.InMemoryVaultBackend` does: it +splits the query into words and scores each entry by how many of those words +appear as a case-insensitive substring of the entry's content -- no +embeddings, no vectors. SQLite ``LIKE`` is used only as a parameterized +prefilter to avoid scanning unmatched rows; the final scoring and ranking +mirror the in-memory backend byte-for-byte. + +The async methods wrap synchronous ``sqlite3`` calls (the same pattern the +in-memory backend uses) -- no ``aiosqlite`` or other added dependency. + +Schema is versioned (BubbleGum): ``_SCHEMA_VERSION`` plus an idempotent, +forward-only ``_ensure_schema``. A ``vault_meta`` row records the version so +a future migration can detect and upgrade an older file. +""" + +from __future__ import annotations + +import json +import sqlite3 +from typing import TYPE_CHECKING + +from bonfire.knowledge.hasher import content_hash as compute_hash +from bonfire.protocols import VaultEntry + +if TYPE_CHECKING: + from collections.abc import Iterable + +# Forward-only schema version. Bump only alongside a migration step in +# ``_ensure_schema``; never rewrite history. +_SCHEMA_VERSION = 1 + +# Ordered VaultEntry fields stored as their own columns. The two structured +# fields (``tags`` -> JSON array, ``metadata`` -> JSON object) are handled +# separately when (de)serializing; everything else round-trips as TEXT. +_TEXT_FIELDS = ( + "entry_id", + "content", + "entry_type", + "source_path", + "project_name", + "scanned_at", + "git_hash", + "content_hash", +) + + +class SqliteVaultBackend: + """Persistent vault over a single ``sqlite3`` connection. + + Pass a filesystem ``db_path`` to persist across process restarts, or + ``":memory:"`` (the default) for an ephemeral in-process database used by + tests. Keyword retrieval only -- no embeddings. 
+ """ + + def __init__(self, db_path: str = ":memory:") -> None: + self._db_path = db_path + # check_same_thread=False keeps the connection usable from the asyncio + # event loop's worker context; access here is serialized by the single + # event loop so no cross-thread races occur. + self._conn = sqlite3.connect(db_path, check_same_thread=False) + self._conn.row_factory = sqlite3.Row + self._ensure_schema() + + # -- schema ---------------------------------------------------------- + + def _ensure_schema(self) -> None: + """Create the table and record the schema version (idempotent). + + Forward-only: safe to call on every open. Creating the objects + ``IF NOT EXISTS`` means an existing file is left intact; the version + row is inserted only when absent. + """ + self._conn.execute( + """ + CREATE TABLE IF NOT EXISTS vault_entries ( + entry_id TEXT PRIMARY KEY, + content TEXT NOT NULL, + entry_type TEXT NOT NULL, + source_path TEXT NOT NULL DEFAULT '', + project_name TEXT NOT NULL DEFAULT '', + scanned_at TEXT NOT NULL DEFAULT '', + git_hash TEXT NOT NULL DEFAULT '', + content_hash TEXT NOT NULL DEFAULT '', + tags TEXT NOT NULL DEFAULT '[]', + metadata TEXT NOT NULL DEFAULT '{}' + ) + """ + ) + self._conn.execute( + "CREATE INDEX IF NOT EXISTS idx_vault_entries_content_hash " + "ON vault_entries (content_hash)" + ) + self._conn.execute( + "CREATE INDEX IF NOT EXISTS idx_vault_entries_source_path " + "ON vault_entries (source_path)" + ) + self._conn.execute( + "CREATE TABLE IF NOT EXISTS vault_meta (key TEXT PRIMARY KEY, value TEXT NOT NULL)" + ) + self._conn.execute( + "INSERT OR IGNORE INTO vault_meta (key, value) VALUES ('schema_version', ?)", + (str(_SCHEMA_VERSION),), + ) + self._conn.commit() + + # -- (de)serialization ---------------------------------------------- + + @staticmethod + def _to_row(entry: VaultEntry) -> tuple[object, ...]: + """Flatten a VaultEntry into the column tuple (JSON for tags/metadata).""" + values: list[object] = [getattr(entry, field) 
for field in _TEXT_FIELDS] + values.append(json.dumps(entry.tags)) + values.append(json.dumps(entry.metadata)) + return tuple(values) + + @staticmethod + def _from_row(row: sqlite3.Row) -> VaultEntry: + """Rebuild a VaultEntry from a stored row (JSON-decode tags/metadata).""" + data = {field: row[field] for field in _TEXT_FIELDS} + data["tags"] = json.loads(row["tags"]) + data["metadata"] = json.loads(row["metadata"]) + return VaultEntry(**data) + + # -- protocol methods ----------------------------------------------- + + async def store(self, entry: VaultEntry) -> str: + """Persist *entry* (upsert by ``entry_id``) and return its ``entry_id``. + + Computes ``content_hash`` from the content when the caller left it + blank, mirroring the in-memory backend. + """ + if not entry.content_hash: + entry = entry.model_copy(update={"content_hash": compute_hash(entry.content)}) + columns = (*_TEXT_FIELDS, "tags", "metadata") + placeholders = ", ".join("?" for _ in columns) + column_list = ", ".join(columns) + # Upsert by primary key so re-storing the same entry_id replaces the + # row rather than failing on the PK constraint. + updates = ", ".join(f"{col}=excluded.{col}" for col in columns if col != "entry_id") + self._conn.execute( + f"INSERT INTO vault_entries ({column_list}) VALUES ({placeholders}) " + f"ON CONFLICT(entry_id) DO UPDATE SET {updates}", + self._to_row(entry), + ) + self._conn.commit() + return entry.entry_id + + async def query( + self, + query: str, + *, + limit: int = 5, + entry_type: str | None = None, + ) -> list[VaultEntry]: + """Keyword retrieval: score by per-word substring hits, top *limit*. + + Mirrors :class:`InMemoryVaultBackend.query` exactly -- the query is + lowercased and split into words; each candidate entry scores one point + per distinct query word found as a substring of its (lowercased) + content; only positive-scoring entries are returned, highest score + first, capped at *limit*. 
``LIKE`` is used purely as a parameterized + prefilter; no semantic/vector matching is involved. + """ + query_words = query.lower().split() + if not query_words: + return [] + + rows = self._candidate_rows(query_words, entry_type) + scored: list[tuple[VaultEntry, int]] = [] + for row in rows: + lowered = row["content"].lower() + score = sum(1 for w in query_words if w in lowered) + if score > 0: + scored.append((self._from_row(row), score)) + scored.sort(key=lambda pair: pair[1], reverse=True) + return [entry for entry, _ in scored[:limit]] + + def _candidate_rows( + self, + query_words: Iterable[str], + entry_type: str | None, + ) -> list[sqlite3.Row]: + """Fetch rows where content matches ANY query word (parameterized). + + A row scores > 0 in :meth:`query` only if at least one (already + lowercased) query word is a substring of the entry's lowercased + content, so an OR of ``LIKE`` clauses against ``lower(content)`` is a + sound, loss-free prefilter -- it can only over-include. The + authoritative scoring in :meth:`query` re-checks every word in Python, + so the returned set and ranking match the in-memory backend exactly. + """ + params: list[object] = [] + like_clauses: list[str] = [] + for word in query_words: + like_clauses.append("lower(content) LIKE '%' || ? || '%'") + params.append(word) + where = f"({' OR '.join(like_clauses)})" + if entry_type is not None: + where += " AND entry_type = ?" + params.append(entry_type) + cursor = self._conn.execute( + f"SELECT * FROM vault_entries WHERE {where}", + tuple(params), + ) + return cursor.fetchall() + + async def exists(self, content_hash: str) -> bool: + """Return ``True`` if a stored entry has this ``content_hash``.""" + cursor = self._conn.execute( + "SELECT 1 FROM vault_entries WHERE content_hash = ? 
LIMIT 1", + (content_hash,), + ) + return cursor.fetchone() is not None + + async def get_by_source(self, source_path: str) -> list[VaultEntry]: + """Return all entries whose ``source_path`` equals *source_path*.""" + cursor = self._conn.execute( + "SELECT * FROM vault_entries WHERE source_path = ?", + (source_path,), + ) + return [self._from_row(row) for row in cursor.fetchall()] diff --git a/tests/unit/test_sqlite_vault.py b/tests/unit/test_sqlite_vault.py new file mode 100644 index 00000000..87f1bd07 --- /dev/null +++ b/tests/unit/test_sqlite_vault.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2026 BonfireAI + +"""Contract tests for the stdlib-``sqlite3`` persistent vault backend. + +Mirrors the in-memory backend's contract (store -> exists -> query -> +get_by_source, content_hash dedup, entry_id round-trip) and adds the property +the in-memory backend cannot have: **persistence across reopening the same +file path**. No third-party dependencies -- this backend is the one that runs +in CI where the LanceDB vector deps are absent. + +Async tests auto-discover (``asyncio_mode = "auto"``); no marker needed. +Implementation NEVER edits this file. 
+""" + +from __future__ import annotations + +from bonfire.knowledge import get_vault_backend +from bonfire.knowledge.hasher import content_hash as compute_hash +from bonfire.knowledge.sqlite_backend import SqliteVaultBackend +from bonfire.protocols import VaultBackend, VaultEntry + + +def _entry(content: str, **overrides: object) -> VaultEntry: + """Build a VaultEntry with a default entry_type, overridable per call.""" + data: dict[str, object] = {"content": content, "entry_type": "code_chunk"} + data.update(overrides) + return VaultEntry(**data) + + +class TestProtocolConformance: + def test_satisfies_vault_backend_protocol(self) -> None: + backend = SqliteVaultBackend() + assert isinstance(backend, VaultBackend) + + +class TestStoreAndExists: + async def test_store_returns_entry_id(self) -> None: + backend = SqliteVaultBackend() + entry = _entry("hello world") + returned = await backend.store(entry) + assert returned == entry.entry_id + + async def test_store_computes_content_hash_when_absent(self) -> None: + backend = SqliteVaultBackend() + entry = _entry("compute my hash") + assert entry.content_hash == "" + await backend.store(entry) + assert await backend.exists(compute_hash("compute my hash")) is True + + async def test_store_preserves_supplied_content_hash(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("payload", content_hash="explicit-hash")) + assert await backend.exists("explicit-hash") is True + + async def test_exists_false_for_unknown_hash(self) -> None: + backend = SqliteVaultBackend() + assert await backend.exists("never-stored") is False + + +class TestRoundTrip: + async def test_query_returns_full_entry(self) -> None: + backend = SqliteVaultBackend() + original = _entry( + "alpha beta gamma", + source_path="src/foo.py", + project_name="proj", + scanned_at="2026-06-29", + git_hash="deadbeef", + tags=["a", "b"], + metadata={"k": "v", "n": 1}, + ) + await backend.store(original) + results = await backend.query("alpha") 
+ assert len(results) == 1 + got = results[0] + assert got.entry_id == original.entry_id + assert got.content == "alpha beta gamma" + assert got.entry_type == "code_chunk" + assert got.source_path == "src/foo.py" + assert got.project_name == "proj" + assert got.scanned_at == "2026-06-29" + assert got.git_hash == "deadbeef" + assert got.tags == ["a", "b"] + assert got.metadata == {"k": "v", "n": 1} + + +class TestQuery: + async def test_query_substring_match(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("the quick brown fox")) + await backend.store(_entry("a lazy dog sleeps")) + results = await backend.query("quick") + assert len(results) == 1 + assert results[0].content == "the quick brown fox" + + async def test_query_is_case_insensitive(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("UPPER CASE CONTENT")) + results = await backend.query("upper") + assert len(results) == 1 + + async def test_query_no_match_returns_empty(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("hello world")) + assert await backend.query("absent") == [] + + async def test_query_empty_string_returns_empty(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("hello world")) + assert await backend.query(" ") == [] + + async def test_query_respects_limit(self) -> None: + backend = SqliteVaultBackend() + for i in range(10): + await backend.store(_entry(f"shared token entry {i}")) + results = await backend.query("shared", limit=3) + assert len(results) == 3 + + async def test_query_filters_by_entry_type(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("token here", entry_type="code_chunk")) + await backend.store(_entry("token there", entry_type="scout_report")) + results = await backend.query("token", entry_type="scout_report") + assert len(results) == 1 + assert results[0].entry_type == "scout_report" + + async def 
test_query_ranks_more_word_hits_first(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("alpha only here", content_hash="one")) + await backend.store(_entry("alpha and beta both", content_hash="two")) + results = await backend.query("alpha beta") + assert results[0].content == "alpha and beta both" + + +class TestGetBySource: + async def test_get_by_source_returns_matching(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("a", source_path="src/x.py", content_hash="ha")) + await backend.store(_entry("b", source_path="src/x.py", content_hash="hb")) + await backend.store(_entry("c", source_path="src/y.py", content_hash="hc")) + results = await backend.get_by_source("src/x.py") + assert len(results) == 2 + assert {r.content for r in results} == {"a", "b"} + + async def test_get_by_source_empty_when_none(self) -> None: + backend = SqliteVaultBackend() + assert await backend.get_by_source("src/missing.py") == [] + + +class TestDedupByContentHash: + async def test_distinct_hashes_both_exist(self) -> None: + backend = SqliteVaultBackend() + await backend.store(_entry("first", content_hash="h1")) + await backend.store(_entry("second", content_hash="h2")) + assert await backend.exists("h1") is True + assert await backend.exists("h2") is True + + async def test_exists_drives_ingest_dedup(self) -> None: + """The ingest pattern: skip store when exists() reports the hash.""" + backend = SqliteVaultBackend() + c_hash = compute_hash("dedup me") + entry = _entry("dedup me") + if not await backend.exists(c_hash): + await backend.store(entry) + # Second pass: hash now present, so ingest skips the store. + would_store_again = not await backend.exists(c_hash) + assert would_store_again is False + # Exactly one row landed despite two ingest passes. 
+ assert len(await backend.query("dedup")) == 1 + + +class TestPersistenceAcrossReopen: + async def test_data_survives_reopening_same_file(self, tmp_path) -> None: + """Write through one connection, reopen the SAME path, read it back. + + This is the property the in-memory backend cannot provide and the + reason this backend exists: durable storage on disk. + """ + db_file = str(tmp_path / "vault.db") + + writer = SqliteVaultBackend(db_path=db_file) + entry = _entry( + "persistent payload token", + source_path="src/persist.py", + content_hash="persist-hash", + tags=["keep"], + metadata={"durable": True}, + ) + await writer.store(entry) + + # A fresh backend over the same file path must see the prior write. + reader = SqliteVaultBackend(db_path=db_file) + assert await reader.exists("persist-hash") is True + by_source = await reader.get_by_source("src/persist.py") + assert len(by_source) == 1 + restored = by_source[0] + assert restored.entry_id == entry.entry_id + assert restored.content == "persistent payload token" + assert restored.tags == ["keep"] + assert restored.metadata == {"durable": True} + + hits = await reader.query("persistent") + assert len(hits) == 1 + assert hits[0].entry_id == entry.entry_id + + async def test_reopen_does_not_duplicate_schema(self, tmp_path) -> None: + """Reopening repeatedly is idempotent; data accumulates correctly.""" + db_file = str(tmp_path / "vault.db") + first = SqliteVaultBackend(db_path=db_file) + await first.store(_entry("one", content_hash="k1")) + second = SqliteVaultBackend(db_path=db_file) + await second.store(_entry("two", content_hash="k2")) + third = SqliteVaultBackend(db_path=db_file) + assert await third.exists("k1") is True + assert await third.exists("k2") is True + + +class TestUpsertByEntryId: + async def test_restoring_same_entry_id_replaces_row(self, tmp_path) -> None: + backend = SqliteVaultBackend() + first = _entry("original", entry_id="fixed-id", content_hash="orig") + await backend.store(first) + second 
= _entry("updated", entry_id="fixed-id", content_hash="upd") + await backend.store(second) + # Same id => single row; latest content wins. + results = await backend.query("updated") + assert len(results) == 1 + assert results[0].entry_id == "fixed-id" + assert await backend.query("original") == [] + + +class TestFactoryWiring: + def test_factory_returns_sqlite_backend(self) -> None: + backend = get_vault_backend(backend="sqlite", vault_path=":memory:") + assert isinstance(backend, SqliteVaultBackend) + + def test_factory_memory_still_default(self) -> None: + backend = get_vault_backend() + assert not isinstance(backend, SqliteVaultBackend) + + async def test_factory_sqlite_persists_to_path(self, tmp_path) -> None: + db_file = str(tmp_path / "factory.db") + writer = get_vault_backend(backend="sqlite", vault_path=db_file) + await writer.store(_entry("via factory", content_hash="fac")) + reader = get_vault_backend(backend="sqlite", vault_path=db_file) + assert await reader.exists("fac") is True From 805d40792077f6dfd29477ec1d00b438529c516d Mon Sep 17 00:00:00 2001 From: Antawari Date: Mon, 29 Jun 2026 12:43:13 -0600 Subject: [PATCH 2/2] Use static SQL in the sqlite vault and declare its test for the budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the two dynamically-built statements (the upsert and the query prefilter) with static literals: a fixed-column INSERT ... ON CONFLICT, and a SELECT that reads the table (optionally narrowed by entry_type) with the scoring done in Python as before. Every value is still a bound parameter; this clears the shared gate's SQL-construction lint and is simpler — the keyword ranking still mirrors the in-memory backend. Declare the new test file in the file-budget ledger so it does not draw against the frozen tests/unit package total (the established pattern for new test coverage). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- file-budget.json | 3 + src/bonfire/knowledge/sqlite_backend.py | 84 ++++++++++++------------- 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/file-budget.json b/file-budget.json index 81836a07..e6803cc0 100644 --- a/file-budget.json +++ b/file-budget.json @@ -61,6 +61,9 @@ }, "tests/unit/test_session_lifecycle_cli.py": { "purpose": "new e2e coverage for the status/resume/handoff verbs" + }, + "tests/unit/test_sqlite_vault.py": { + "purpose": "contract + persistence coverage for the sqlite vault backend" } }, "packages": { diff --git a/src/bonfire/knowledge/sqlite_backend.py b/src/bonfire/knowledge/sqlite_backend.py index 3ddeca64..ac8f6026 100644 --- a/src/bonfire/knowledge/sqlite_backend.py +++ b/src/bonfire/knowledge/sqlite_backend.py @@ -13,9 +13,10 @@ exactly what :class:`~bonfire.knowledge.memory.InMemoryVaultBackend` does: it splits the query into words and scores each entry by how many of those words appear as a case-insensitive substring of the entry's content -- no -embeddings, no vectors. SQLite ``LIKE`` is used only as a parameterized -prefilter to avoid scanning unmatched rows; the final scoring and ranking -mirror the in-memory backend byte-for-byte. +embeddings, no vectors. It reads the rows with a static ``SELECT`` (optionally +narrowed by ``entry_type``) and does the scoring and ranking in Python, which +mirrors the in-memory backend byte-for-byte. The SQL carries only bound +parameters -- no value is ever formatted into a statement string. The async methods wrap synchronous ``sqlite3`` calls (the same pattern the in-memory backend uses) -- no ``aiosqlite`` or other added dependency. @@ -29,18 +30,32 @@ import json import sqlite3 -from typing import TYPE_CHECKING from bonfire.knowledge.hasher import content_hash as compute_hash from bonfire.protocols import VaultEntry -if TYPE_CHECKING: - from collections.abc import Iterable - # Forward-only schema version. 
Bump only alongside a migration step in # ``_ensure_schema``; never rewrite history. _SCHEMA_VERSION = 1 +# Static statements. Every value is bound (``?``); no identifier or value is +# ever formatted into the SQL string. The INSERT column order matches +# ``_to_row`` (``_TEXT_FIELDS`` then ``tags``, ``metadata``). +_INSERT_SQL = ( + "INSERT INTO vault_entries " + "(entry_id, content, entry_type, source_path, project_name, " + "scanned_at, git_hash, content_hash, tags, metadata) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) " + "ON CONFLICT(entry_id) DO UPDATE SET " + "content=excluded.content, entry_type=excluded.entry_type, " + "source_path=excluded.source_path, project_name=excluded.project_name, " + "scanned_at=excluded.scanned_at, git_hash=excluded.git_hash, " + "content_hash=excluded.content_hash, tags=excluded.tags, " + "metadata=excluded.metadata" +) +_SELECT_ALL = "SELECT * FROM vault_entries" +_SELECT_BY_TYPE = "SELECT * FROM vault_entries WHERE entry_type = ?" + # Ordered VaultEntry fields stored as their own columns. The two structured # fields (``tags`` -> JSON array, ``metadata`` -> JSON object) are handled # separately when (de)serializing; everything else round-trips as TEXT. @@ -143,17 +158,9 @@ async def store(self, entry: VaultEntry) -> str: """ if not entry.content_hash: entry = entry.model_copy(update={"content_hash": compute_hash(entry.content)}) - columns = (*_TEXT_FIELDS, "tags", "metadata") - placeholders = ", ".join("?" for _ in columns) - column_list = ", ".join(columns) # Upsert by primary key so re-storing the same entry_id replaces the # row rather than failing on the PK constraint. 
- updates = ", ".join(f"{col}=excluded.{col}" for col in columns if col != "entry_id") - self._conn.execute( - f"INSERT INTO vault_entries ({column_list}) VALUES ({placeholders}) " - f"ON CONFLICT(entry_id) DO UPDATE SET {updates}", - self._to_row(entry), - ) + self._conn.execute(_INSERT_SQL, self._to_row(entry)) self._conn.commit() return entry.entry_id @@ -170,14 +177,15 @@ async def query( lowercased and split into words; each candidate entry scores one point per distinct query word found as a substring of its (lowercased) content; only positive-scoring entries are returned, highest score - first, capped at *limit*. ``LIKE`` is used purely as a parameterized - prefilter; no semantic/vector matching is involved. + first, capped at *limit*. The rows are read with a static ``SELECT`` + (optionally narrowed by ``entry_type``); no semantic/vector matching is + involved. """ query_words = query.lower().split() if not query_words: return [] - rows = self._candidate_rows(query_words, entry_type) + rows = self._candidate_rows(entry_type) scored: list[tuple[VaultEntry, int]] = [] for row in rows: lowered = row["content"].lower() @@ -187,33 +195,19 @@ async def query( scored.sort(key=lambda pair: pair[1], reverse=True) return [entry for entry, _ in scored[:limit]] - def _candidate_rows( - self, - query_words: Iterable[str], - entry_type: str | None, - ) -> list[sqlite3.Row]: - """Fetch rows where content matches ANY query word (parameterized). - - A row scores > 0 in :meth:`query` only if at least one (already - lowercased) query word is a substring of the entry's lowercased - content, so an OR of ``LIKE`` clauses against ``lower(content)`` is a - sound, loss-free prefilter -- it can only over-include. The - authoritative scoring in :meth:`query` re-checks every word in Python, - so the returned set and ranking match the in-memory backend exactly. 
+ def _candidate_rows(self, entry_type: str | None) -> list[sqlite3.Row]: + """Read the rows to score, optionally narrowed by ``entry_type``. + + The authoritative scoring in :meth:`query` checks every query word + in Python, exactly as the in-memory backend does, so reading the full + table (or the ``entry_type`` slice of it) yields the same result set + and ranking. Both statements are static literals; the only value + involved, ``entry_type``, is passed as a bound parameter. """ - params: list[object] = [] - like_clauses: list[str] = [] - for word in query_words: - like_clauses.append("lower(content) LIKE '%' || ? || '%'") - params.append(word) - where = f"({' OR '.join(like_clauses)})" - if entry_type is not None: - where += " AND entry_type = ?" - params.append(entry_type) - cursor = self._conn.execute( - f"SELECT * FROM vault_entries WHERE {where}", - tuple(params), - ) + if entry_type is None: + cursor = self._conn.execute(_SELECT_ALL) + else: + cursor = self._conn.execute(_SELECT_BY_TYPE, (entry_type,)) return cursor.fetchall() async def exists(self, content_hash: str) -> bool: