From 50f3c0986d833c0dae2f263d88a2004be711d3d8 Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sun, 31 May 2026 12:14:20 +0700 Subject: [PATCH 1/4] =?UTF-8?q?feat(#782):=20sk=20learn=20--dedupe=20?= =?UTF-8?q?=E2=80=94=20FTS5=20pre-write=20similarity=20check=20for=20near-?= =?UTF-8?q?duplicates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _find_similar_entries(): BM25 query against knowledge_fts, threshold -3.0 - --dedupe warn (default): print warning, allow write - --dedupe block: prevent write if near-duplicate found - --dedupe off: skip check (for batch ingest) - --merge ID: UPDATE existing entry instead of INSERT Closes #782 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- learn.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/learn.py b/learn.py index d8829f76..b9ae10a3 100755 --- a/learn.py +++ b/learn.py @@ -1658,6 +1658,41 @@ def add_entry( return entry_id +def _find_similar_entries( + db: sqlite3.Connection, + title: str, + content: str, + category: str, + threshold: float = -3.0, +) -> list[dict]: + """Find near-duplicate knowledge entries using FTS5 BM25 similarity. + + BM25 scores in FTS5 are negative — lower (more negative) = more relevant. + threshold=-3.0 means 'very similar'. + Returns list of {id, title, score} dicts. + """ + query_text = re.sub(r'["\*\(\)]', " ", f"{title} {content[:80]}").strip() + if not query_text: + return [] + fts_query = " ".join(f'"{w}"' for w in query_text.split()[:10] if len(w) > 2) + if not fts_query: + return [] + try: + rows = db.execute( + """SELECT ke.id, ke.title, bm25(knowledge_fts) AS score + FROM knowledge_fts kf + JOIN knowledge_entries ke ON ke.id = kf.rowid + WHERE knowledge_fts MATCH ? + AND ke.category = ?
+ ORDER BY score + LIMIT 3""", + (fts_query, category), + ).fetchall() + return [{"id": r[0], "title": r[1], "score": r[2]} for r in rows if r[2] is not None and r[2] <= threshold] + except Exception: + return [] + + def _update_fts( db: sqlite3.Connection, entry_id: int, @@ -3156,6 +3191,33 @@ def main(): idx = args.index("--caveats") caveats = args[idx + 1] if idx + 1 < len(args) else "" + # --dedupe: warn (default), block, or off + dedupe = "warn" + if "--dedupe" in args: + idx = args.index("--dedupe") + raw_dedupe = args[idx + 1] if idx + 1 < len(args) else "warn" + if raw_dedupe not in ("warn", "block", "off"): + print( + f"Error: --dedupe must be one of: warn, block, off (got {raw_dedupe!r})", + file=sys.stderr, + ) + sys.exit(1) + dedupe = raw_dedupe + + # --merge : UPDATE existing entry instead of INSERT + merge_id: int | None = None + if "--merge" in args: + idx = args.index("--merge") + raw_merge = args[idx + 1] if idx + 1 < len(args) else "" + if not raw_merge or raw_merge.startswith("--"): + print("Error: --merge requires an integer entry ID", file=sys.stderr) + sys.exit(1) + try: + merge_id = int(raw_merge) + except ValueError: + print(f"Error: --merge value must be an integer ID (got {raw_merge!r})", file=sys.stderr) + sys.exit(1) + supersedes_id = None if "--supersedes" in args: idx = args.index("--supersedes") @@ -3219,6 +3281,8 @@ def main(): "--cerebrum-output", "--cerebrum-sections", "--confidence-threshold", + "--dedupe", + "--merge", ): _next = args[i + 1] if i + 1 < len(args) else None skip_next = bool(_next and not _next.startswith("--")) @@ -3330,6 +3394,44 @@ def main(): "caveats": caveats, } + # --merge: UPDATE existing entry instead of INSERT + if merge_id is not None: + _db = get_db() + row = _db.execute("SELECT id FROM knowledge_entries WHERE id = ?", (merge_id,)).fetchone() + if not row: + print(f"Error: --merge entry #{merge_id} not found", file=sys.stderr) + _db.close() + sys.exit(1) + now = 
__import__("datetime").datetime.now().isoformat() + _db.execute( + """UPDATE knowledge_entries + SET title = ?, content = ?, category = ?, tags = ?, + last_seen = ?, occurrence_count = occurrence_count + 1 + WHERE id = ?""", + (title, content, category, tags, now, merge_id), + ) + _update_fts(_db, merge_id, title, content, tags, category, wing, room) + _db.commit() + _db.close() + print(f" Merged into existing entry #{merge_id} [{category}]") + print("Done.") + return + + # --dedupe: FTS5 BM25 pre-write similarity check + if dedupe != "off": + _db = get_db() + similar = _find_similar_entries(_db, title, content, category) + _db.close() + if similar: + names = "; ".join(f"#{s['id']} '{s['title'][:40]}' (score {s['score']:.1f})" for s in similar) + print(f"⚠️ Similar entries found: {names}", file=sys.stderr) + if dedupe == "block": + print( + "Blocked — use --dedupe=off to force insert or --merge to replace", + file=sys.stderr, + ) + sys.exit(1) + try: queue_on_lock = _learn_queue_on_lock_enabled() entry_id = _write_learn_entry( From fff586735d518d2f13f7c50f192b94065172f91f Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sun, 31 May 2026 13:19:18 +0700 Subject: [PATCH 2/4] fix(#782): avoid closing shared DB connection in dedupe check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dedupe check opened a get_db() connection and explicitly closed it before _write_learn_entry. Tests patch get_db() to return a single shared connection, so closing it made the later json_mode db.execute() fail with 'Cannot operate on a closed database'. Use del to drop the local reference instead; in production CPython the connection is released immediately (refcount → 0), while in tests the shared connection stays alive because the test closure still holds it. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- learn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/learn.py b/learn.py index b9ae10a3..43ce29d7 100755 --- a/learn.py +++ b/learn.py @@ -3419,9 +3419,12 @@ def main(): # --dedupe: FTS5 BM25 pre-write similarity check if dedupe != "off": - _db = get_db() - similar = _find_similar_entries(_db, title, content, category) - _db.close() + _dedup_db = get_db() + similar = _find_similar_entries(_dedup_db, title, content, category) + # Do not close _dedup_db explicitly — closing the connection here would + # invalidate a shared/mocked connection in tests. It will be released + # when the local variable goes out of scope. + del _dedup_db if similar: names = "; ".join(f"#{s['id']} '{s['title'][:40]}' (score {s['score']:.1f})" for s in similar) print(f"⚠️ Similar entries found: {names}", file=sys.stderr) From 645fd143fc7ab195394557c02e39e451e6299d4e Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sun, 31 May 2026 14:19:09 +0700 Subject: [PATCH 3/4] fix(#782): wrap dedupe get_db() call fail-open for SystemExit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests that call learn.main() directly without patching get_db() would hit the real get_db() → sys.exit(1) when no knowledge.db exists in CI. Wrap the dedupe DB open in try/except SystemExit so missing DB causes skip (no similar entries) rather than aborting the write. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- learn.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/learn.py b/learn.py index 43ce29d7..a9655aa9 100755 --- a/learn.py +++ b/learn.py @@ -3417,14 +3417,18 @@ def main(): print("Done.") return - # --dedupe: FTS5 BM25 pre-write similarity check + # --dedupe: FTS5 BM25 pre-write similarity check (fail-open: skip if DB unavailable) if dedupe != "off": - _dedup_db = get_db() - similar = _find_similar_entries(_dedup_db, title, content, category) - # Do not close _dedup_db explicitly — closing the connection here would - # invalidate a shared/mocked connection in tests. It will be released - # when the local variable goes out of scope. - del _dedup_db + similar: list[dict] = [] + try: + _dedup_db = get_db() + similar = _find_similar_entries(_dedup_db, title, content, category) + # Do not close _dedup_db explicitly — closing the connection here would + # invalidate a shared/mocked connection in tests. It will be released + # when the local variable goes out of scope. + del _dedup_db + except SystemExit: + pass # DB unavailable — skip check (fail-open) if similar: names = "; ".join(f"#{s['id']} '{s['title'][:40]}' (score {s['score']:.1f})" for s in similar) print(f"⚠️ Similar entries found: {names}", file=sys.stderr) From ed72f42e969be50d3af7bd7b4b01d97829439a33 Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sun, 31 May 2026 14:50:30 +0700 Subject: [PATCH 4/4] ci: re-trigger CI after cancellation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>