-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmigrate.py
More file actions
executable file
·2008 lines (1910 loc) · 87.6 KB
/
migrate.py
File metadata and controls
executable file
·2008 lines (1910 loc) · 87.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""Versioned DB migration for session-knowledge tools."""
import ast
import hashlib
import os
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
if os.name == "nt":
    # On Windows, switch stdout/stderr to UTF-8 and substitute characters that
    # cannot be encoded instead of raising while printing.
    for _s in (sys.stdout, sys.stderr):
        if not hasattr(_s, "reconfigure"):
            # Stream object does not support reconfigure(); leave it alone.
            continue
        _s.reconfigure(encoding="utf-8", errors="replace")
def _default_db_path() -> str:
return os.environ.get("SK_DB_PATH") or os.path.expanduser("~/.copilot/session-state/knowledge.db")
def _wal_connect(path: "str | Path", **kwargs) -> sqlite3.Connection:
"""Open a SQLite connection with WAL journal mode and busy timeout."""
db = sqlite3.connect(str(path), **kwargs)
db.execute("PRAGMA journal_mode=WAL")
db.execute("PRAGMA busy_timeout=5000")
return db
def _latest_declared_migration_version() -> int | None:
"""Read the local migration literal for help text without executing migrations."""
try:
tree = ast.parse(Path(__file__).read_text(encoding="utf-8"))
for node in ast.walk(tree):
if not isinstance(node, ast.Assign):
continue
if not any(isinstance(target, ast.Name) and target.id == "MIGRATIONS" for target in node.targets):
continue
migrations = ast.literal_eval(node.value)
versions = [int(item[0]) for item in migrations]
return max(versions) if versions else None
except (OSError, SyntaxError, ValueError, TypeError):
return None
return None
def _usage() -> str:
    """Build the multi-line command-line help text.

    The last line reports the highest migration version declared in this
    file, or ``unknown`` when it cannot be determined.
    """
    latest = _latest_declared_migration_version()
    if latest is None:
        latest_line = "Latest declared migration: unknown"
    else:
        latest_line = f"Latest declared migration: v{latest}"
    help_lines = (
        "Usage: python migrate.py [DB_PATH] [--backup-only] [--backup-path PATH]",
        "",
        "Run schema migrations for the session-knowledge SQLite database.",
        "If DB_PATH is omitted, SK_DB_PATH or ~/.copilot/session-state/knowledge.db is used.",
        "",
        "Options:",
        " --backup-only Copy DB_PATH to a rollback backup and exit without migrating.",
        " --backup-path PATH Destination path for --backup-only; fails if PATH exists.",
        " -h, --help Show this help and exit without touching the database.",
        "",
        latest_line,
    )
    return "\n".join(help_lines)
def _parse_cli_args(argv: list[str]) -> tuple[str, bool, str | None]:
db_path = None
backup_only = False
backup_path = None
index = 0
while index < len(argv):
arg = argv[index]
if arg in {"-h", "--help"}:
print(_usage())
raise SystemExit(0)
if arg == "--backup-only":
backup_only = True
elif arg == "--backup-path":
index += 1
if index >= len(argv):
print(" [migrate] --backup-path requires a destination path", file=sys.stderr)
raise SystemExit(2)
backup_path = argv[index]
elif arg.startswith("-"):
print(f" [migrate] Unknown option: {arg}", file=sys.stderr)
print(_usage(), file=sys.stderr)
raise SystemExit(2)
elif db_path is None:
db_path = arg
else:
print(f" [migrate] Unexpected extra argument: {arg}", file=sys.stderr)
print(_usage(), file=sys.stderr)
raise SystemExit(2)
index += 1
if backup_path and not backup_only:
print(" [migrate] --backup-path can only be used with --backup-only", file=sys.stderr)
raise SystemExit(2)
return db_path or _default_db_path(), backup_only, backup_path
def _create_backup_copy(db_path: str, backup_path: str | None = None) -> Path:
    """Copy the database to a new backup file and verify the copy.

    The copy is made with ``sqlite3.Connection.backup`` and then checked with
    ``PRAGMA quick_check``. If either step fails the destination file is
    removed before the error is re-raised.

    Args:
        db_path: Path of the database to back up (``~`` is expanded).
        backup_path: Optional destination path (``~`` is expanded). When
            omitted, a sibling file named
            ``<source name>.backup-<UTC timestamp>`` is used.

    Returns:
        The path of the verified backup file.

    Raises:
        FileNotFoundError: ``db_path`` is not an existing file.
        FileExistsError: The destination already exists.
        sqlite3.Error: Opening, copying, or verifying the backup failed.
    """
    source = Path(db_path).expanduser()
    if not source.is_file():
        raise FileNotFoundError(f"database does not exist: {source}")
    if backup_path:
        destination = Path(backup_path).expanduser()
    else:
        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S%fZ")
        destination = source.with_name(f"{source.name}.backup-{stamp}")
    if destination.exists():
        raise FileExistsError(f"backup destination already exists: {destination}")
    destination.parent.mkdir(parents=True, exist_ok=True)

    backup_error = None
    src_conn = _wal_connect(str(source))
    try:
        # Open the destination inside the try so the source connection is
        # always closed, even when the destination cannot be opened.
        dst_conn = _wal_connect(str(destination))
        try:
            src_conn.backup(dst_conn)
        finally:
            dst_conn.close()
    except sqlite3.Error as exc:
        backup_error = exc
    finally:
        src_conn.close()
    if backup_error is not None:
        # The destination did not exist before this call, so removing it
        # only discards the partial copy made here.
        destination.unlink(missing_ok=True)
        raise backup_error

    verify_error = None
    try:
        # A failure to reopen the copy is treated like a failed check so an
        # unverified backup is never left behind.
        verify_conn = _wal_connect(str(destination))
        try:
            row = verify_conn.execute("PRAGMA quick_check").fetchone()
            status = row[0] if row else "no quick_check result"
            if str(status).lower() != "ok":
                raise sqlite3.DatabaseError(f"backup quick_check returned {status!r}")
        finally:
            verify_conn.close()
    except sqlite3.Error as exc:
        verify_error = exc
    if verify_error is not None:
        destination.unlink(missing_ok=True)
        raise verify_error
    return destination
def _print_database_recovery_hint(db_path: str, error: str) -> None:
print(f" [migrate] Database check failed for {db_path}: {error}", file=sys.stderr)
print(
" [migrate] Recovery hint: restore a known-good backup, or move the database aside "
"and rerun migration to bootstrap a fresh schema. See "
"docs/RESILIENCE-RUNBOOK.md#5-database-schema-backup-and-rollback.",
file=sys.stderr,
)
def _print_database_retry_hint(db_path: str, error: str) -> None:
print(f" [migrate] Database check failed for {db_path}: {error}", file=sys.stderr)
print(
" [migrate] Recovery hint: database appears locked or busy; stop active session-knowledge "
"writers such as watch/sync processes, then retry the migration.",
file=sys.stderr,
)
def _validate_database_or_exit(db: sqlite3.Connection, db_path: str) -> None:
try:
row = db.execute("PRAGMA integrity_check").fetchone()
except sqlite3.OperationalError as exc:
db.close()
message = str(exc).lower()
if "locked" in message or "busy" in message:
_print_database_retry_hint(db_path, str(exc))
else:
_print_database_recovery_hint(db_path, str(exc))
raise SystemExit(1) from None
except sqlite3.DatabaseError as exc:
db.close()
_print_database_recovery_hint(db_path, str(exc))
raise SystemExit(1) from None
status = row[0] if row else "no integrity_check result"
if str(status).lower() != "ok":
db.close()
_print_database_recovery_hint(db_path, f"integrity_check returned {status!r}")
raise SystemExit(1)
def _normalize_title(title: str) -> str:
normalized = (title or "").strip().lower()
return re.sub(r"\s+", " ", normalized)
def _stable_sha256(*parts) -> str:
payload = "\0".join("" if p is None else str(p) for p in parts)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
def _default_local_replica_id() -> str:
    """Derive a replica identifier from the host name, user name and home directory.

    The host is read from ``HOSTNAME`` or ``COMPUTERNAME`` and the user from
    ``USER`` or ``USERNAME``; missing values become empty strings. The result
    is ``replica-`` followed by the first 16 hex characters of the stable hash.
    """
    env = os.environ
    host = env.get("HOSTNAME") or env.get("COMPUTERNAME") or ""
    user = env.get("USER") or env.get("USERNAME") or ""
    home = os.path.expanduser("~")
    digest = _stable_sha256("local-replica", host, user, home)
    return f"replica-{digest[:16]}"
def _get_local_replica_id(db: sqlite3.Connection) -> str:
for table in ("sync_state", "sync_metadata"):
try:
row = db.execute(f"SELECT value FROM {table} WHERE key='local_replica_id'").fetchone()
except sqlite3.OperationalError:
continue
current = str(row[0]).strip() if row and row[0] else ""
if current and current != "local":
return current
replica_id = _default_local_replica_id()
for table in ("sync_state", "sync_metadata"):
try:
db.execute(
f"""
INSERT INTO {table} (key, value)
VALUES ('local_replica_id', ?)
ON CONFLICT(key) DO UPDATE SET
value = excluded.value,
updated_at = datetime('now')
""",
(replica_id,),
)
except sqlite3.OperationalError:
pass
return replica_id or "local"
def _normalize_search_feedback_origin(origin_replica_id: str, local_replica_id: str) -> str:
origin = (origin_replica_id or "").strip()
if not origin or origin == "local":
return local_replica_id or "local"
return origin
def _seed_sync_table_policies(db: sqlite3.Connection):
db.executescript("""
CREATE TABLE IF NOT EXISTS sync_metadata (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS sync_state (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS sync_txns (
txn_id TEXT PRIMARY KEY,
replica_id TEXT NOT NULL,
status TEXT NOT NULL CHECK(status IN ('pending', 'committed', 'failed')),
created_at TEXT NOT NULL,
committed_at TEXT DEFAULT ''
);
CREATE TABLE IF NOT EXISTS sync_ops (
id INTEGER PRIMARY KEY AUTOINCREMENT,
txn_id TEXT NOT NULL,
table_name TEXT NOT NULL,
op_type TEXT NOT NULL CHECK(op_type IN ('insert', 'update', 'delete', 'upsert')),
row_stable_id TEXT NOT NULL,
row_payload TEXT NOT NULL,
op_index INTEGER NOT NULL,
created_at TEXT NOT NULL,
UNIQUE(txn_id, op_index)
);
CREATE INDEX IF NOT EXISTS idx_sync_ops_txn ON sync_ops(txn_id);
CREATE INDEX IF NOT EXISTS idx_sync_ops_table_row ON sync_ops(table_name, row_stable_id);
CREATE TABLE IF NOT EXISTS sync_cursors (
replica_id TEXT PRIMARY KEY,
last_txn_id TEXT DEFAULT '',
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS sync_failures (
id INTEGER PRIMARY KEY AUTOINCREMENT,
txn_id TEXT DEFAULT '',
table_name TEXT DEFAULT '',
row_stable_id TEXT DEFAULT '',
error_code TEXT DEFAULT '',
error_message TEXT DEFAULT '',
failed_at TEXT NOT NULL,
retry_count INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_sync_failures_txn ON sync_failures(txn_id);
""")
policy_sql = db.execute(
"SELECT sql FROM sqlite_master WHERE type='table' AND name='sync_table_policies'"
).fetchone()
needs_rebuild = policy_sql and "upload_only" not in (policy_sql[0] or "")
if needs_rebuild:
db.executescript("""
CREATE TABLE sync_table_policies_new (
table_name TEXT PRIMARY KEY,
sync_scope TEXT NOT NULL CHECK(sync_scope IN ('canonical', 'local_only', 'upload_only')),
stable_id_column TEXT DEFAULT ''
);
INSERT INTO sync_table_policies_new (table_name, sync_scope, stable_id_column)
SELECT table_name, sync_scope, COALESCE(stable_id_column, '')
FROM sync_table_policies;
DROP TABLE sync_table_policies;
ALTER TABLE sync_table_policies_new RENAME TO sync_table_policies;
""")
else:
db.executescript("""
CREATE TABLE IF NOT EXISTS sync_table_policies (
table_name TEXT PRIMARY KEY,
sync_scope TEXT NOT NULL CHECK(sync_scope IN ('canonical', 'local_only', 'upload_only')),
stable_id_column TEXT DEFAULT ''
);
""")
policies = [
("sessions", "canonical", "id"),
("documents", "canonical", "stable_id"),
("sections", "canonical", "stable_id"),
("knowledge_entries", "canonical", "stable_id"),
("knowledge_relations", "canonical", "stable_id"),
("entity_relations", "canonical", "stable_id"),
("search_feedback", "canonical", "stable_id"),
("recall_events", "upload_only", ""),
("entry_recall_stats", "upload_only", ""),
("entry_recall_day_log", "upload_only", ""),
("entry_recall_query_log", "upload_only", ""),
("knowledge_fts", "local_only", ""),
("ke_fts", "local_only", ""),
("sessions_fts", "local_only", ""),
("event_offsets", "local_only", ""),
("embeddings", "local_only", ""),
("embedding_meta", "local_only", ""),
("tfidf_model", "local_only", ""),
("entry_concept_tags", "local_only", ""),
("entry_dream_scores", "local_only", ""),
("file_annotations", "local_only", ""),
("project_registry", "canonical", "project_id"),
("code_index", "local_only", ""),
("code_fts", "local_only", ""),
]
db.executemany(
"""
INSERT INTO sync_table_policies (table_name, sync_scope, stable_id_column)
VALUES (?, ?, ?)
ON CONFLICT(table_name) DO UPDATE SET
sync_scope = excluded.sync_scope,
stable_id_column = excluded.stable_id_column
""",
policies,
)
db.execute("""
INSERT OR IGNORE INTO sync_metadata (key, value)
VALUES ('local_replica_id', 'local')
""")
db.execute("""
INSERT OR IGNORE INTO sync_state (key, value)
VALUES ('local_replica_id', 'local')
""")
# Number of rows pulled per fetchmany() call while backfilling stable IDs.
_BACKFILL_BATCH_SIZE = 1000
# ── Issue #357: batch FTS rebuild ────────────────────────────────────────────
# Number of rows copied per executemany() batch when ke_fts is rebuilt.
_FTS_REBUILD_BATCH_SIZE = 500
# ── Issue #358: cache FTS schema detection ───────────────────────────────────
# Bump this string whenever the ke_fts DDL changes so old caches are invalidated.
_KE_FTS_SCHEMA_VERSION = "v3-porter"
# wakeup_config key under which the cached ke_fts schema version is stored.
_KE_FTS_SCHEMA_VERSION_KEY = "ke_fts_schema_version"
def _get_cached_ke_fts_version(db: sqlite3.Connection) -> str:
    """Return the cached ke_fts schema version stored in wakeup_config, or ''.

    An empty string is also returned when the lookup raises
    ``sqlite3.OperationalError`` (for example, the table does not exist).
    """
    try:
        cached = db.execute(
            "SELECT value FROM wakeup_config WHERE key=?",
            (_KE_FTS_SCHEMA_VERSION_KEY,),
        ).fetchone()
    except sqlite3.OperationalError:
        return ""
    if not cached:
        return ""
    return str(cached[0])
def _set_cached_ke_fts_version(db: sqlite3.Connection, version: str) -> None:
    """Persist the ke_fts schema version in wakeup_config for future cache hits.

    The row is upserted by key. A ``sqlite3.OperationalError`` is swallowed,
    so a failed write only costs the cache hit.
    """
    upsert_sql = """
        INSERT INTO wakeup_config (key, value) VALUES (?, ?)
        ON CONFLICT(key) DO UPDATE SET value = excluded.value,
        updated_at = datetime('now')
        """
    params = (_KE_FTS_SCHEMA_VERSION_KEY, version)
    try:
        db.execute(upsert_sql, params)
    except sqlite3.OperationalError:
        # Best effort only.
        pass
def _ke_fts_needs_rebuild(db: sqlite3.Connection) -> bool:
    """Return True when ke_fts must be rebuilt.

    Fast path (#358): if wakeup_config records the current schema version,
    the sqlite_master query is skipped and False is returned.

    Otherwise the stored ke_fts DDL is inspected. A rebuild is needed when it
    lacks any of ``wing``, ``facts``, ``error_type``, ``root_cause`` or
    ``porter`` (issue #373). False is returned when ke_fts does not exist.
    """
    if _get_cached_ke_fts_version(db) == _KE_FTS_SCHEMA_VERSION:
        return False
    fts_row = db.execute("SELECT sql FROM sqlite_master WHERE name='ke_fts'").fetchone()
    if not fts_row:
        return False
    fts_def = fts_row[0] or ""
    required_fragments = ("wing", "facts", "error_type", "root_cause", "porter")
    return not all(fragment in fts_def for fragment in required_fragments)
def _rebuild_ke_fts_batched(db: sqlite3.Connection, new_ddl: str) -> None:
    """Rebuild ke_fts from knowledge_entries using batched inserts (#357).

    The replacement is built as ``ke_fts_new`` (the first ``ke_fts`` in
    ``new_ddl`` is substituted), filled in batches of
    ``_FTS_REBUILD_BATCH_SIZE`` rows, then swapped in by dropping ``ke_fts``
    and renaming. Everything runs inside ``BEGIN EXCLUSIVE`` … ``COMMIT`` so
    the drop and rename happen in one transaction.

    Nothing is done when ``knowledge_entries`` does not exist. On any error a
    ROLLBACK and a cleanup drop of ``ke_fts_new`` are attempted (failures of
    either are ignored) and the original exception is re-raised.
    """
    source_present = db.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='knowledge_entries'"
    ).fetchone()
    if source_present is None:
        return

    select_sql = """
        SELECT id, title, content, tags, category,
        COALESCE(wing,''), COALESCE(room,''),
        COALESCE(facts,'[]'), COALESCE(error_type,''),
        COALESCE(root_cause,'')
        FROM knowledge_entries
        """
    insert_sql = """
        INSERT INTO ke_fts_new(rowid, title, content, tags, category,
        wing, room, facts, error_type, root_cause)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """

    db.execute("BEGIN EXCLUSIVE")
    try:
        db.execute("DROP TABLE IF EXISTS ke_fts_new")
        db.execute(new_ddl.replace("ke_fts", "ke_fts_new", 1))
        # Batched INSERT (#357): fetchmany() returns [] once exhausted.
        reader = db.execute(select_sql)
        for batch in iter(lambda: reader.fetchmany(_FTS_REBUILD_BATCH_SIZE), []):
            db.executemany(insert_sql, batch)
        db.execute("DROP TABLE IF EXISTS ke_fts")
        db.execute("ALTER TABLE ke_fts_new RENAME TO ke_fts")
        db.execute("COMMIT")
    except Exception:
        for cleanup_sql in ("ROLLBACK", "DROP TABLE IF EXISTS ke_fts_new"):
            try:
                db.execute(cleanup_sql)
            except Exception:
                pass
        raise
# ── Issue #392: chunked WAL checkpoint scheduling ────────────────────────────
def _wal_frame_count(wal_path: str, page_size: int) -> int:
"""Return the number of frames in a WAL file based on its size.
WAL layout: 32-byte file header followed by frames of (24-byte frame
header + page_size bytes). Returns 0 if the file does not exist, is
empty, or is smaller than the WAL header.
"""
try:
size = os.path.getsize(wal_path)
except OSError:
return 0
if size <= 32 or page_size <= 0:
return 0
return (size - 32) // (24 + page_size)
def schedule_wal_checkpoint(db: sqlite3.Connection, threshold_pages: int = 1000) -> bool:
    """Run a PASSIVE WAL checkpoint once the WAL has reached ``threshold_pages`` frames.

    The size gate is evaluated before ``PRAGMA wal_checkpoint`` is issued, so
    callers do not pay for a checkpoint while the WAL is still small. The
    gate is skipped when ``threshold_pages`` is not positive or the main
    database has no file path.

    Returns:
        True if a checkpoint was issued; False if the journal mode is not
        WAL, the WAL is below the threshold, or an
        ``sqlite3.OperationalError`` occurred.
    """
    try:
        mode = db.execute("PRAGMA journal_mode").fetchone()
        if not mode or str(mode[0]).lower() != "wal":
            return False

        if threshold_pages > 0:
            # Locate the file behind the "main" schema; rows are
            # (seq, name, file) and the file is empty for in-memory DBs.
            main_file = None
            for entry in db.execute("PRAGMA database_list").fetchall():
                if entry[1] == "main" and entry[2]:
                    main_file = entry[2]
                    break
            if main_file:
                size_row = db.execute("PRAGMA page_size").fetchone()
                page_size = size_row[0] if size_row else 4096
                frames = _wal_frame_count(main_file + "-wal", page_size)
                if frames < threshold_pages:
                    return False

        db.execute("PRAGMA wal_checkpoint(PASSIVE)")
        return True
    except sqlite3.OperationalError:
        return False
# ── Issue #370: embedding dimension mismatch detection ───────────────────────
def detect_embedding_dimension_mismatch(db: sqlite3.Connection) -> list[dict]:
    """Report stored embeddings whose vector dimensions are inconsistent.

    Rows of ``embeddings`` are grouped by ``(source_type, provider, model)``.
    Two kinds of record can be produced:

    * ``issue == "mixed_dimensions"``: the group holds more than one distinct
      dimension; ``stored_dimensions`` is the list of dimensions seen.
    * ``issue == "dimension_config_mismatch"``: a stored dimension differs
      from ``embedding_meta``'s ``configured_dimensions``;
      ``stored_dimensions`` is that single int and ``configured_dimensions``
      is included. One record per differing dimension, in ascending order.

    Every record also has ``source_type``, ``provider`` and ``model``.

    Rows whose ``dimensions`` value is NULL or not an integer are ignored, as
    is a ``configured_dimensions`` value that is not an integer. This is a
    diagnostic helper and must not raise on malformed data.

    Returns:
        The mismatch records; empty when nothing is wrong, when the
        ``embeddings`` table is absent, or when a query raises
        ``sqlite3.OperationalError``.
    """
    mismatches: list[dict] = []
    try:
        has_emb = db.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name='embeddings'").fetchone()
        if not has_emb:
            return mismatches
        rows = db.execute(
            """
            SELECT source_type, provider, model, dimensions, COUNT(*) as n
            FROM embeddings
            GROUP BY source_type, provider, model, dimensions
            """
        ).fetchall()

        dim_map: dict[tuple, list[int]] = {}
        for source_type, provider, model, dimensions, _count in rows:
            try:
                dim = int(dimensions)
            except (TypeError, ValueError):
                # NULL or non-numeric dimension: nothing to compare.
                continue
            dim_map.setdefault((str(source_type), str(provider), str(model)), []).append(dim)

        for (source_type, provider, model), dims in dim_map.items():
            if len(set(dims)) > 1:
                mismatches.append(
                    {
                        "source_type": source_type,
                        "provider": provider,
                        "model": model,
                        "stored_dimensions": dims,
                        "issue": "mixed_dimensions",
                    }
                )

        # Also compare against the configured dimension, when one is recorded.
        configured = None
        try:
            meta_row = db.execute("SELECT value FROM embedding_meta WHERE key='configured_dimensions'").fetchone()
        except sqlite3.OperationalError:
            meta_row = None
        if meta_row and meta_row[0]:
            try:
                configured = int(meta_row[0])
            except (TypeError, ValueError):
                configured = None
        if configured is not None:
            for (source_type, provider, model), dims in dim_map.items():
                for dim in sorted(set(dims)):
                    if dim == configured:
                        continue
                    mismatches.append(
                        {
                            "source_type": source_type,
                            "provider": provider,
                            "model": model,
                            "stored_dimensions": dim,
                            "configured_dimensions": configured,
                            "issue": "dimension_config_mismatch",
                        }
                    )
    except sqlite3.OperationalError:
        pass
    return mismatches
def _backfill_stable_ids(db: sqlite3.Connection):
has_table = lambda t: (
db.execute(
"SELECT 1 FROM sqlite_master WHERE type='table' AND name=?",
(t,),
).fetchone()
is not None
)
if has_table("documents"):
_cur = db.execute("""
SELECT id, session_id, doc_type, seq, title, COALESCE(stable_id, '')
FROM documents
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
did, session_id, doc_type, seq, title, existing = row
stable = _stable_sha256("document", session_id, doc_type, int(seq or 0), _normalize_title(title))
if existing != stable:
db.execute("UPDATE documents SET stable_id = ? WHERE id = ?", (stable, did))
if has_table("sections") and has_table("documents"):
_cur = db.execute("""
SELECT s.id, d.stable_id, s.section_name, COALESCE(s.stable_id, '')
FROM sections s
JOIN documents d ON s.document_id = d.id
WHERE COALESCE(d.stable_id, '') != ''
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
sid, document_stable_id, section_name, existing = row
stable = _stable_sha256("section", document_stable_id, section_name or "")
if existing != stable:
db.execute("UPDATE sections SET stable_id = ? WHERE id = ?", (stable, sid))
if has_table("knowledge_entries"):
_cur = db.execute("""
SELECT id, session_id, category, title, COALESCE(topic_key, ''), COALESCE(stable_id, '')
FROM knowledge_entries
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
kid, session_id, category, title, topic_key, existing = row
stable = _stable_sha256("knowledge", session_id, category, title or "", topic_key)
if existing != stable:
db.execute("UPDATE knowledge_entries SET stable_id = ? WHERE id = ?", (stable, kid))
if has_table("knowledge_relations") and has_table("knowledge_entries"):
_cur = db.execute("""
SELECT kr.id,
kr.source_id,
kr.target_id,
kr.relation_type,
COALESCE(kr.source_stable_id, ''),
COALESCE(kr.target_stable_id, ''),
COALESCE(kr.stable_id, ''),
COALESCE(s.stable_id, ''),
COALESCE(t.stable_id, '')
FROM knowledge_relations kr
LEFT JOIN knowledge_entries s ON kr.source_id = s.id
LEFT JOIN knowledge_entries t ON kr.target_id = t.id
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
kr_id, _, _, relation_type, src_existing, tgt_existing, existing, src_sid, tgt_sid = row
if not src_sid or not tgt_sid:
continue
stable = _stable_sha256("knowledge_relation", src_sid, tgt_sid, relation_type or "")
if src_existing != src_sid or tgt_existing != tgt_sid or existing != stable:
db.execute(
"""
UPDATE knowledge_relations
SET source_stable_id = ?, target_stable_id = ?, stable_id = ?
WHERE id = ?
""",
(src_sid, tgt_sid, stable, kr_id),
)
if has_table("entity_relations"):
_cur = db.execute("""
SELECT id, subject, predicate, object, COALESCE(stable_id, '')
FROM entity_relations
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
er_id, subject, predicate, obj, existing = row
stable = _stable_sha256("entity_relation", subject or "", predicate or "", obj or "")
if existing != stable:
db.execute("UPDATE entity_relations SET stable_id = ? WHERE id = ?", (stable, er_id))
if has_table("search_feedback"):
local_replica_id = _get_local_replica_id(db)
_cur = db.execute("""
SELECT id, created_at, result_kind, result_id, verdict, query,
COALESCE(origin_replica_id, ''), COALESCE(stable_id, '')
FROM search_feedback
""")
while True:
_batch = _cur.fetchmany(_BACKFILL_BATCH_SIZE)
if not _batch:
break
for row in _batch:
sf_id, created_at, result_kind, result_id, verdict, query, origin_replica_id, existing = row
origin = _normalize_search_feedback_origin(origin_replica_id, local_replica_id)
stable = _stable_sha256(
"search_feedback",
created_at or "",
result_kind or "",
result_id or "",
verdict if verdict is not None else "",
query or "",
origin,
)
if existing != stable or origin_replica_id != origin:
db.execute(
"""
UPDATE search_feedback
SET origin_replica_id = ?, stable_id = ?
WHERE id = ?
""",
(origin, stable, sf_id),
)
def _dedupe_stable_rows(db: sqlite3.Connection, table: str):
if table not in {
"documents",
"sections",
"knowledge_entries",
"knowledge_relations",
"entity_relations",
"search_feedback",
}:
return
db.execute(f"""
DELETE FROM {table}
WHERE id IN (
SELECT dupe.id
FROM {table} dupe
JOIN (
SELECT stable_id, MIN(id) AS keep_id
FROM {table}
WHERE COALESCE(stable_id, '') != ''
GROUP BY stable_id
HAVING COUNT(*) > 1
) grouped ON grouped.stable_id = dupe.stable_id
WHERE dupe.id != grouped.keep_id
)
""")
def _enforce_stable_id_uniqueness(db: sqlite3.Connection):
has_table = lambda t: (
db.execute(
"SELECT 1 FROM sqlite_master WHERE type='table' AND name=?",
(t,),
).fetchone()
is not None
)
index_specs = [
("documents", "uq_documents_stable_id"),
("sections", "uq_sections_stable_id"),
("knowledge_entries", "uq_knowledge_entries_stable_id"),
("knowledge_relations", "uq_knowledge_relations_stable_id"),
("entity_relations", "uq_entity_relations_stable_id"),
("search_feedback", "uq_search_feedback_stable_id"),
]
for table, index_name in index_specs:
if not has_table(table):
continue
_dedupe_stable_rows(db, table)
db.execute(f"CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table}(stable_id)")
def _repair_legacy_priority_collision(db: sqlite3.Connection):
legacy_row = db.execute("SELECT 1 FROM schema_version WHERE version=22 AND name='file_annotations'").fetchone()
if not legacy_row:
return False, False
db.execute("SAVEPOINT repair_priority")
try:
ke_cols = {row[1] for row in db.execute("PRAGMA table_info(knowledge_entries)").fetchall()}
repaired = False
if "priority" not in ke_cols:
for repair_sql in (
"ALTER TABLE knowledge_entries ADD COLUMN priority TEXT DEFAULT 'P2'",
"CREATE INDEX IF NOT EXISTS idx_ke_priority ON knowledge_entries(priority)",
):
try:
db.execute(repair_sql)
except sqlite3.OperationalError as e:
if "duplicate" in str(e).lower() or "already exists" in str(e).lower():
pass
else:
raise
repaired = True
renamed = (
db.execute(
"UPDATE schema_version SET name='priority' WHERE version=22 AND name='file_annotations'"
).rowcount
> 0
)
db.execute("RELEASE SAVEPOINT repair_priority")
if repaired or renamed:
db.commit()
return repaired, renamed
except Exception:
try:
db.execute("ROLLBACK TO SAVEPOINT repair_priority")
db.execute("RELEASE SAVEPOINT repair_priority")
except Exception:
pass
raise
def _ensure_base_schema(db: sqlite3.Connection):
"""Bootstrap the full base schema for a brand-new knowledge DB.
Fresh project-local DBs do not have an old migration history to upgrade from, so
they need the current base tables before the versioned ALTER/CREATE steps run.
Existing databases are unaffected because every statement is idempotent.
"""
db.executescript("""
CREATE TABLE IF NOT EXISTS schema_version (
version INTEGER PRIMARY KEY,
migrated_at TEXT DEFAULT (datetime('now')),
name TEXT DEFAULT ''
);
CREATE TABLE IF NOT EXISTS sessions (
id TEXT PRIMARY KEY,
path TEXT NOT NULL,
summary TEXT DEFAULT '',
total_checkpoints INTEGER DEFAULT 0,
total_research INTEGER DEFAULT 0,
total_files INTEGER DEFAULT 0,
has_plan INTEGER DEFAULT 0,
source TEXT DEFAULT 'copilot',
indexed_at TEXT,
file_mtime REAL,
indexed_at_r REAL,
fts_indexed_at REAL,
event_count_estimate INTEGER DEFAULT 0,
file_size_bytes INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id TEXT NOT NULL,
doc_type TEXT NOT NULL,
seq INTEGER DEFAULT 0,
title TEXT NOT NULL,
stable_id TEXT,
file_path TEXT NOT NULL UNIQUE,
file_hash TEXT,
size_bytes INTEGER DEFAULT 0,
content_preview TEXT DEFAULT '',
source TEXT DEFAULT 'copilot',
indexed_at TEXT
);
CREATE TABLE IF NOT EXISTS sections (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER NOT NULL,
section_name TEXT NOT NULL,
stable_id TEXT,
content TEXT NOT NULL,
UNIQUE(document_id, section_name)
);
CREATE VIRTUAL TABLE IF NOT EXISTS knowledge_fts USING fts5(
title,
section_name,
content,
doc_type,
session_id UNINDEXED,
document_id UNINDEXED,
tokenize='unicode61 remove_diacritics 2'
);
CREATE VIRTUAL TABLE IF NOT EXISTS sessions_fts USING fts5(
session_id UNINDEXED,
title,
user_messages,
assistant_messages,
tool_names,
tokenize='porter unicode61 remove_diacritics 2'
);
CREATE TABLE IF NOT EXISTS event_offsets (
session_id TEXT NOT NULL,
event_id INTEGER NOT NULL,
byte_offset INTEGER NOT NULL,
file_mtime REAL NOT NULL,
PRIMARY KEY (session_id, event_id)
);
CREATE TABLE IF NOT EXISTS knowledge_entries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id TEXT NOT NULL,
document_id INTEGER,
category TEXT NOT NULL,
title TEXT NOT NULL,
stable_id TEXT,
content TEXT NOT NULL,
tags TEXT DEFAULT '',
confidence REAL DEFAULT 1.0,
occurrence_count INTEGER DEFAULT 1,
first_seen TEXT,
last_seen TEXT,
source TEXT DEFAULT 'copilot',
topic_key TEXT,
revision_count INTEGER DEFAULT 1,
content_hash TEXT,
wing TEXT DEFAULT '',
room TEXT DEFAULT '',
facts TEXT DEFAULT '[]',
est_tokens INTEGER DEFAULT 0,
task_id TEXT DEFAULT '',
affected_files TEXT DEFAULT '[]',
source_section TEXT DEFAULT '',
source_file TEXT DEFAULT '',
start_line INTEGER DEFAULT 0,
end_line INTEGER DEFAULT 0,
code_language TEXT DEFAULT '',
code_snippet TEXT DEFAULT '',
error_type TEXT DEFAULT '',
root_cause TEXT DEFAULT '',
severity TEXT DEFAULT 'medium',
is_resolved INTEGER DEFAULT 0,
fix_steps TEXT DEFAULT '',
prevention_hook TEXT DEFAULT '',
recurrence_after_briefing INTEGER DEFAULT 0,
valence TEXT DEFAULT '',
intensity REAL DEFAULT 0.5,
priority TEXT DEFAULT 'P2',
project_id TEXT DEFAULT '',
UNIQUE(category, title, session_id)
);
CREATE TABLE IF NOT EXISTS knowledge_relations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER REFERENCES knowledge_entries(id),
target_id INTEGER REFERENCES knowledge_entries(id),
source_stable_id TEXT DEFAULT '',
target_stable_id TEXT DEFAULT '',
relation_type TEXT NOT NULL,
stable_id TEXT,
confidence REAL DEFAULT 0.8,
created_at TEXT,
UNIQUE(source_id, target_id, relation_type)
);
CREATE TABLE IF NOT EXISTS entity_relations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
subject TEXT NOT NULL,
predicate TEXT NOT NULL,
object TEXT NOT NULL,
stable_id TEXT,
noted_at TEXT DEFAULT (datetime('now')),
session_id TEXT DEFAULT '',
UNIQUE(subject, predicate, object)
);
CREATE TABLE IF NOT EXISTS search_feedback (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query TEXT,
result_id TEXT,
result_kind TEXT,
verdict INTEGER NOT NULL CHECK(verdict IN (-1,0,1)),
comment TEXT,
user_agent TEXT,
created_at TEXT NOT NULL,
origin_replica_id TEXT DEFAULT 'local',
stable_id TEXT
);
CREATE TABLE IF NOT EXISTS wakeup_config (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE VIRTUAL TABLE IF NOT EXISTS ke_fts USING fts5(
title,
content,
tags,
category,
wing,
room,
facts,
error_type,
root_cause,
tokenize='porter unicode61 remove_diacritics 2'
);
CREATE TABLE IF NOT EXISTS project_registry (
id INTEGER PRIMARY KEY AUTOINCREMENT,
project_id TEXT NOT NULL UNIQUE,
display_name TEXT NOT NULL DEFAULT '',
repo_root TEXT NOT NULL DEFAULT '',
description TEXT NOT NULL DEFAULT '',