diff --git a/check_simhash.go b/check_simhash.go
new file mode 100644
index 0000000..43bcf3f
--- /dev/null
+++ b/check_simhash.go
@@ -0,0 +1,32 @@
+package main
+
+import (
+    "database/sql"
+    "fmt"
+    "log"
+    "os"
+    "path/filepath"
+
+    _ "modernc.org/sqlite"
+)
+
+func main() {
+    home, err := os.UserHomeDir()
+    if err != nil {
+        log.Fatal(err)
+    }
+    dbPath := filepath.Join(home, ".engram", "engram.db")
+    db, err := sql.Open("sqlite", dbPath)
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer db.Close()
+
+    var count int
+    err = db.QueryRow("SELECT COUNT(*) FROM observations WHERE simhash IS NULL OR simhash = 0").Scan(&count)
+    if err != nil {
+        log.Fatal(err)
+    }
+
+    fmt.Printf("Observations without simhash: %d\n", count)
+}
diff --git a/docs/architecture_turboquant.md b/docs/architecture_turboquant.md
new file mode 100644
index 0000000..bd519f1
--- /dev/null
+++ b/docs/architecture_turboquant.md
@@ -0,0 +1,42 @@
+# Architecture: TurboQuant + Engram Hybrid Search 🚀🧠
+
+This document describes the integration of the **TurboQuant** semantic indexing engine into the **Engram** memory server.
+
+## 1. The Problem
+Search in Engram previously relied exclusively on **SQLite FTS5**, which is excellent at exact text matches but fails badly at:
+- **Conceptual Matching**: if you search for "memory" and the record says "storage", FTS5 will not find it.
+- **Typo Tolerance**: small errors in the query can wipe out the result.
+- **Semantic Re-ranking**: there is no way to prioritize results that "mean" the same as the query but use different words.
+
+## 2. The Solution: A Hybrid Engine
+We implemented a hybrid layer that combines the power of inverted indexes (FTS5) with the efficiency of **Locality Sensitive Hashes (LSH)**.
+
+### Key Components:
+
+| Component | Role | Technology |
+| :--- | :--- | :--- |
+| **FTS5 Gatekeeper** | Fast, exact matches. | SQLite Virtual Tables |
+| **SimHash (TurboQuant)** | Generates 64-bit fingerprints of concepts. | FNV-1a Hash + Bitwise Quantization |
+| **TurboCache** | Contiguous in-memory cache for ultra-fast LSH scans. | Go Slices (Cache Locality) |
+| **Hamming Distance** | Measures semantic closeness between the query and stored memories. | Native CPU `POPCNT` |
+
+## 3. Search Flow 🧬
+
+1. **Initial Load**: when the `Store` starts, every `simhash` in the database is loaded into the in-memory `TurboCache`.
+2. **Query Sanitization**: the user's query is normalized (accents stripped, lower-cased) and its `querySimHash` is computed.
+3. **FTS5 Execution**: exact matches are fetched from the SQLite virtual table.
+4. **Semantic Expansion (TurboQuant)**:
+   - The `TurboCache` is scanned for the 10 observations with the lowest **Hamming distance**.
+   - Any result with distance `< 20` is added to the result set, even if FTS5 missed it (see the sketch below).
+5. **Re-Ranking**: results with high semantic similarity receive a *boost* that moves them up the priority list.
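+
+A minimal sketch of steps 2 and 4 using the exported helpers from `internal/store/turboquant` (the sample texts and row IDs are illustrative, and the `internal` import path only compiles from inside this module):
+
+```go
+package main
+
+import (
+    "fmt"
+
+    "github.com/Gentleman-Programming/engram/internal/store/turboquant"
+)
+
+func main() {
+    cache := turboquant.NewTurboCache()
+    // Index two memories; the second argument is the observation's row ID.
+    cache.Add(turboquant.ComputeSimHash("OAuth2 implementation using JWT tokens"), 1)
+    cache.Add(turboquant.ComputeSimHash("React application using Tailwind CSS"), 2)
+
+    // Step 2: hash the user's query the same way the content was hashed.
+    query := turboquant.ComputeSimHash("tokens JWT rs256")
+
+    // Step 4: linear LSH scan; keep anything inside the distance threshold.
+    for _, m := range cache.FindNearestN(query, 10) {
+        if m.Distance < 20 {
+            fmt.Printf("candidate id=%d distance=%d\n", m.ID, m.Distance)
+        }
+    }
+}
+```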
+
+## 4. Persistence and Compatibility 💾
+- **Schema**: a `simhash` (INTEGER) column was added to the `observations` table.
+- **Compatibility**: Go represents the 64 bits as `int64` so they stay compatible with SQLite's signed integers, avoiding crashes when the high bit is set.
+- **Migration**: the system auto-migrates existing databases by adding the column.
+
+## 5. Performance 🚀
+Because the signatures live in one contiguous in-memory array (`[]CacheEntry`), the linear scan is extremely processor-friendly (L1/L2 cache hits), comparing thousands of signatures in fractions of a millisecond with no external dependencies (such as heavyweight vector databases). A minimal version of that scan follows.
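+
+A dependency-free sketch of that scan, assuming a `CacheEntry` that mirrors the real `turboquant.CacheEntry`; the final conversion shows the `int64` storage trick from section 4 (all values are illustrative):
+
+```go
+package main
+
+import (
+    "fmt"
+    "math/bits"
+)
+
+// CacheEntry pairs a 64-bit signature with the row ID it points at.
+type CacheEntry struct {
+    Signature uint64
+    Offset    int64
+}
+
+func main() {
+    entries := []CacheEntry{ // contiguous slice → sequential, cache-friendly scan
+        {Signature: 0xDEADBEEF, Offset: 1},
+        {Signature: 0xDEADBEEA, Offset: 2},
+    }
+
+    query := uint64(0xDEADBEEB)
+    best, bestDist := int64(-1), 65 // 64 is the largest possible distance
+    for _, e := range entries {
+        // XOR + POPCNT is the entire "semantic distance" computation.
+        if d := bits.OnesCount64(e.Signature ^ query); d < bestDist {
+            best, bestDist = e.Offset, d
+        }
+    }
+    fmt.Printf("nearest id=%d distance=%d\n", best, bestDist)
+
+    // Persistence round-trip: store the unsigned signature as a SQLite INTEGER.
+    stored := int64(entries[0].Signature) // may be negative when the high bit is set
+    fmt.Println(uint64(stored) == entries[0].Signature) // lossless on the way back: true
+}
+```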
+
+---
+**Architecture Note**: this design prioritizes **Locality** and **Autonomy**. You do not need a cloud service or a 10GB model to get semantic search; you only need math and an efficient CPU.
diff --git a/internal/store/semantic_search_test.go b/internal/store/semantic_search_test.go
new file mode 100644
index 0000000..0f67808
--- /dev/null
+++ b/internal/store/semantic_search_test.go
@@ -0,0 +1,202 @@
+package store
+
+import (
+    "os"
+    "testing"
+
+    "github.com/Gentleman-Programming/engram/internal/store/turboquant"
+)
+
+func TestTurboQuant(t *testing.T) {
+    // Setup a temporary store with defaults handled
+    tmpDir, err := os.MkdirTemp("", "engram-turboquant-test-*")
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer os.RemoveAll(tmpDir)
+
+    s, err := New(Config{
+        DataDir:           tmpDir,
+        MaxSearchResults:  10,
+        MaxContextResults: 10,
+    })
+    if err != nil {
+        t.Fatalf("Failed to create store: %v", err)
+    }
+    defer s.Close()
+
+    // 0. Setup test data (Create Session)
+    err = s.CreateSession("test-session", "test-project", "/tmp/test")
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    // 1. Memory Inoculation
+    memories := []struct {
+        project string
+        title   string
+        content string
+    }{
+        {"ProjectA", "Auth Logic", "OAuth2 implementation using JWT tokens and RS256."},
+        {"ProjectB", "Auth Logic Duplicate", "OAuth2 implementation using JWT tokens and RS256."},
+        {"Global", "Database", "PostgreSQL database using GORM and migrations."},
+        {"Global", "Frontend", "React application using Tailwind CSS and Vite."},
+    }
+
+    for _, m := range memories {
+        _, err = s.AddObservation(AddObservationParams{
+            SessionID: "test-session",
+            Type:      "architecture",
+            Title:     m.title,
+            Content:   m.content,
+            Project:   m.project,
+        })
+        if err != nil {
+            t.Fatal(err)
+        }
+    }
+
+    // SUB-TEST: Semantic Precision
+    t.Run("SemanticExpansion_Precision", func(t *testing.T) {
+        query := "tokens JWT rs256"
+        results, err := s.Search(query, SearchOptions{Limit: 5})
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        t.Logf("Search results count: %d", len(results))
+        for _, res := range results {
+            project := ""
+            if res.Project != nil {
+                project = *res.Project
+            }
+            t.Logf("  - Found: %s in %s (Rank: %f)", res.Title, project, res.Rank)
+        }
+
+        found := false
+        for _, res := range results {
+            project := ""
+            if res.Project != nil {
+                project = *res.Project
+            }
+            if project == "projecta" || project == "projectb" {
+                found = true
+                dist := turboquant.HammingDistance(turboquant.ComputeSimHash(query), turboquant.BlockSignature(res.SimHash))
+                t.Logf("SUCCESS: Found concept in '%s' with distance %d", project, dist)
+            }
+        }
+
+        if !found {
+            t.Error("FAIL: Could not find semantic match for tokens and JWT")
+        }
+    })
+
+    // SUB-TEST: Metadata Filtering (the "one more test" case)
+    t.Run("MetadataFiltering", func(t *testing.T) {
+        query := "RS256 JWT tokens" // Identical semantic query
+
+        // SEARCH ONLY IN PROJECT A
+        results, err := s.Search(query, SearchOptions{Limit: 5, Project: "ProjectA"}) // Case normalization handled by Store
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        if len(results) != 1 {
+            t.Errorf("FAIL: Expected 1 result for ProjectA, but got %d", len(results))
+        } else {
+            project := ""
+            if results[0].Project != nil {
+                project = *results[0].Project
+            }
+            if project != "projecta" {
+                t.Errorf("FAIL: Expected projecta, but got %s", project)
+            } else {
+                t.Log("SUCCESS: Metadata correctly filtered conceptual matches")
+            }
+        }
+    })
+
+    // SUB-TEST: Negative Matches (Noise Exclusion)
+    t.Run("NoiseExclusion", func(t *testing.T) {
+        query := "como plantar tomates en el jardin" // "how to plant tomatoes in the garden"
+        results, err := s.Search(query, SearchOptions{Limit: 5})
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        if len(results) > 0 {
+            t.Errorf("FAIL: Irrelevant query returned %d results", len(results))
+        } else {
+            t.Log("SUCCESS: Noise query returned zero results")
+        }
+    })
+
+    // SUB-TEST: Sorting and Priority (Hamming Boost)
+    t.Run("HammingPriority", func(t *testing.T) {
+        // Insert exact match to check sorting priority over older near-matches
+        // Adding more unique words to make the semantic density very different
+        exactContent := "PostgreSQL migrations GORM. This is the exact technical stack for database handling."
+        _, err := s.AddObservation(AddObservationParams{
+            SessionID: "test-session",
+            Type:      "architecture",
+            Title:     "PreciseDB",
+            Content:   exactContent,
+            Project:   "Global",
+        })
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        query := "PostgreSQL migrations GORM"
+        results, err := s.Search(query, SearchOptions{Limit: 5, Project: "global"})
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        if len(results) < 2 {
+            t.Fatalf("FAIL: Expected at least 2 results for DB query, got %d", len(results))
+        }
+
+        // The most precise Hamming distance should be first (Rank lower is better)
+        t.Logf("Rank 1st: %s (%f), Rank 2nd: %s (%f)", results[0].Title, results[0].Rank, results[1].Title, results[1].Rank)
+        if results[0].Title != "PreciseDB" {
+            t.Errorf("FAIL: Precise match '%s' should be ranked before general match '%s'", results[0].Title, results[1].Title)
+        } else {
+            t.Log("SUCCESS: Ranking prioritized the lower Hamming distance")
+        }
+    })
+
+    // SUB-TEST: Full Reindexing (the "reindexes correctly" part)
+    t.Run("FullReindexing", func(t *testing.T) {
+        // 1. Manually corrupt SimHash in DB to simulate stale or missing hashes
+        _, err := s.db.Exec("UPDATE observations SET simhash = 0")
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        // 2. Run Reindex
+        count, err := s.ReindexTurboQuant()
+        if err != nil {
+            t.Fatalf("FAIL: Reindex failed: %v", err)
+        }
+
+        if count < 5 {
+            t.Errorf("FAIL: Reindex processed only %d observations, expected at least 5", count)
+        }
+
+        // 3.
Verify that search STILL works (meaning signatures were restored in cache) + query := "RS256 JWT tokens" + results, err := s.Search(query, SearchOptions{Limit: 1, Project: "ProjectA"}) + if err != nil { + t.Fatal(err) + } + + if len(results) == 0 { + t.Error("FAIL: Search found nothing after reindexing") + } else if results[0].Title != "Auth Logic" { + t.Errorf("FAIL: Search found wrong result '%s' after reindexing", results[0].Title) + } else { + t.Logf("SUCCESS: Reindexed %d memories and verified search accuracy", count) + } + }) +} diff --git a/internal/store/store.go b/internal/store/store.go index 670e247..902ece0 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -12,14 +12,19 @@ import ( "encoding/hex" "encoding/json" "fmt" + "log" "os" "path/filepath" "regexp" + "sort" "strconv" "strings" + "sync" "time" _ "modernc.org/sqlite" + + "github.com/Gentleman-Programming/engram/internal/store/turboquant" ) var openDB = sql.Open @@ -46,6 +51,7 @@ type Observation struct { Project *string `json:"project,omitempty"` Scope string `json:"scope"` TopicKey *string `json:"topic_key,omitempty"` + SimHash int64 `json:"simhash"` RevisionCount int `json:"revision_count"` DuplicateCount int `json:"duplicate_count"` LastSeenAt *string `json:"last_seen_at,omitempty"` @@ -85,6 +91,7 @@ type TimelineEntry struct { Project *string `json:"project,omitempty"` Scope string `json:"scope"` TopicKey *string `json:"topic_key,omitempty"` + SimHash int64 `json:"simhash"` RevisionCount int `json:"revision_count"` DuplicateCount int `json:"duplicate_count"` LastSeenAt *string `json:"last_seen_at,omitempty"` @@ -281,9 +288,13 @@ func (s *Store) MaxObservationLength() int { // ─── Store ─────────────────────────────────────────────────────────────────── type Store struct { - db *sql.DB - cfg Config - hooks storeHooks + db *sql.DB + cfg Config + hooks storeHooks + mu sync.RWMutex + cache *turboquant.TurboCache + newCacheInProgress *turboquant.TurboCache // Used to prevent race during reindex + reindexUpdatedIDs *sync.Map // Track IDs updated during reindexing } type execer interface { @@ -400,6 +411,20 @@ func New(cfg Config) (*Store, error) { return nil, fmt.Errorf("engram: create data dir: %w", err) } + // Apply sensible defaults if missing from Config struct + if cfg.MaxObservationLength <= 0 { + cfg.MaxObservationLength = 100000 + } + if cfg.MaxSearchResults <= 0 { + cfg.MaxSearchResults = 20 + } + if cfg.MaxContextResults <= 0 { + cfg.MaxContextResults = 20 + } + if cfg.DedupeWindow <= 0 { + cfg.DedupeWindow = 15 * time.Minute + } + dbPath := filepath.Join(cfg.DataDir, "engram.db") db, err := openDB("sqlite", dbPath) if err != nil { @@ -420,10 +445,20 @@ func New(cfg Config) (*Store, error) { } s := &Store{db: db, cfg: cfg, hooks: defaultStoreHooks()} + s.cache = turboquant.NewTurboCache() + s.reindexUpdatedIDs = &sync.Map{} + if err := s.migrate(); err != nil { + db.Close() return nil, fmt.Errorf("engram: migration: %w", err) } + + if err := s.loadSemanticCache(); err != nil { + db.Close() + return nil, fmt.Errorf("engram: load semantic cache: %w", err) + } if err := s.repairEnrolledProjectSyncMutations(); err != nil { + db.Close() return nil, fmt.Errorf("engram: repair enrolled sync journal: %w", err) } @@ -458,6 +493,7 @@ func (s *Store) migrate() error { project TEXT, scope TEXT NOT NULL DEFAULT 'project', topic_key TEXT, + simhash INTEGER, normalized_hash TEXT, revision_count INTEGER NOT NULL DEFAULT 1, duplicate_count INTEGER NOT NULL DEFAULT 1, @@ -548,6 +584,7 @@ func (s *Store) migrate() 
error { {name: "sync_id", definition: "TEXT"}, {name: "scope", definition: "TEXT NOT NULL DEFAULT 'project'"}, {name: "topic_key", definition: "TEXT"}, + {name: "simhash", definition: "INTEGER"}, {name: "normalized_hash", definition: "TEXT"}, {name: "revision_count", definition: "INTEGER NOT NULL DEFAULT 1"}, {name: "duplicate_count", definition: "INTEGER NOT NULL DEFAULT 1"}, @@ -616,6 +653,9 @@ func (s *Store) migrate() error { if _, err := s.execHook(s.db, `UPDATE observations SET duplicate_count = 1 WHERE duplicate_count IS NULL OR duplicate_count < 1`); err != nil { return err } + if _, err := s.execHook(s.db, `UPDATE observations SET simhash = 0 WHERE simhash IS NULL`); err != nil { + return err + } if _, err := s.execHook(s.db, `UPDATE observations SET updated_at = created_at WHERE updated_at IS NULL OR updated_at = ''`); err != nil { return err } @@ -905,7 +945,7 @@ func (s *Store) AllObservations(project, scope string, limit int) ([]Observation query := ` SELECT o.id, ifnull(o.sync_id, '') as sync_id, o.session_id, o.type, o.title, o.content, o.tool_name, o.project, - o.scope, o.topic_key, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at + o.scope, o.topic_key, ifnull(o.simhash, 0) as simhash, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at FROM observations o WHERE o.deleted_at IS NULL ` @@ -934,7 +974,7 @@ func (s *Store) SessionObservations(sessionID string, limit int) ([]Observation, query := ` SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE session_id = ? AND deleted_at IS NULL ORDER BY created_at ASC @@ -954,15 +994,17 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { content := stripPrivateTags(p.Content) if len(content) > s.cfg.MaxObservationLength { - content = content[:s.cfg.MaxObservationLength] + "... [truncated]" + content = truncate(content, s.cfg.MaxObservationLength) + "... 
[truncated]" } scope := normalizeScope(p.Scope) normHash := hashNormalized(content) + simHash := turboquant.ComputeSimHash(content) topicKey := normalizeTopicKey(p.TopicKey) var observationID int64 + var isUpdate bool + err := s.withTx(func(tx *sql.Tx) error { - var obs *Observation if topicKey != "" { var existingID int64 err := tx.QueryRow( @@ -983,6 +1025,7 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { content = ?, tool_name = ?, topic_key = ?, + simhash = ?, normalized_hash = ?, revision_count = revision_count + 1, last_seen_at = datetime('now'), @@ -993,17 +1036,20 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { content, nullableString(p.ToolName), nullableString(topicKey), + int64(simHash), normHash, existingID, ); err != nil { return err } - obs, err = s.getObservationTx(tx, existingID) + observationID = existingID + obsToSync, err := s.getObservationTx(tx, existingID) if err != nil { return err } - observationID = existingID - return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obs.SyncID, SyncOpUpsert, observationPayloadFromObservation(obs)) + isUpdate = true + + return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obsToSync.SyncID, SyncOpUpsert, observationPayloadFromObservation(obsToSync)) } if err != sql.ErrNoRows { return err @@ -1036,12 +1082,14 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { ); err != nil { return err } - obs, err = s.getObservationTx(tx, existingID) + observationID = existingID + obsToSync, err := s.getObservationTx(tx, existingID) if err != nil { return err } - observationID = existingID - return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obs.SyncID, SyncOpUpsert, observationPayloadFromObservation(obs)) + isUpdate = true + + return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obsToSync.SyncID, SyncOpUpsert, observationPayloadFromObservation(obsToSync)) } if err != sql.ErrNoRows { return err @@ -1049,10 +1097,10 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { syncID := newSyncID("obs") res, err := s.execHook(tx, - `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, scope, topic_key, normalized_hash, revision_count, duplicate_count, last_seen_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, 1, datetime('now'), datetime('now'))`, + `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, scope, topic_key, simhash, normalized_hash, revision_count, duplicate_count, last_seen_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, 1, datetime('now'), datetime('now'))`, syncID, p.SessionID, p.Type, title, content, - nullableString(p.ToolName), nullableString(p.Project), scope, nullableString(topicKey), normHash, + nullableString(p.ToolName), nullableString(p.Project), scope, nullableString(topicKey), int64(simHash), normHash, ) if err != nil { return err @@ -1061,15 +1109,36 @@ func (s *Store) AddObservation(p AddObservationParams) (int64, error) { if err != nil { return err } - obs, err = s.getObservationTx(tx, observationID) + obsToSync, err := s.getObservationTx(tx, observationID) if err != nil { return err } - return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obs.SyncID, SyncOpUpsert, observationPayloadFromObservation(obs)) + return s.enqueueSyncMutationTx(tx, SyncEntityObservation, syncID, SyncOpUpsert, observationPayloadFromObservation(obsToSync)) }) if err != nil { return 0, err } + + // Update semantic cache AFTER successful 
commit + s.mu.Lock() + if isUpdate { + s.cache.Remove(observationID) + if s.newCacheInProgress != nil { + s.newCacheInProgress.Remove(observationID) + s.reindexUpdatedIDs.Store(observationID, true) + } + } + if simHash != 0 { + s.cache.Add(turboquant.BlockSignature(simHash), observationID) + } + if s.newCacheInProgress != nil { + if simHash != 0 { + s.newCacheInProgress.Add(turboquant.BlockSignature(simHash), observationID) + } + s.reindexUpdatedIDs.Store(observationID, true) + } + s.mu.Unlock() + return observationID, nil } @@ -1083,7 +1152,7 @@ func (s *Store) RecentObservations(project, scope string, limit int) ([]Observat query := ` SELECT o.id, ifnull(o.sync_id, '') as sync_id, o.session_id, o.type, o.title, o.content, o.tool_name, o.project, - o.scope, o.topic_key, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at + o.scope, o.topic_key, ifnull(o.simhash, 0) as simhash, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at FROM observations o WHERE o.deleted_at IS NULL ` @@ -1112,7 +1181,7 @@ func (s *Store) AddPrompt(p AddPromptParams) (int64, error) { content := stripPrivateTags(p.Content) if len(content) > s.cfg.MaxObservationLength { - content = content[:s.cfg.MaxObservationLength] + "... [truncated]" + content = truncate(content, s.cfg.MaxObservationLength) + "... [truncated]" } var promptID int64 @@ -1184,6 +1253,9 @@ func (s *Store) SearchPrompts(query string, project string, limit int) ([]Prompt } ftsQuery := sanitizeFTS(query) + if ftsQuery == "" { + return nil, nil // SQL logic error: malformed MATCH expression + } sql := ` SELECT p.id, ifnull(p.sync_id, '') as sync_id, p.session_id, p.content, ifnull(p.project, '') as project, p.created_at @@ -1223,13 +1295,13 @@ func (s *Store) SearchPrompts(query string, project string, limit int) ([]Prompt func (s *Store) GetObservation(id int64) (*Observation, error) { row := s.db.QueryRow( `SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE id = ? AND deleted_at IS NULL`, id, ) var o Observation if err := row.Scan( &o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, - &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, + &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt, ); err != nil { return nil, err @@ -1239,6 +1311,7 @@ func (s *Store) GetObservation(id int64) (*Observation, error) { func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observation, error) { var updated *Observation + var simHash turboquant.BlockSignature err := s.withTx(func(tx *sql.Tx) error { obs, err := s.getObservationTx(tx, id) if err != nil { @@ -1261,7 +1334,7 @@ func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observa if p.Content != nil { content = stripPrivateTags(*p.Content) if len(content) > s.cfg.MaxObservationLength { - content = content[:s.cfg.MaxObservationLength] + "... [truncated]" + content = truncate(content, s.cfg.MaxObservationLength) + "... 
[truncated]" } } if p.Project != nil { @@ -1274,6 +1347,8 @@ func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observa topicKey = normalizeTopicKey(*p.TopicKey) } + simHash = turboquant.ComputeSimHash(content) + if _, err := s.execHook(tx, `UPDATE observations SET type = ?, @@ -1282,6 +1357,7 @@ func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observa project = ?, scope = ?, topic_key = ?, + simhash = ?, normalized_hash = ?, revision_count = revision_count + 1, updated_at = datetime('now') @@ -1292,6 +1368,7 @@ func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observa nullableString(project), scope, nullableString(topicKey), + int64(simHash), hashNormalized(content), id, ); err != nil { @@ -1307,11 +1384,30 @@ func (s *Store) UpdateObservation(id int64, p UpdateObservationParams) (*Observa if err != nil { return nil, err } + + // Update semantic cache AFTER successful commit + s.mu.Lock() + s.cache.Remove(id) + if s.newCacheInProgress != nil { + s.newCacheInProgress.Remove(id) + s.reindexUpdatedIDs.Store(id, true) + } + if simHash != 0 { + s.cache.Add(simHash, updated.ID) + } + if s.newCacheInProgress != nil { + if simHash != 0 { + s.newCacheInProgress.Add(simHash, updated.ID) + } + s.reindexUpdatedIDs.Store(id, true) + } + s.mu.Unlock() + return updated, nil } func (s *Store) DeleteObservation(id int64, hardDelete bool) error { - return s.withTx(func(tx *sql.Tx) error { + err := s.withTx(func(tx *sql.Tx) error { obs, err := s.getObservationTx(tx, id) if err == sql.ErrNoRows { return nil @@ -1340,13 +1436,30 @@ func (s *Store) DeleteObservation(id int64, hardDelete bool) error { } } - return s.enqueueSyncMutationTx(tx, SyncEntityObservation, obs.SyncID, SyncOpDelete, syncObservationPayload{ + if err := s.enqueueSyncMutationTx(tx, SyncEntityObservation, obs.SyncID, SyncOpDelete, syncObservationPayload{ SyncID: obs.SyncID, Deleted: true, DeletedAt: &deletedAt, HardDelete: hardDelete, - }) + }); err != nil { + return err + } + + return nil }) + if err != nil { + return err + } + + // Sync in-memory cache AFTER successful commit + s.mu.Lock() + s.cache.Remove(id) + if s.newCacheInProgress != nil { + s.newCacheInProgress.Remove(id) + s.reindexUpdatedIDs.Store(id, true) + } + s.mu.Unlock() + return nil } // ─── Timeline ──────────────────────────────────────────────────────────────── @@ -1381,7 +1494,7 @@ func (s *Store) Timeline(observationID int64, before, after int) (*TimelineResul // 3. Get observations BEFORE the focus (same session, older, chronological order) beforeRows, err := s.queryItHook(s.db, ` SELECT id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE session_id = ? AND id < ? 
AND deleted_at IS NULL ORDER BY id DESC @@ -1397,7 +1510,7 @@ func (s *Store) Timeline(observationID int64, before, after int) (*TimelineResul var e TimelineEntry if err := beforeRows.Scan( &e.ID, &e.SessionID, &e.Type, &e.Title, &e.Content, - &e.ToolName, &e.Project, &e.Scope, &e.TopicKey, &e.RevisionCount, &e.DuplicateCount, &e.LastSeenAt, + &e.ToolName, &e.Project, &e.Scope, &e.TopicKey, &e.SimHash, &e.RevisionCount, &e.DuplicateCount, &e.LastSeenAt, &e.CreatedAt, &e.UpdatedAt, &e.DeletedAt, ); err != nil { return nil, err @@ -1415,7 +1528,7 @@ func (s *Store) Timeline(observationID int64, before, after int) (*TimelineResul // 4. Get observations AFTER the focus (same session, newer, chronological order) afterRows, err := s.queryItHook(s.db, ` SELECT id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE session_id = ? AND id > ? AND deleted_at IS NULL ORDER BY id ASC @@ -1431,7 +1544,7 @@ func (s *Store) Timeline(observationID int64, before, after int) (*TimelineResul var e TimelineEntry if err := afterRows.Scan( &e.ID, &e.SessionID, &e.Type, &e.Title, &e.Content, - &e.ToolName, &e.Project, &e.Scope, &e.TopicKey, &e.RevisionCount, &e.DuplicateCount, &e.LastSeenAt, + &e.ToolName, &e.Project, &e.Scope, &e.TopicKey, &e.SimHash, &e.RevisionCount, &e.DuplicateCount, &e.LastSeenAt, &e.CreatedAt, &e.UpdatedAt, &e.DeletedAt, ); err != nil { return nil, err @@ -1460,9 +1573,8 @@ func (s *Store) Timeline(observationID int64, before, after int) (*TimelineResul // ─── Search (FTS5) ─────────────────────────────────────────────────────────── func (s *Store) Search(query string, opts SearchOptions) ([]SearchResult, error) { - // Normalize project filter so "Engram" finds records stored as "engram" + // 1. Setup filters and limits opts.Project, _ = NormalizeProject(opts.Project) - limit := opts.Limit if limit <= 0 { limit = 10 @@ -1471,16 +1583,20 @@ func (s *Store) Search(query string, opts SearchOptions) ([]SearchResult, error) limit = s.cfg.MaxSearchResults } - var directResults []SearchResult + // Prepare tracking for hybrid merge + seen := make(map[int64]bool) + var results []SearchResult + querySimHash := turboquant.ComputeSimHash(query) + + // 2. Direct Topic Match (if query looks like a topic key) if strings.Contains(query, "/") { tkSQL := ` SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE topic_key = ? AND deleted_at IS NULL ` tkArgs := []any{query} - if opts.Type != "" { tkSQL += " AND type = ?" tkArgs = append(tkArgs, opts.Type) @@ -1493,7 +1609,6 @@ func (s *Store) Search(query string, opts SearchOptions) ([]SearchResult, error) tkSQL += " AND scope = ?" tkArgs = append(tkArgs, normalizeScope(opts.Scope)) } - tkSQL += " ORDER BY updated_at DESC LIMIT ?" 
tkArgs = append(tkArgs, limit) @@ -1504,83 +1619,322 @@ func (s *Store) Search(query string, opts SearchOptions) ([]SearchResult, error) var sr SearchResult if err := tkRows.Scan( &sr.ID, &sr.SyncID, &sr.SessionID, &sr.Type, &sr.Title, &sr.Content, - &sr.ToolName, &sr.Project, &sr.Scope, &sr.TopicKey, &sr.RevisionCount, &sr.DuplicateCount, + &sr.ToolName, &sr.Project, &sr.Scope, &sr.TopicKey, &sr.SimHash, &sr.RevisionCount, &sr.DuplicateCount, &sr.LastSeenAt, &sr.CreatedAt, &sr.UpdatedAt, &sr.DeletedAt, ); err != nil { break } - sr.Rank = -1000 - directResults = append(directResults, sr) + sr.Rank = -1000 // Topic matches are highly prioritized + if !seen[sr.ID] { + results = append(results, sr) + seen[sr.ID] = true + } } } } - // Sanitize query for FTS5 — wrap each term in quotes to avoid syntax errors + // 3. Lexical Search (FTS5) ftsQuery := sanitizeFTS(query) - - sqlQ := ` - SELECT o.id, ifnull(o.sync_id, '') as sync_id, o.session_id, o.type, o.title, o.content, o.tool_name, o.project, - o.scope, o.topic_key, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at, + if ftsQuery != "" { + sqlQ := ` + SELECT o.id, ifnull(o.sync_id, '') as sync_id, o.session_id, o.type, o.title, o.content, o.tool_name, o.project, + o.scope, o.topic_key, ifnull(o.simhash, 0) as simhash, o.revision_count, o.duplicate_count, o.last_seen_at, o.created_at, o.updated_at, o.deleted_at, fts.rank FROM observations_fts fts - JOIN observations o ON o.id = fts.rowid - WHERE observations_fts MATCH ? AND o.deleted_at IS NULL - ` - args := []any{ftsQuery} + JOIN observations o ON o.id = fts.rowid + WHERE observations_fts MATCH ? AND o.deleted_at IS NULL + ` + args := []any{ftsQuery} + + if opts.Type != "" { + sqlQ += " AND o.type = ?" + args = append(args, opts.Type) + } + + if opts.Project != "" { + sqlQ += " AND o.project = ?" + args = append(args, opts.Project) + } + + if opts.Scope != "" { + sqlQ += " AND o.scope = ?" + args = append(args, normalizeScope(opts.Scope)) + } + + sqlQ += " ORDER BY fts.rank LIMIT ?" + args = append(args, limit) + + rows, err := s.queryItHook(s.db, sqlQ, args...) + if err == nil { + defer rows.Close() + for rows.Next() { + var sr SearchResult + if err := rows.Scan( + &sr.ID, &sr.SyncID, &sr.SessionID, &sr.Type, &sr.Title, &sr.Content, + &sr.ToolName, &sr.Project, &sr.Scope, &sr.TopicKey, &sr.SimHash, &sr.RevisionCount, &sr.DuplicateCount, + &sr.LastSeenAt, &sr.CreatedAt, &sr.UpdatedAt, &sr.DeletedAt, + &sr.Rank, + ); err != nil { + return nil, err + } + + // Calculate similarity boost using SimHash Hamming distance + distance := turboquant.HammingDistance(querySimHash, turboquant.BlockSignature(sr.SimHash)) + if distance < 12 { // Significant similarity boost + sr.Rank -= float64(12-distance) * 0.8 + } + + if !seen[sr.ID] { + results = append(results, sr) + seen[sr.ID] = true + } + } + if err := rows.Err(); err != nil { + return nil, err + } + } else { + log.Printf("engram: FTS5 search error (possible index corruption): %v", err) + return nil, err + } + } - if opts.Type != "" { - sqlQ += " AND o.type = ?" - args = append(args, opts.Type) + // 4. Semantic Expansion (TurboQuant) + s.mu.RLock() + cache := s.cache + inProgress := s.newCacheInProgress + + // Use a map to deduplicate semantic matches, preferring smaller distance. + // If the same ID exists in both caches, the inProgress one often represents + // the state during reindexing. 
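+	// Example: if the live cache reports distance 9 for an ID and the cache
+	// being rebuilt reports 7, the merge below keeps 7 (the `<=` favors the
+	// in-progress value).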
+ matchMap := make(map[int64]int) // ID -> Distance + for _, m := range cache.FindNearestN(querySimHash, 10) { + matchMap[m.ID] = m.Distance + } + if inProgress != nil { + for _, m := range inProgress.FindNearestN(querySimHash, 10) { + if dist, exists := matchMap[m.ID]; !exists || m.Distance <= dist { + matchMap[m.ID] = m.Distance + } + } } + s.mu.RUnlock() - if opts.Project != "" { - sqlQ += " AND o.project = ?" - args = append(args, opts.Project) + semanticIDs := make([]int64, 0, len(matchMap)) + idToDist := make(map[int64]int) + for id, dist := range matchMap { + if dist < 20 { + semanticIDs = append(semanticIDs, id) + idToDist[id] = dist + } } - if opts.Scope != "" { - sqlQ += " AND o.scope = ?" - args = append(args, normalizeScope(opts.Scope)) + if len(semanticIDs) > 0 { + placeholders := make([]string, len(semanticIDs)) + args := make([]any, len(semanticIDs)) + for i, id := range semanticIDs { + placeholders[i] = "?" + args[i] = id + } + + queryBatch := fmt.Sprintf(` + SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + FROM observations + WHERE id IN (%s) AND deleted_at IS NULL + `, strings.Join(placeholders, ",")) + + obsBatch, err := s.queryObservations(queryBatch, args...) + if err == nil { + for _, obs := range obsBatch { + dist := idToDist[obs.ID] + // Map the distance to a competitive negative BM25 score. + semanticRank := -15.0 + (float64(dist) * 0.5) + + if seen[obs.ID] { + // Update existing result rank if semantic match is stronger + for i := range results { + if results[i].ID == obs.ID { + if semanticRank < results[i].Rank { + results[i].Rank = semanticRank + } + break + } + } + continue + } + + if len(results) >= limit*2 { + break + } + + // Apply metadata filters (Project, Scope, Type) to semantic matches + matchProject := derefString(obs.Project) + if opts.Project != "" && matchProject != opts.Project { + continue + } + if opts.Scope != "" && obs.Scope != normalizeScope(opts.Scope) { + continue + } + if opts.Type != "" && obs.Type != opts.Type { + continue + } + + results = append(results, SearchResult{ + Observation: obs, + Rank: semanticRank, + }) + seen[obs.ID] = true + } + } } - sqlQ += " ORDER BY fts.rank LIMIT ?" - args = append(args, limit) + // ─── Post-Processing ────────────────────────────────────────────────────────── + + // MANDATORY: Sort combined results by rank (FTS5 rank: lower is better) + sort.Slice(results, func(i, j int) bool { + return results[i].Rank < results[j].Rank + }) - rows, err := s.queryItHook(s.db, sqlQ, args...) 
+ if len(results) > limit { + results = results[:limit] + } + return results, nil +} + +func (s *Store) loadSemanticCache() error { + rows, err := s.db.Query("SELECT id, simhash FROM observations WHERE deleted_at IS NULL AND simhash != 0") if err != nil { - return nil, fmt.Errorf("search: %w", err) + return err } defer rows.Close() - seen := make(map[int64]bool) - for _, dr := range directResults { - seen[dr.ID] = true + newCache := turboquant.NewTurboCache() + for rows.Next() { + var id int64 + var sig int64 + if err := rows.Scan(&id, &sig); err != nil { + return err + } + newCache.Add(turboquant.BlockSignature(sig), id) + } + + if err := rows.Err(); err != nil { + return err + } + + s.mu.Lock() + s.cache = newCache + s.mu.Unlock() + + return nil +} + +// ReindexTurboQuant recomputes SimHash for all active observations, +// updates the DB, and rebuilds the in-memory cache. +func (s *Store) ReindexTurboQuant() (int, error) { + rows, err := s.queryHook(s.db, "SELECT id, content FROM observations WHERE deleted_at IS NULL") + if err != nil { + return 0, fmt.Errorf("reindex: fetch obs: %w", err) + } + defer rows.Close() + + // Build a new cache in background to avoid search blindness + newCache := turboquant.NewTurboCache() + s.mu.Lock() + s.newCacheInProgress = newCache + s.reindexUpdatedIDs = &sync.Map{} // Reset tracking for concurrent updates + s.mu.Unlock() + defer func() { + s.mu.Lock() + s.newCacheInProgress = nil + s.mu.Unlock() + }() + + count := 0 + const batchSize = 500 + var currentBatch []struct { + id int64 + sig turboquant.BlockSignature + } + + flushBatch := func() (int, error) { + s.mu.Lock() + var filteredBatch []struct { + id int64 + sig turboquant.BlockSignature + } + for _, item := range currentBatch { + if _, updated := s.reindexUpdatedIDs.Load(item.id); !updated { + filteredBatch = append(filteredBatch, item) + } + } + s.mu.Unlock() + + if len(filteredBatch) == 0 { + return 0, nil + } + + tx, err := s.beginTxHook() + if err != nil { + return 0, err + } + defer tx.Rollback() + + for _, item := range filteredBatch { + if _, err := s.execHook(tx, "UPDATE observations SET simhash = ? WHERE id = ?", int64(item.sig), item.id); err != nil { + return 0, err + } + } + return len(filteredBatch), s.commitHook(tx) } - var results []SearchResult - results = append(results, directResults...) for rows.Next() { - var sr SearchResult - if err := rows.Scan( - &sr.ID, &sr.SyncID, &sr.SessionID, &sr.Type, &sr.Title, &sr.Content, - &sr.ToolName, &sr.Project, &sr.Scope, &sr.TopicKey, &sr.RevisionCount, &sr.DuplicateCount, - &sr.LastSeenAt, &sr.CreatedAt, &sr.UpdatedAt, &sr.DeletedAt, - &sr.Rank, - ); err != nil { - return nil, err + var id int64 + var content string + if err := rows.Scan(&id, &content); err != nil { + return 0, err } - if !seen[sr.ID] { - results = append(results, sr) + + sig := turboquant.ComputeSimHash(content) + + // Skip if this ID was updated concurrently during the reindex scan. + // We use the lock to ensure atomicity between the check and the add, + // preventing a race where an update happens between Load and Add. 
+ s.mu.Lock() + if _, updated := s.reindexUpdatedIDs.Load(id); !updated { + newCache.Add(sig, id) + currentBatch = append(currentBatch, struct { + id int64 + sig turboquant.BlockSignature + }{id, sig}) + } + s.mu.Unlock() + + if len(currentBatch) >= batchSize { + n, err := flushBatch() + if err != nil { + return count, fmt.Errorf("reindex: batch flush: %w", err) + } + count += n + currentBatch = currentBatch[:0] } } + if err := rows.Err(); err != nil { - return nil, err + return count, fmt.Errorf("reindex: rows loop: %w", err) } - if len(results) > limit { - results = results[:limit] + n, err := flushBatch() + if err != nil { + return count, fmt.Errorf("reindex: final batch flush: %w", err) } - return results, nil + count += n + + // Hot-swap the cache pointer (Atomic-safe with RWMutex) + s.mu.Lock() + s.cache = newCache + s.mu.Unlock() + return count, nil } // ─── Stats ─────────────────────────────────────────────────────────────────── @@ -1626,11 +1980,28 @@ func (s *Store) FormatContext(project, scope string) (string, error) { return "", err } + var b strings.Builder + if len(sessions) == 0 && len(observations) == 0 && len(prompts) == 0 { - return "", nil + // FALLBACK: If current project is empty, fetch general context from ALL projects + // to provide a "global" knowledge base as per the user's philosophy. + observations, _ = s.RecentObservations("", "", s.cfg.MaxContextResults) + if len(observations) == 0 { + return "", nil + } + b.WriteString("## Global Memory (Cross-Project)\n\n") + b.WriteString("No specific memories for this project yet. Showing recent global discoveries:\n\n") + for _, obs := range observations { + projectTag := "" + if obs.Project != nil && *obs.Project != "" { + projectTag = fmt.Sprintf(" [%s]", *obs.Project) + } + fmt.Fprintf(&b, "- [%s]%s **%s**: %s\n", + obs.Type, projectTag, obs.Title, truncate(obs.Content, 300)) + } + return b.String(), nil } - var b strings.Builder b.WriteString("## Memory from Previous Sessions\n\n") if len(sessions) > 0 { @@ -1696,7 +2067,7 @@ func (s *Store) Export() (*ExportData, error) { // Observations obsRows, err := s.queryItHook(s.db, `SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations ORDER BY id`, ) if err != nil { @@ -1707,7 +2078,7 @@ func (s *Store) Export() (*ExportData, error) { var o Observation if err := obsRows.Scan( &o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, - &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, + &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt, ); err != nil { return nil, err @@ -1766,8 +2137,8 @@ func (s *Store) Import(data *ExportData) (*ImportResult, error) { // Import observations (use new IDs — AUTOINCREMENT) for _, obs := range data.Observations { _, err := s.execHook(tx, - `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, scope, topic_key, normalized_hash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, 
scope, topic_key, simhash, normalized_hash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, normalizeExistingSyncID(obs.SyncID, "obs"), obs.SessionID, obs.Type, @@ -1777,6 +2148,7 @@ func (s *Store) Import(data *ExportData) (*ImportResult, error) { obs.Project, normalizeScope(obs.Scope), nullableString(normalizeTopicKey(derefString(obs.TopicKey))), + int64(obs.SimHash), hashNormalized(obs.Content), maxInt(obs.RevisionCount, 1), maxInt(obs.DuplicateCount, 1), @@ -2121,12 +2493,12 @@ func (s *Store) ApplyPulledMutation(targetKey string, mutation SyncMutation) err func (s *Store) GetObservationBySyncID(syncID string) (*Observation, error) { row := s.db.QueryRow( `SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE sync_id = ? AND deleted_at IS NULL ORDER BY id DESC LIMIT 1`, syncID, ) var o Observation - if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { + if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { return nil, err } return &o, nil @@ -2843,11 +3215,11 @@ func decodeSyncPayload(payload []byte, dest any) error { func (s *Store) getObservationTx(tx *sql.Tx, id int64) (*Observation, error) { row := tx.QueryRow( `SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE id = ? 
AND deleted_at IS NULL`, id, ) var o Observation - if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { + if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { return nil, err } return &o, nil @@ -2855,7 +3227,7 @@ func (s *Store) getObservationTx(tx *sql.Tx, id int64) (*Observation, error) { func (s *Store) getObservationBySyncIDTx(tx *sql.Tx, syncID string, includeDeleted bool) (*Observation, error) { query := `SELECT id, ifnull(sync_id, '') as sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at + scope, topic_key, ifnull(simhash, 0) as simhash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at FROM observations WHERE sync_id = ?` if !includeDeleted { query += ` AND deleted_at IS NULL` @@ -2863,7 +3235,7 @@ func (s *Store) getObservationBySyncIDTx(tx *sql.Tx, syncID string, includeDelet query += ` ORDER BY id DESC LIMIT 1` row := tx.QueryRow(query, syncID) var o Observation - if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { + if err := row.Scan(&o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt); err != nil { return nil, err } return &o, nil @@ -2901,9 +3273,9 @@ func (s *Store) applyObservationUpsertTx(tx *sql.Tx, payload syncObservationPayl existing, err := s.getObservationBySyncIDTx(tx, payload.SyncID, true) if err == sql.ErrNoRows { _, err = s.execHook(tx, - `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, scope, topic_key, normalized_hash, revision_count, duplicate_count, updated_at, deleted_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, 1, datetime('now'), NULL)`, - payload.SyncID, payload.SessionID, payload.Type, payload.Title, payload.Content, payload.ToolName, payload.Project, normalizeScope(payload.Scope), payload.TopicKey, hashNormalized(payload.Content), + `INSERT INTO observations (sync_id, session_id, type, title, content, tool_name, project, scope, topic_key, simhash, normalized_hash, revision_count, duplicate_count, updated_at, deleted_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, 1, datetime('now'), NULL)`, + payload.SyncID, payload.SessionID, payload.Type, payload.Title, payload.Content, payload.ToolName, payload.Project, normalizeScope(payload.Scope), payload.TopicKey, int64(turboquant.ComputeSimHash(payload.Content)), hashNormalized(payload.Content), ) return err } @@ -2912,9 +3284,9 @@ func (s *Store) applyObservationUpsertTx(tx *sql.Tx, payload syncObservationPayl } _, err = s.execHook(tx, `UPDATE observations - SET session_id = ?, type = ?, title = ?, content = ?, tool_name = ?, project = ?, scope = ?, topic_key = ?, normalized_hash = ?, revision_count = revision_count + 1, updated_at = datetime('now'), deleted_at = NULL + SET session_id = ?, type = ?, title 
= ?, content = ?, tool_name = ?, project = ?, scope = ?, topic_key = ?, simhash = ?, normalized_hash = ?, revision_count = revision_count + 1, updated_at = datetime('now'), deleted_at = NULL WHERE id = ?`, - payload.SessionID, payload.Type, payload.Title, payload.Content, payload.ToolName, payload.Project, normalizeScope(payload.Scope), payload.TopicKey, hashNormalized(payload.Content), existing.ID, + payload.SessionID, payload.Type, payload.Title, payload.Content, payload.ToolName, payload.Project, normalizeScope(payload.Scope), payload.TopicKey, int64(turboquant.ComputeSimHash(payload.Content)), hashNormalized(payload.Content), existing.ID, ) return err } @@ -2975,7 +3347,7 @@ func (s *Store) queryObservations(query string, args ...any) ([]Observation, err var o Observation if err := rows.Scan( &o.ID, &o.SyncID, &o.SessionID, &o.Type, &o.Title, &o.Content, - &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, + &o.ToolName, &o.Project, &o.Scope, &o.TopicKey, &o.SimHash, &o.RevisionCount, &o.DuplicateCount, &o.LastSeenAt, &o.CreatedAt, &o.UpdatedAt, &o.DeletedAt, ); err != nil { return nil, err @@ -3063,6 +3435,7 @@ func (s *Store) migrateLegacyObservationsTable() error { project TEXT, scope TEXT NOT NULL DEFAULT 'project', topic_key TEXT, + simhash INTEGER, normalized_hash TEXT, revision_count INTEGER NOT NULL DEFAULT 1, duplicate_count INTEGER NOT NULL DEFAULT 1, @@ -3079,7 +3452,7 @@ func (s *Store) migrateLegacyObservationsTable() error { if _, err := s.execHook(tx, ` INSERT INTO observations_migrated ( id, sync_id, session_id, type, title, content, tool_name, project, - scope, topic_key, normalized_hash, revision_count, duplicate_count, + scope, topic_key, simhash, normalized_hash, revision_count, duplicate_count, last_seen_at, created_at, updated_at, deleted_at ) SELECT @@ -3097,6 +3470,7 @@ func (s *Store) migrateLegacyObservationsTable() error { project, CASE WHEN scope IS NULL OR scope = '' THEN 'project' ELSE scope END, NULLIF(topic_key, ''), + 0, normalized_hash, CASE WHEN revision_count IS NULL OR revision_count < 1 THEN 1 ELSE revision_count END, CASE WHEN duplicate_count IS NULL OR duplicate_count < 1 THEN 1 ELSE duplicate_count END, @@ -3381,11 +3755,19 @@ func stripPrivateTags(s string) string { // sanitizeFTS wraps each word in quotes so FTS5 doesn't choke on special chars. // "fix auth bug" → `"fix" "auth" "bug"` +// It correctly escapes internal double quotes to prevent malformed MATCH syntax. 
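+// For example, `class="btn"` becomes `"class=""btn"""` (see TestSanitizeFTSSecurity).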
func sanitizeFTS(query string) string { words := strings.Fields(query) for i, w := range words { - // Strip existing quotes to avoid double-quoting - w = strings.Trim(w, `"`) + // Only strip wrapping quotes if they exist as a balanced pair + if len(w) >= 2 && strings.HasPrefix(w, `"`) && strings.HasSuffix(w, `"`) { + w = w[1 : len(w)-1] + } + + // Escape any remaining internal double quotes by doubling them + w = strings.ReplaceAll(w, `"`, `""`) + + // Wrap the result in quotes for FTS5 words[i] = `"` + w + `"` } return strings.Join(words, " ") diff --git a/internal/store/store_test.go b/internal/store/store_test.go index ea01f9e..77c0060 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -4439,3 +4439,112 @@ func TestCountObservationsForProject(t *testing.T) { t.Errorf("expected 0 for beta, got %d", count) } } + +func TestSearchRankingNormalization(t *testing.T) { + s := newTestStore(t) + + if err := s.CreateSession("s1", "engram", "/tmp/engram"); err != nil { + t.Fatalf("create session: %v", err) + } + + // 1. Add a Semantic result (Distance 0 -> Rank -15.0) + // We'll use the same content as the query to get Distance 0 + semanticContent := "Semantic search is extremely powerful for finding related concepts." + semanticID, err := s.AddObservation(AddObservationParams{ + SessionID: "s1", + Title: "Semantic Match", + Content: semanticContent, + Project: "engram", + }) + if err != nil { + t.Fatalf("add semantic result: %v", err) + } + + // 2. Add a weak FTS5 match (Rank around -1.0 to -2.0) + _, err = s.AddObservation(AddObservationParams{ + SessionID: "s1", + Title: "Weak match", + Content: "This just mentions search once.", + Project: "engram", + }) + if err != nil { + t.Fatalf("add weak fts result: %v", err) + } + + // Run search. The semantic match (Distance 0) should have Rank -15.0 + // Weak FTS5 usually doesn't reach -15.0. 
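+	// (The store maps distance to rank as -15.0 + 0.5*distance, so distance 0 → exactly -15.0.)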
+ results, err := s.Search(semanticContent, SearchOptions{Project: "engram", Limit: 10}) + if err != nil { + t.Fatalf("search: %v", err) + } + + if len(results) == 0 { + t.Fatalf("no results found") + } + + foundSemantic := false + for _, r := range results { + if r.ID == semanticID { + foundSemantic = true + if r.Rank >= 0 { + t.Errorf("expected negative rank for semantic match, got %f", r.Rank) + } + // Verify it's exactly -15.0 for distance 0 + if r.Rank != -15.0 { + t.Errorf("expected rank -15.0 for distance 0, got %f", r.Rank) + } + } + } + + if !foundSemantic { + t.Errorf("semantic match not found in results") + } + + // The first result should be our semantic match because -15.0 is very strong + if results[0].ID != semanticID { + t.Errorf("expected first result to be semantic match (ID %d), got ID %d with rank %f", semanticID, results[0].ID, results[0].Rank) + } +} + +func TestSanitizeFTSSecurity(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "internal quotes", + input: `class="btn"`, + expected: `"class=""btn"""`, + }, + { + name: "mixed quotes and phrases", + input: `error en "user_id"`, + expected: `"error" "en" "user_id"`, + }, + { + name: "multiple internal quotes", + input: `id="1" name="test"`, + expected: `"id=""1""" "name=""test"""`, + }, + { + name: "already quoted word", + input: `"user_id"`, + expected: `"user_id"`, + }, + { + name: "special characters", + input: `select * from table!`, + expected: `"select" "*" "from" "table!"`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := sanitizeFTS(tt.input) + if got != tt.expected { + t.Errorf("sanitizeFTS(%q) = %q, want %q", tt.input, got, tt.expected) + } + }) + } +} diff --git a/internal/store/turboquant/turboquant.go b/internal/store/turboquant/turboquant.go new file mode 100644 index 0000000..472ff12 --- /dev/null +++ b/internal/store/turboquant/turboquant.go @@ -0,0 +1,325 @@ +package turboquant + +import ( + "container/heap" + "encoding/gob" + "hash/fnv" + "math/bits" + "os" + "strings" + "sync" + "unicode" +) + +// BlockSignature represents the quantized 64-bit footprint of a context block +type BlockSignature uint64 + +type CacheEntry struct { + Signature BlockSignature + Offset int64 +} + +// TurboCache is the ultra-lightweight memory store footprint. +// We keep the hash signatures and their location offsets in contiguous memory. +type TurboCache struct { + mu sync.RWMutex + Entries []CacheEntry // Contiguous slice for CPU cache locality (LSH friendly) + IDToOffset map[int64]int // Map ID (Offset) to index in Entries for O(1) lookup +} + +// NewTurboCache creates an empty, lightweight memory index +func NewTurboCache() *TurboCache { + return &TurboCache{ + Entries: make([]CacheEntry, 0), + IDToOffset: make(map[int64]int), + } +} + +// Add safely inserts a new signature and offset into the cache. +func (tc *TurboCache) Add(sig BlockSignature, offset int64) { + tc.mu.Lock() + defer tc.mu.Unlock() + + if idx, exists := tc.IDToOffset[offset]; exists { + tc.Entries[idx].Signature = sig + return + } + + tc.IDToOffset[offset] = len(tc.Entries) + tc.Entries = append(tc.Entries, CacheEntry{Signature: sig, Offset: offset}) +} + +// Reset clears all entries from the cache. +func (tc *TurboCache) Reset() { + tc.mu.Lock() + defer tc.mu.Unlock() + tc.Entries = make([]CacheEntry, 0) + tc.IDToOffset = make(map[int64]int) +} + +// Remove deletes all entries associated with a specific offset (ID). 
+// This prevents memory leaks and stale signatures after updates/deletes. +func (tc *TurboCache) Remove(offset int64) { + tc.mu.Lock() + defer tc.mu.Unlock() + + idx, exists := tc.IDToOffset[offset] + if !exists { + return + } + + // Swap with last element for O(1) removal + lastIdx := len(tc.Entries) - 1 + if idx != lastIdx { + lastEntry := tc.Entries[lastIdx] + tc.Entries[idx] = lastEntry + tc.IDToOffset[lastEntry.Offset] = idx + } + + tc.Entries = tc.Entries[:lastIdx] + delete(tc.IDToOffset, offset) +} + +// Size returns the total number of entries in the cache. +func (tc *TurboCache) Size() int { + tc.mu.RLock() + defer tc.mu.RUnlock() + return len(tc.Entries) +} + +// GetExact performs an exact signature match. +func (tc *TurboCache) GetExact(sig BlockSignature) (int64, bool) { + tc.mu.RLock() + defer tc.mu.RUnlock() + for _, entry := range tc.Entries { + if entry.Signature == sig { + return entry.Offset, true + } + } + return -1, false +} + +// Save persists the TurboCache index to disk using gob encoding for extreme efficiency. +func (tc *TurboCache) Save(path string) error { + tc.mu.RLock() + defer tc.mu.RUnlock() + + tmpPath := path + ".tmp" + file, err := os.Create(tmpPath) + if err != nil { + return err + } + + encoder := gob.NewEncoder(file) + if err := encoder.Encode(tc.Entries); err != nil { + file.Close() + os.Remove(tmpPath) + return err + } + + // Ensure data is on disk before renaming + if err := file.Sync(); err != nil { + file.Close() + os.Remove(tmpPath) + return err + } + file.Close() + + // Atomic swap (best effort on Windows, standard on POSIX) + return os.Rename(tmpPath, path) +} + +// Load populates the TurboCache index from a file on disk. +func (tc *TurboCache) Load(path string) error { + tc.mu.Lock() + defer tc.mu.Unlock() + + file, err := os.Open(path) + if err != nil { + return err + } + + decoder := gob.NewDecoder(file) + var loaded []CacheEntry + decodeErr := decoder.Decode(&loaded) + + closeErr := file.Close() + if decodeErr != nil { + return decodeErr + } + + tc.Entries = loaded + + // Rebuild map for O(1) removals after loading + tc.IDToOffset = make(map[int64]int) + for i, entry := range tc.Entries { + tc.IDToOffset[entry.Offset] = i + } + + return closeErr +} + +var ( + diacriticReplacer = strings.NewReplacer( + "á", "a", "é", "e", "í", "i", "ó", "o", "ú", "u", + "ü", "u", "ñ", "n", + "Á", "A", "É", "E", "Í", "I", "Ó", "O", "Ú", "U", + "Ü", "U", "Ñ", "N", + ) + + // stopWords is now package-level to avoid re-allocation + stopWords = map[string]bool{ + "the": true, "and": true, "is": true, "in": true, "a": true, "to": true, "of": true, + "are": true, "with": true, "for": true, "all": true, "using": true, "use": true, + "this": true, "that": true, "from": true, "been": true, "has": true, "have": true, "will": true, + "el": true, "la": true, "y": true, "en": true, "es": true, "para": true, "por": true, + "que": true, "un": true, "una": true, "con": true, "los": true, "las": true, + "del": true, "como": true, "mas": true, "pero": true, "sus": true, "este": true, "esta": true, + } + + // tokenFunc global to avoid closure allocations and clean punctuation + tokenFunc = func(c rune) bool { + return !unicode.IsLetter(c) && !unicode.IsNumber(c) + } +) + +// normalizeText removes common diacritics for Spanish/English robustness +// without introducing external dependencies (like golang.org/x/text) +func normalizeText(text string) string { + return diacriticReplacer.Replace(text) +} + +// ComputeSimHash extracts a 64-bit locality-sensitive hash (LSH) from the 
+func ComputeSimHash(text string) BlockSignature {
+	var v [64]int // Stack allocated fixed array
+
+	// Normalize text before processing
+	cleanText := normalizeText(text)
+
+	// Precise tokenizer: split by non-alphanumeric, keep words >= 3 chars
+	words := strings.FieldsFunc(strings.ToLower(cleanText), tokenFunc)
+
+	h := fnv.New64a()
+
+	for _, word := range words {
+		if len(word) < 3 || stopWords[word] {
+			continue
+		}
+
+		// Compute the FNV-1a 64-bit hash of the word without allocating a []byte
+		h.Reset()
+		var b [1]byte
+		for i := 0; i < len(word); i++ {
+			b[0] = word[i]
+			h.Write(b[:])
+		}
+		wordHash := h.Sum64()
+
+		// Update the 64-bit vote vector
+		for i := 0; i < 64; i++ {
+			bit := (wordHash >> i) & 1
+			if bit == 1 {
+				v[i]++
+			} else {
+				v[i]--
+			}
+		}
+	}
+
+	// Reconstruct the final 64-bit signature according to SimHash logic
+	var signature uint64
+	for i := 0; i < 64; i++ {
+		if v[i] > 0 {
+			signature |= 1 << i
+		}
+	}
+
+	return BlockSignature(signature)
+}
+
+// HammingDistance calculates the distance between two signatures using native CPU population-count instructions.
+// The lower the distance, the more semantically similar the blocks are.
+func HammingDistance(a, b BlockSignature) int {
+	return bits.OnesCount64(uint64(a ^ b))
+}
+
+// FindNearest searches the cache for the signature with the lowest Hamming distance.
+func (tc *TurboCache) FindNearest(query BlockSignature) (BlockSignature, int64, int) {
+	tc.mu.RLock()
+	defer tc.mu.RUnlock()
+
+	if len(tc.Entries) == 0 {
+		return 0, -1, 64
+	}
+
+	var bestMatch BlockSignature
+	var bestOffset int64 = -1
+	minDist := 65 // Max distance is 64
+
+	// Sequential scan over a contiguous memory block keeps the L1/L2 hit rate high.
+	for _, entry := range tc.Entries {
+		dist := HammingDistance(entry.Signature, query)
+		if dist < minDist {
+			minDist = dist
+			bestMatch = entry.Signature
+			bestOffset = entry.Offset
+		}
+	}
+
+	return bestMatch, bestOffset, minDist
+}
+
+type NearestMatch struct {
+	ID       int64
+	Distance int
+}
+
+type matchHeap []NearestMatch
+
+func (h matchHeap) Len() int           { return len(h) }
+func (h matchHeap) Less(i, j int) bool { return h[i].Distance > h[j].Distance } // Max-heap: worst candidate at the root
+func (h matchHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+func (h *matchHeap) Push(x any)        { *h = append(*h, x.(NearestMatch)) }
+func (h *matchHeap) Pop() any {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+// FindNearestN searches the cache for the top N signatures with the lowest Hamming distance.
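+// A bounded max-heap of size n holds the current candidates: whenever a closer
+// signature appears, the root (the current worst candidate) is evicted, so the
+// scan stays O(len(Entries) * log n) and results come back sorted by ascending distance.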
+func (tc *TurboCache) FindNearestN(query BlockSignature, n int) []NearestMatch {
+	if n <= 0 {
+		return nil
+	}
+
+	tc.mu.RLock()
+	defer tc.mu.RUnlock()
+
+	if len(tc.Entries) == 0 {
+		return nil
+	}
+
+	h := &matchHeap{}
+	*h = make(matchHeap, 0, n)
+	heap.Init(h)
+
+	for _, entry := range tc.Entries {
+		dist := HammingDistance(entry.Signature, query)
+		if h.Len() < n {
+			heap.Push(h, NearestMatch{ID: entry.Offset, Distance: dist})
+		} else if dist < (*h)[0].Distance {
+			heap.Pop(h)
+			heap.Push(h, NearestMatch{ID: entry.Offset, Distance: dist})
+		}
+	}
+
+	results := make([]NearestMatch, h.Len())
+	for i := h.Len() - 1; i >= 0; i-- {
+		results[i] = heap.Pop(h).(NearestMatch)
+	}
+
+	return results
+}
diff --git a/internal/store/turboquant/turboquant_test.go b/internal/store/turboquant/turboquant_test.go
new file mode 100644
index 0000000..c2de6bf
--- /dev/null
+++ b/internal/store/turboquant/turboquant_test.go
@@ -0,0 +1,97 @@
+package turboquant
+
+import (
+	"path/filepath"
+	"testing"
+)
+
+func TestComputeSimHash_Normalization(t *testing.T) {
+	text1 := "¡qué bug más difícil de la base de datos!"
+	text2 := "que bug mas dificil de la base de datos" // No accents or punctuation
+
+	hash1 := ComputeSimHash(text1)
+	hash2 := ComputeSimHash(text2)
+
+	// After normalization both texts tokenize identically, so the signatures
+	// should match exactly, i.e. a Hamming distance of 0.
+	dist := HammingDistance(hash1, hash2)
+	if dist > 0 {
+		t.Errorf("Expected 0 distance for texts differing only by accents/punctuation, got %d", dist)
+	}
+}
+
+func TestTurboCache_SaveLoad(t *testing.T) {
+	cache := NewTurboCache()
+
+	dbSig := ComputeSimHash("database connection mysql query optimization")
+	cache.Add(dbSig, 404)
+
+	tmpFile := filepath.Join(t.TempDir(), "engram.tq")
+
+	err := cache.Save(tmpFile)
+	if err != nil {
+		t.Fatalf("Failed to save cache: %v", err)
+	}
+
+	// Verify we can load it into a new instance
+	newCache := NewTurboCache()
+	err = newCache.Load(tmpFile)
+	if err != nil {
+		t.Fatalf("Failed to load cache: %v", err)
+	}
+
+	if val, ok := newCache.GetExact(dbSig); !ok || val != 404 {
+		t.Errorf("Expected to load offset 404 for dbSig, got %d (ok: %t)", val, ok)
+	}
+}
+
+func TestTurboCacheNearest(t *testing.T) {
+	cache := NewTurboCache()
+
+	// Seed cache with some known semantic blocks
+	dbSig := ComputeSimHash("database connection mysql query optimization")
+	cssSig := ComputeSimHash("frontend css react tailwind flexbox padding")
+	authSig := ComputeSimHash("oauth jwt token login authentication secure")
+
+	cache.Add(dbSig, 100)
+	cache.Add(cssSig, 200)
+	cache.Add(authSig, 300)
+
+	// User query (Spanish on purpose: it exercises normalization and stop words)
+	queryText := "cómo arreglamos el problema del token de login seguro"
+	querySig := ComputeSimHash(queryText)
+
+	bestMatch, offset, _ := cache.FindNearest(querySig)
+
+	if bestMatch != authSig {
+		t.Errorf("Expected query to match auth signature. Matched signature with offset: %d", offset)
+	}
+
+	if offset != 300 {
+		t.Errorf("Expected offset 300 for auth matching, got %d", offset)
+	}
+}
+
+func BenchmarkComputeSimHash(b *testing.B) {
+	text := "this is a very standard text that simulates a reasonably sized log line or a sentence coming from an issue or commit where we want to measure the performance of sim hash directly."
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ComputeSimHash(text)
+	}
+}
+
+func BenchmarkFindNearest_1000Items(b *testing.B) {
+	cache := NewTurboCache()
+	// Populate 1000 dummy hashes
+	for i := int64(0); i < 1000; i++ {
+		// Just generate arbitrary signatures
+		cache.Add(BlockSignature(uint64(i)*1234567890123), i)
+	}
+
+	query := ComputeSimHash("another query text just to get a signature for the benchmark")
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		cache.FindNearest(query)
+	}
+}
diff --git a/scripts/backfill_projects_not_needed.go b/scripts/backfill_projects_not_needed.go
new file mode 100644
index 0000000..efda438
--- /dev/null
+++ b/scripts/backfill_projects_not_needed.go
@@ -0,0 +1,29 @@
+//go:build ignore
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+
+	"github.com/Gentleman-Programming/engram/internal/store"
+)
+
+// The project backfill requires a raw SQL UPDATE on the observations table,
+// but the store's *sql.DB handle is unexported. scripts/fix_metadata.go runs
+// that update directly through the sqlite driver instead, so this script only
+// verifies that the store opens cleanly (hence the "_not_needed" suffix).
+func main() {
+	home, _ := os.UserHomeDir()
+	dataDir := filepath.Join(home, ".engram")
+	cfg := store.FallbackConfig(dataDir)
+	s, err := store.New(cfg)
+	if err != nil {
+		log.Fatalf("engram: open store: %v", err)
+	}
+	defer s.Close()
+
+	fmt.Println("Project backfill is handled by scripts/fix_metadata.go; nothing to do here.")
+}
diff --git a/scripts/check_indexing.go b/scripts/check_indexing.go
new file mode 100644
index 0000000..3782eba
--- /dev/null
+++ b/scripts/check_indexing.go
@@ -0,0 +1,40 @@
+//go:build ignore
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+
+	"github.com/Gentleman-Programming/engram/internal/store"
+)
+
+func main() {
+	home, _ := os.UserHomeDir()
+	dataDir := filepath.Join(home, ".engram")
+	cfg := store.FallbackConfig(dataDir)
+	s, err := store.New(cfg)
+	if err != nil {
+		log.Fatalf("engram: open store: %v", err)
+	}
+	defer s.Close()
+
+	obs, err := s.AllObservations("", "", 50)
+	if err != nil {
+		log.Fatalf("engram: fetch obs: %v", err)
+	}
+
+	fmt.Printf("Total observations found: %d\n", len(obs))
+	fmt.Println("ID | Title | SimHash")
+	fmt.Println("-----------------------")
+	missing := 0
+	for _, o := range obs {
+		fmt.Printf("%d | %s | %d\n", o.ID, o.Title, o.SimHash)
+		if o.SimHash == 0 {
+			missing++
+		}
+	}
+	fmt.Printf("\nObservations missing SimHash: %d\n", missing)
+}
diff --git a/scripts/fix_metadata.go b/scripts/fix_metadata.go
new file mode 100644
index 0000000..0dfad64
--- /dev/null
+++ b/scripts/fix_metadata.go
@@ -0,0 +1,49 @@
+//go:build ignore
+
+package main
+
+import (
+	"database/sql"
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+
+	_ "modernc.org/sqlite"
+)
+
+func main() {
+	home, _ := os.UserHomeDir()
+	dbPath := filepath.Join(home, ".engram", "engram.db")
+
+	db, err := sql.Open("sqlite", dbPath)
+	if err != nil {
+		log.Fatalf("failed to open db: %v", err)
+	}
+	defer db.Close()
+
+	fmt.Println("Fixing missing projects in observations table...")
+
+	// Update observations where project is null/empty by joining with sessions
+	res, err := db.Exec(`
+		UPDATE observations
+		SET project = (SELECT project FROM sessions WHERE sessions.id = observations.session_id)
+		WHERE project IS NULL OR project = ''
+	`)
+	if err != nil {
+		log.Fatalf("failed to update projects: %v", err)
+	}
+
+	affected, _ := res.RowsAffected()
+	fmt.Printf("Populated project field for %d observations.\n", affected)
+
+	// Also ensure all observations have a simhash (safety re-check)
+	fmt.Println("Ensuring simhash is populated for all observations...")
+	// We let the app's ReindexTurboQuant handle the actual hashing if needed;
+	// here we only report how many rows still lack a signature.
+	var count int
+	if err := db.QueryRow("SELECT COUNT(*) FROM observations WHERE simhash IS NULL OR simhash = 0").Scan(&count); err != nil {
+		log.Fatalf("failed to count missing simhashes: %v", err)
+	}
+	fmt.Printf("Observations still missing simhash: %d\n", count)
+}
diff --git a/scripts/reindex.go b/scripts/reindex.go
new file mode 100644
index 0000000..988f6b2
--- /dev/null
+++ b/scripts/reindex.go
@@ -0,0 +1,30 @@
+//go:build ignore
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+
+	"github.com/Gentleman-Programming/engram/internal/store"
+)
+
+func main() {
+	home, _ := os.UserHomeDir()
+	dataDir := filepath.Join(home, ".engram")
+	cfg := store.FallbackConfig(dataDir)
+	s, err := store.New(cfg)
+	if err != nil {
+		log.Fatalf("engram: open store: %v", err)
+	}
+	defer s.Close()
+
+	fmt.Println("Running full re-index of TurboQuant...")
+	count, err := s.ReindexTurboQuant()
+	if err != nil {
+		log.Fatalf("engram: reindex failed: %v", err)
+	}
+	fmt.Printf("Successfully re-indexed %d observations.\n", count)
+}
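For reference, a minimal usage sketch of the turboquant package in isolation. This file is added purely as an illustration: the examples/turboquant_demo path, the sample texts, and the build-ignore tag are assumptions for the demo, not part of the change above.
diff --git a/examples/turboquant_demo/main.go b/examples/turboquant_demo/main.go
new file mode 100644
@@ -0,0 +1,31 @@
+//go:build ignore
+
+// Hypothetical usage sketch for the turboquant package; the file path, sample
+// texts, and output format are illustrative assumptions, not shipped code.
+package main
+
+import (
+	"fmt"
+
+	"github.com/Gentleman-Programming/engram/internal/store/turboquant"
+)
+
+func main() {
+	cache := turboquant.NewTurboCache()
+
+	// Index a few memory blocks by their 64-bit SimHash signatures.
+	blocks := map[int64]string{
+		1: "OAuth2 implementation using JWT tokens and RS256",
+		2: "PostgreSQL database using GORM and migrations",
+		3: "React application using Tailwind CSS and Vite",
+	}
+	for id, text := range blocks {
+		cache.Add(turboquant.ComputeSimHash(text), id)
+	}
+
+	// FindNearestN returns the closest blocks, ordered by ascending Hamming distance.
+	querySig := turboquant.ComputeSimHash("secure login with JWT tokens")
+	for _, m := range cache.FindNearestN(querySig, 2) {
+		fmt.Printf("block %d at distance %d\n", m.ID, m.Distance)
+	}
+}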