Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions check_simhash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package main

import (
"database/sql"
"fmt"
"log"
"os"
"path/filepath"

_ "modernc.org/sqlite"
)

// main prints how many rows of the observations table in
// ~/.engram/engram.db have a NULL or zero simhash column.
func main() {
	home, err := os.UserHomeDir()
	if err != nil {
		// With an empty home the joined path would silently become the
		// relative ".engram/engram.db", so stop here instead of querying
		// (or creating) the wrong database.
		log.Fatalf("resolving home directory: %v", err)
	}
	dbPath := filepath.Join(home, ".engram", "engram.db")

	count, err := countMissingSimhash(dbPath)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("Observations without simhash: %d\n", count)
}

// countMissingSimhash opens the SQLite database at dbPath and returns the
// number of observations whose simhash is NULL or 0. Keeping the database
// handle inside this helper lets the deferred Close run before main decides
// to exit via log.Fatal.
func countMissingSimhash(dbPath string) (int, error) {
	db, err := sql.Open("sqlite", dbPath)
	if err != nil {
		return 0, fmt.Errorf("opening %q: %w", dbPath, err)
	}
	defer db.Close()

	const query = "SELECT COUNT(*) FROM observations WHERE simhash IS NULL OR simhash = 0"

	var count int
	if err := db.QueryRow(query).Scan(&count); err != nil {
		return 0, fmt.Errorf("counting observations without simhash in %q: %w", dbPath, err)
	}
	return count, nil
}
42 changes: 42 additions & 0 deletions docs/architecture_turboquant.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Arquitectura: TurboQuant + Engram Hybrid Search 🚀🧠

Esta documentación detalla la integración del motor de indexación semántica **TurboQuant** dentro del servidor de memoria **Engram**.

## 1. El Problema
La búsqueda tradicional en Engram dependía exclusivamente de **SQLite FTS5**, que es excelente para coincidencias de texto exactas pero falla estrepitosamente en:
- **Conceptual Matching**: Si buscas "memoria" y el registro dice "almacenamiento", FTS5 no lo encuentra.
- **Typo Tolerance**: Pequeños errores en el query pueden anular el resultado.
- **Semantic Re-ranking**: No hay forma de priorizar resultados que "significan" lo mismo que el query pero usan palabras distintas.

## 2. La Solución: Motor Híbrido
Hemos implementado una capa híbrida que combina el poder de los índices invertidos (FTS5) con la eficiencia de los **Locality Sensitive Hashes (LSH)**.

### Componentes Clave:

| Componente | Función | Tecnología |
| :--- | :--- | :--- |
| **FTS5 Gatekeeper** | Coincidencias exactas y rápidas. | SQLite Virtual Tables |
| **SimHash (TurboQuant)** | Genera huellas digitales de 64 bits de conceptos. | FNV-1a Hash + Bitwise Quantization |
| **TurboCache** | Cache en memoria contigua para navegación LSH ultra-rápida. | Go Slices (Cache Locality) |
| **Hamming Distance** | Calcula la cercanía semántica entre el query y los recuerdos. | Native CPU `POPCNT` |

## 3. Flujo de Búsqueda 🧬

1. **Carga Inicial**: Al iniciar el `Store`, todos los `simhash` de la base de datos se cargan en el `TurboCache` en memoria.
2. **Query Sanitización**: El query del usuario se normaliza (se quitan tildes, se pasa a minúsculas) y se calcula su `querySimHash`.
3. **Ejecución FTS5**: Se buscan coincidencias exactas en la tabla virtual de SQLite.
4. **Expansión Semántica (TurboQuant)**:
   - Se escanea el `TurboCache` buscando las 10 observaciones con menor **Distancia de Hamming**.
   - Cualquier resultado con distancia `< 20` se suma al set de resultados, aunque FTS5 no lo haya encontrado.
5. **Re-Ranking**: Se aplica un *boost* a los resultados que tengan alta similitud semántica, subiéndolos en la lista de prioridades.

## 4. Persistencia y Compatibilidad 💾
- **Esquema**: Se añadió la columna `simhash` (INTEGER) a la tabla `observations`.
- **Compatibilidad**: Se utiliza `int64` en Go para representar los 64 bits de forma compatible con los enteros con signo de SQLite, evitando crasheos por desbordamiento de bit alto.
- **Migración**: El sistema incluye lógica para auto-migrar bases de datos existentes añadiendo la columna.

## 5. Rendimiento 🚀
Al usar un array contiguo (`[]CacheEntry`) en memoria, el escaneo lineal es extremadamente rápido para el procesador (L1/L2 hits), permitiendo comparar miles de firmas en fracciones de milisegundo sin dependencias externas (como bases de datos vectoriales pesadas).

---
**Nota de Arquitectura**: Este diseño prioriza la **Localidad** y la **Autonomía**. No necesitas una nube o un modelo de 10GB para tener búsqueda semántica; solo necesitas matemáticas y una CPU eficiente.
202 changes: 202 additions & 0 deletions internal/store/semantic_search_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
package store

import (
"os"
"testing"

"github.com/Gentleman-Programming/engram/internal/store/turboquant"
)

func TestTurboQuant(t *testing.T) {
// Setup a temporary store with defaults handled
tmpDir, err := os.MkdirTemp("", "engram-turboquant-test-*")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(tmpDir)

s, err := New(Config{
DataDir: tmpDir,
MaxSearchResults: 10,
MaxContextResults: 10,
})
if err != nil {
t.Fatalf("Failed to create store: %v", err)
}
defer s.Close()

// 0. Setup test data (Create Session)
err = s.CreateSession("test-session", "test-project", "/tmp/test")
if err != nil {
t.Fatal(err)
}

// 1. Memory Inoculation
memories := []struct {
project string
title string
content string
}{
{"ProjectA", "Auth Logic", "OAuth2 implementation using JWT tokens and RS256."},
{"ProjectB", "Auth Logic Duplicate", "OAuth2 implementation using JWT tokens and RS256."},
{"Global", "Database", "PostgreSQL database using GORM and migrations."},
{"Global", "Frontend", "React application using Tailwind CSS and Vite."},
}

for _, m := range memories {
_, err = s.AddObservation(AddObservationParams{
SessionID: "test-session",
Type: "architecture",
Title: m.title,
Content: m.content,
Project: m.project,
})
if err != nil {
t.Fatal(err)
}
}

// SUB-TEST: Semantic Precision
t.Run("SemanticExpansion_Precision", func(t *testing.T) {
query := "tokens JWT rs256"
results, err := s.Search(query, SearchOptions{Limit: 5})
if err != nil {
t.Fatal(err)
}

t.Logf("Search results count: %d", len(results))
for _, res := range results {
project := ""
if res.Project != nil {
project = *res.Project
}
t.Logf(" - Found: %s in %s (Rank: %f)", res.Title, project, res.Rank)
}

found := false
for _, res := range results {
project := ""
if res.Project != nil {
project = *res.Project
}
if project == "projecta" || project == "projectb" {
found = true
dist := turboquant.HammingDistance(turboquant.ComputeSimHash(query), turboquant.BlockSignature(res.SimHash))
t.Logf("SUCCESS: Found concept in '%s' with distance %d", project, dist)
}
}

if !found {
t.Error("FAIL: Could not find semantic match for tokens and JWT")
}
})

// SUB-TEST: Metadata Filtering (The "otra prueba mas")
t.Run("MetadataFiltering", func(t *testing.T) {
query := "RS256 JWT tokens" // Identical semantic query

// SEARCH ONLY IN PROJECT A
results, err := s.Search(query, SearchOptions{Limit: 5, Project: "ProjectA"}) // Case normalization handled by Store
if err != nil {
t.Fatal(err)
}

if len(results) != 1 {
t.Errorf("FAIL: Expected 1 result for ProjectA, but got %d", len(results))
} else {
project := ""
if results[0].Project != nil {
project = *results[0].Project
}
if project != "projecta" {
t.Errorf("FAIL: Expected projecta, but got %s", project)
} else {
t.Log("SUCCESS: Metadata correctly filtered conceptual matches")
}
}
})

// SUB-TEST: Negative Matches (Noise Exclusion)
t.Run("NoiseExclusion", func(t *testing.T) {
query := "como plantar tomates en el jardin"
results, err := s.Search(query, SearchOptions{Limit: 5})
if err != nil {
t.Fatal(err)
}

if len(results) > 0 {
t.Errorf("FAIL: Irrelevant query returned %d results", len(results))
} else {
t.Log("SUCCESS: Noise query returned zero results")
}
})

// SUB-TEST: Sorting and Priority (Hamming Boost)
t.Run("HammingPriority", func(t *testing.T) {
// Insert exact match to check sorting priority over older near-matches
// Adding more unique words to make the semantic density very different
exactContent := "PostgreSQL migrations GORM. This is the exact technical stack for database handling."
_, err := s.AddObservation(AddObservationParams{
SessionID: "test-session",
Type: "architecture",
Title: "PreciseDB",
Content: exactContent,
Project: "Global",
})
if err != nil {
t.Fatal(err)
}

query := "PostgreSQL migrations GORM"
results, err := s.Search(query, SearchOptions{Limit: 5, Project: "global"})
if err != nil {
t.Fatal(err)
}

if len(results) < 2 {
t.Fatalf("FAIL: Expected at least 2 results for DB query, got %d", len(results))
}

// The most precise Hamming distance should be first (Rank lower is better)
t.Logf("Rank 1st: %s (%f), Rank 2nd: %s (%f)", results[0].Title, results[0].Rank, results[1].Title, results[1].Rank)
if results[0].Title != "PreciseDB" {
t.Errorf("FAIL: Precise match '%s' should be ranked before general match '%s'", results[0].Title, results[1].Title)
} else {
t.Log("SUCCESS: Ranking prioritized the lower Hamming distance")
}
})

// SUB-TEST: Full Reindexing (the "reindexa bien" part)
t.Run("FullReindexing", func(t *testing.T) {
// 1. Manually corrupt SimHash in DB to simulate stale or missing hashes
_, err := s.db.Exec("UPDATE observations SET simhash = 0")
if err != nil {
t.Fatal(err)
}

// 2. Run Reindex
count, err := s.ReindexTurboQuant()
if err != nil {
t.Fatalf("FAIL: Reindex failed: %v", err)
}

if count < 5 {
t.Errorf("FAIL: Reindex processed only %d observations, expected at least 5", count)
}

// 3. Verify that search STILL works (meaning signatures were restored in cache)
query := "RS256 JWT tokens"
results, err := s.Search(query, SearchOptions{Limit: 1, Project: "ProjectA"})
if err != nil {
t.Fatal(err)
}

if len(results) == 0 {
t.Error("FAIL: Search found nothing after reindexing")
} else if results[0].Title != "Auth Logic" {
t.Errorf("FAIL: Search found wrong result '%s' after reindexing", results[0].Title)
} else {
t.Logf("SUCCESS: Reindexed %d memories and verified search accuracy", count)
}
})
}
Loading