t8 · t8 · Mar 29, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 29, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/README.md b/README.md
@@ -272,9 +272,23 @@ All encrypted batches are fetched from the permanent storage network and re-inde
 
 ### LongMemEval (ICLR 2025)
 
-Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchmark for long-term memory in chat assistants. 500 curated questions across multi-session conversation histories.
+Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchmark for long-term memory in chat assistants. It tests retrieval and answer accuracy on the standard split (`longmemeval_s`), with ~115K-token haystacks per question.
 
-**Session Recall** (did retrieval find the correct session?):
+**Answer Accuracy** (full 500 questions, gpt-4o reader, gpt-4o-mini judge):
+
+| Category | Accuracy | Session Recall | n |
+|----------|----------|----------------|---|
+| single-session-assistant | **91.1%** | 87% | 56 |
+| single-session-user | **60.0%** | 56% | 70 |
+| knowledge-update | **53.3%** | 72% | 78 |
+| single-session-preference | **36.7%** | 53% | 30 |
+| temporal-reasoning | **27.1%** | 36% | 133 |
+| multi-session | **27.1%** | 47% | 133 |
+| **Overall** | **43.5%** | **61.1%** | **500** |
+
+Note: the full 500-question run places all questions' haystacks in a shared index (~250K chunks). In production, each user has an isolated index, which gives better retrieval quality — our 100-question runs (isolated context) consistently score 60-63%.
+
+**Session Recall** (48-question oracle split, local embeddings):
 
 | Category | Recall | n |
 |----------|--------|---|
@@ -286,9 +300,14 @@ Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchma
 | temporal-reasoning | **87.5%** | 8 |
 | **Overall** | **97.9%** | **48** |
 
-For context, GPT-4o with naive RAG scores 30-70% on this benchmark.
+Key retrieval improvements validated across 41 experiments:
+- Temporal fallback (retry without time filter when too few results)
+- Date-enriched embeddings (prepend date to chunks before embedding)
+- Date-prefixed retrieve responses (LLMs see explicit dates per chunk)
+- Round-level conversation storage (user+assistant pairs as single embeddings)
+- Chronological session ordering in assembled context
 
-Tested with `nomic-embed-text` (768d, local via Ollama). No cloud APIs required.
+See `tests/longmemeval/autoresearch/results.tsv` for the full experiment optimization log. The autoresearch framework (`tests/longmemeval/autoresearch/`) enables automated experiment iteration.
 
 ### Stress Test (10K chunks)
 
@@ -323,13 +342,21 @@ Single-turn overhead is dominated by embedding + LanceDB search. Multi-turn adds
 
 Run benchmarks yourself:
 ```bash
+# LongMemEval session recall (oracle split, fast)
+python3 tests/longmemeval/run_benchmark.py --questions 50 --dataset oracle
+
+# LongMemEval answer accuracy (standard split, requires OpenAI API key)
+python3 tests/longmemeval/run_answer_accuracy.py --questions 100 --dataset s --answer-model gpt-4o
+
+# Autoresearch optimization loop (iterates experiments overnight)
+python3 tests/longmemeval/autoresearch/prepare.py --questions 100
+
+# Stress test
 python3 tests/stress/generate.py --chunks 10000
 python3 tests/stress/benchmark.py
-python3 tests/longmemeval/run_benchmark.py --questions 50 --dataset oracle
 
 # Latency benchmark (requires mock upstream + proxy pointed at it)
 python3 tests/latency/mock_upstream.py --port 8199 &
-# Set upstream = "http://127.0.0.1:8199" in uc.toml, then start proxy on port 9292
 python3 tests/latency/benchmark.py --proxy http://127.0.0.1:9292 --mock http://127.0.0.1:8199
 ```
 
@@ -350,7 +377,7 @@ How Memoryport compares to other AI memory tools:
 | **Open protocol** | [AMP](https://github.com/t8/amp-spec) | No | No |
 | **Self-hosting** | Default (runs locally) | Enterprise only | Default (runs locally) |
 | **Scale benchmark** | 500M tokens, 294ms p50 | Not published | Not published |
-| **Retrieval accuracy** | 97.9% session recall (LongMemEval) | 84.6% answer accuracy (LongMemEval, GPT-5) | Not published |
+| **Retrieval accuracy** | 43.5% answer accuracy (500 questions), 97.9% session recall (48-question oracle split) on LongMemEval | 84.6% answer accuracy (LongMemEval, GPT-5) | Not published |
 | **Permanent storage** | Arweave (pay once, stored forever) | No | No |
 | **License** | Apache-2.0 | MIT | AGPL-3.0 |
 

diff --git a/crates/uc-core/Cargo.toml b/crates/uc-core/Cargo.toml
@@ -34,6 +34,9 @@ argon2 = { workspace = true }
 rand = { workspace = true }
 base64 = { workspace = true }
 
+# BM25 keyword search
+tantivy = "0.22"
+
 # Key store
 rusqlite = { workspace = true }
 hex = { workspace = true }

diff --git a/crates/uc-core/src/assembler.rs b/crates/uc-core/src/assembler.rs
@@ -84,8 +84,13 @@ fn format_xml(results: &[&SearchResult], max_tokens: u32) -> String {
         }
     }
 
+    // Sort sessions chronologically (by first turn timestamp), not by session ID string.
+    // This helps the LLM reason about temporal ordering across sessions.
+    let mut sorted_sessions: Vec<(&str, Vec<&SearchResult>)> = sessions.into_iter().collect();
+    sorted_sessions.sort_by_key(|(_, turns)| turns.first().map(|t| t.timestamp).unwrap_or(0));
+
     // Format sessions
-    for (session_id, mut turns) in sessions {
+    for (session_id, mut turns) in sorted_sessions {
         turns.sort_by_key(|t| t.timestamp);
         let date = format_timestamp(turns.first().map(|t| t.timestamp).unwrap_or(0));
         out.push_str(&format!("  <session id=\"{session_id}\" date=\"{date}\">\n"));
@@ -179,7 +184,7 @@ mod tests {
             make_result(ChunkType::Conversation, "s1", Some(Role::Assistant), 1711324860000, "Hi there"),
         ];
         let ctx = assemble_context(&results, 5000);
-        assert!(ctx.formatted.contains("<unlimited_context>"));
+        assert!(ctx.formatted.contains("<unlimited_context"));
         assert!(ctx.formatted.contains("<session id=\"s1\""));
         assert!(ctx.formatted.contains("role=\"user\""));
         assert!(ctx.formatted.contains("role=\"assistant\""));

diff --git a/crates/uc-core/src/chunker.rs b/crates/uc-core/src/chunker.rs
@@ -97,6 +97,59 @@ pub fn chunk_conversation(
     chunks
 }
 
+/// Split a multi-turn conversation into round-level chunks.
+///
+/// A `Role::User` turn immediately followed by a `Role::Assistant` turn is
+/// joined into one `"User: ...\nAssistant: ..."` text and chunked as a unit, so
+/// the assistant's answer is embedded alongside the question it answers
+/// (LongMemEval paper's #1 recommendation). Every other turn is chunked on its
+/// own and keeps its own role.
+///
+/// # Arguments
+/// * `turns` - conversation turns in order, as `(role, text)` pairs.
+/// * `session_id` - forwarded unchanged to `chunk_text` for every chunk.
+/// * `config` - chunker settings, forwarded unchanged to `chunk_text`.
+/// * `base_timestamp` - timestamp handed to the first `chunk_text` call; each
+///   later call receives this value plus the number of chunks produced so far.
+///
+/// # Returns
+/// All chunks in conversation order. Paired rounds are tagged `Role::User`.
+/// A round or turn can yield more than one chunk, depending on what
+/// `chunk_text` returns for its text.
+pub fn chunk_conversation_rounds(
+    turns: &[(Role, &str)],
+    session_id: &str,
+    config: &ChunkerConfig,
+    base_timestamp: i64,
+) -> Vec<Chunk> {
+    let mut chunks = Vec::new();
+    // Running timestamp passed to `chunk_text`; advanced by the number of
+    // chunks each call returns.
+    let mut ts = base_timestamp;
+    let mut i = 0;
+
+    while i < turns.len() {
+        let (role, content) = &turns[i];
+
+        // Try to pair user+assistant as a round: the current turn must be a
+        // user turn and the next turn must exist and be an assistant turn.
+        if *role == Role::User && i + 1 < turns.len() && turns[i + 1].0 == Role::Assistant {
+            let round_text = format!(
+                "User: {}\nAssistant: {}",
+                content, turns[i + 1].1
+            );
+            let round_chunks = chunk_text(
+                &round_text,
+                session_id,
+                ChunkType::Conversation,
+                Some(Role::User), // Tag as user since the question drives retrieval
+                config,
+                ts,
+            );
+            ts += round_chunks.len() as i64;
+            chunks.extend(round_chunks);
+            i += 2; // Skip both turns
+        } else {
+            // Unpaired turn (e.g., system message, or trailing user turn).
+            // An assistant turn with no user turn directly before it also
+            // lands here.
+            let turn_chunks = chunk_text(
+                content,
+                session_id,
+                ChunkType::Conversation,
+                Some(*role),
+                config,
+                ts,
+            );
+            ts += turn_chunks.len() as i64;
+            chunks.extend(turn_chunks);
+            i += 1;
+        }
+    }
+
+    chunks
+}
+
 fn make_chunk(
     text: &str,
     session_id: &str,

diff --git a/crates/uc-core/src/index.rs b/crates/uc-core/src/index.rs
@@ -103,6 +103,10 @@ pub struct Index {
     #[allow(dead_code)]
     last_checkout: std::sync::atomic::AtomicU64,
     insert_count: std::sync::atomic::AtomicU32,
+    /// Tracks inserts since the last compaction attempt (auto-compaction resets it before running).
+    inserts_since_compact: std::sync::atomic::AtomicU32,
+    /// Serializes compaction to prevent concurrent compact operations.
+    compact_lock: tokio::sync::Mutex<()>,
 }
 
 impl Index {
@@ -191,6 +195,8 @@ impl Index {
             dimensions,
             last_checkout: std::sync::atomic::AtomicU64::new(0),
             insert_count: std::sync::atomic::AtomicU32::new(0),
+            inserts_since_compact: std::sync::atomic::AtomicU32::new(0),
+            compact_lock: tokio::sync::Mutex::new(()),
         })
     }
 
@@ -213,15 +219,36 @@ impl Index {
         let count = self.insert_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1;
         debug!(count = entries.len(), inserts = count, "inserted chunks into index");
 
-        // Auto-compact every 100 inserts to prevent fragment buildup
-        if count % 100 == 0 {
-            let bg_table = self.table.clone();
-            tokio::spawn(async move {
-                match bg_table.optimize(lancedb::table::OptimizeAction::Compact { options: Default::default(), remap_options: None }).await {
-                    Ok(_) => tracing::debug!("periodic compaction complete"),
-                    Err(e) => tracing::warn!(error = %e, "periodic compaction failed"),
+        // Auto-compact based on inserts since the last compaction, used as a proxy
+        // for fragment buildup (each insert creates a new fragment). We compact
+        // synchronously (blocking) once that count is too high, preventing runaway disk growth.
+        let since_compact = self.inserts_since_compact.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1;
+
+        // Compact every 100 uncompacted inserts. Synchronous to ensure it
+        // actually completes before more fragments accumulate.
+        if since_compact >= 100 {
+            // Try to acquire the compact lock (non-blocking). If another task
+            // is already compacting, skip — it'll catch up.
+            if let Ok(_guard) = self.compact_lock.try_lock() {
+                self.inserts_since_compact.store(0, std::sync::atomic::Ordering::Relaxed);
+
+                // Step 1: Compact fragments into larger files
+                match self.table.optimize(lancedb::table::OptimizeAction::Compact {
+                    options: Default::default(),
+                    remap_options: None,
+                }).await {
+                    Ok(_) => debug!("auto-compaction complete (after {} inserts)", since_compact),
+                    Err(e) => tracing::warn!(error = %e, "auto-compaction failed"),
                 }
-            });
+
+                // Step 2: Prune old versions to reclaim disk space.
+                // Without pruning, every compaction leaves old fragment files on disk.
+                let _ = self.table.optimize(lancedb::table::OptimizeAction::Prune {
+                    older_than: Some(chrono::TimeDelta::seconds(30)),
+                    delete_unverified: Some(true),
+                    error_if_tagged_old_versions: Some(false),
+                }).await;
+            }
         }
 
         Ok(())
@@ -416,10 +443,37 @@ impl Index {
         Ok(count)
     }
 
-    /// Compact fragmented data files. Merges small fragments into larger ones
-    /// and prunes old versions, dramatically improving query performance.
+    /// Compact fragmented data files. Merges small fragments into larger ones,
+    /// dramatically improving query performance and reclaiming disk space.
+    ///
+    /// Waits for `compact_lock` and holds it until the function returns, so it
+    /// does not overlap any other compaction that takes the same lock. It
+    /// compacts and then prunes the chunks table, resets
+    /// `inserts_since_compact`, and then runs compact + prune on the facts
+    /// table when one is present.
+    ///
+    /// # Errors
+    /// Returns an error only when compacting the chunks table fails. Prune
+    /// results and all facts-table results are discarded (`let _ = ...`).
     pub async fn optimize(&self) -> Result<(), IndexError> {
-        self.table.optimize(lancedb::table::OptimizeAction::Compact { options: Default::default(), remap_options: None }).await?;
+        // Held for the rest of the function; released when `_guard` drops.
+        let _guard = self.compact_lock.lock().await;
+
+        // Compact + prune chunks table
+        self.table.optimize(lancedb::table::OptimizeAction::Compact {
+            options: Default::default(),
+            remap_options: None,
+        }).await?;
+        // Best-effort prune with a one-second `older_than` cutoff; the result
+        // is ignored.
+        let _ = self.table.optimize(lancedb::table::OptimizeAction::Prune {
+            older_than: Some(chrono::TimeDelta::seconds(1)),
+            delete_unverified: Some(true),
+            error_if_tagged_old_versions: Some(false),
+        }).await;
+        // Only reached when the compaction above succeeded (it uses `?`).
+        self.inserts_since_compact.store(0, std::sync::atomic::Ordering::Relaxed);
+
+        // Compact + prune facts table (both steps best-effort; results ignored)
+        if let Some(ref ft) = self.facts_table {
+            let _ = ft.optimize(lancedb::table::OptimizeAction::Compact {
+                options: Default::default(),
+                remap_options: None,
+            }).await;
+            let _ = ft.optimize(lancedb::table::OptimizeAction::Prune {
+                older_than: Some(chrono::TimeDelta::seconds(1)),
+                delete_unverified: Some(true),
+                error_if_tagged_old_versions: Some(false),
+            }).await;
+        }
+
+        tracing::info!("manual compaction + prune complete");
         Ok(())
     }