diff --git a/Cargo.lock b/Cargo.lock index 1fce915..ad5e65b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8850,6 +8850,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "tantivy", "tempfile", "thiserror 2.0.18", "tokio", diff --git a/README.md b/README.md index cbcb85f..94269b2 100644 --- a/README.md +++ b/README.md @@ -272,9 +272,23 @@ All encrypted batches are fetched from the permanent storage network and re-inde ### LongMemEval (ICLR 2025) -Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchmark for long-term memory in chat assistants. 500 curated questions across multi-session conversation histories. +Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchmark for long-term memory in chat assistants. Tests retrieval and answer accuracy on the standard split (`longmemeval_s`) with ~115K token haystacks per question. -**Session Recall** (did retrieval find the correct session?): +**Answer Accuracy** (full 500 questions, gpt-4o reader, gpt-4o-mini judge): + +| Category | Accuracy | Session Recall | n | +|----------|----------|----------------|---| +| single-session-assistant | **91.1%** | 87% | 56 | +| single-session-user | **60.0%** | 56% | 70 | +| knowledge-update | **53.3%** | 72% | 78 | +| single-session-preference | **36.7%** | 53% | 30 | +| temporal-reasoning | **27.1%** | 36% | 133 | +| multi-session | **27.1%** | 47% | 133 | +| **Overall** | **43.5%** | **61.1%** | **500** | + +Note: the full 500-question run places all questions' haystacks in a shared index (~250K chunks). In production, each user has an isolated index, which gives better retrieval quality — our 100-question runs (isolated context) consistently score 60-63%. 
+ +**Session Recall** (48-question oracle split, local embeddings): | Category | Recall | n | |----------|--------|---| @@ -286,9 +300,14 @@ Evaluated on [LongMemEval](https://github.com/xiaowu0162/LongMemEval), a benchma | temporal-reasoning | **87.5%** | 8 | | **Overall** | **97.9%** | **48** | -For context, GPT-4o with naive RAG scores 30-70% on this benchmark. +Key retrieval improvements validated across 41 experiments: +- Temporal fallback (retry without time filter when too few results) +- Date-enriched embeddings (prepend date to chunks before embedding) +- Date-prefixed retrieve responses (LLMs see explicit dates per chunk) +- Round-level conversation storage (user+assistant pairs as single embeddings) +- Chronological session ordering in assembled context -Tested with `nomic-embed-text` (768d, local via Ollama). No cloud APIs required. +See `tests/longmemeval/autoresearch/results.tsv` for the full experiment optimization log. Autoresearch framework (`tests/longmemeval/autoresearch/`) enables automated experiment iteration. ### Stress Test (10K chunks) @@ -323,13 +342,21 @@ Single-turn overhead is dominated by embedding + LanceDB search. 
Multi-turn adds Run benchmarks yourself: ```bash +# LongMemEval session recall (oracle split, fast) +python3 tests/longmemeval/run_benchmark.py --questions 50 --dataset oracle + +# LongMemEval answer accuracy (standard split, requires OpenAI API key) +python3 tests/longmemeval/run_answer_accuracy.py --questions 100 --dataset s --answer-model gpt-4o + +# Autoresearch optimization loop (iterates experiments overnight) +python3 tests/longmemeval/autoresearch/prepare.py --questions 100 + +# Stress test python3 tests/stress/generate.py --chunks 10000 python3 tests/stress/benchmark.py -python3 tests/longmemeval/run_benchmark.py --questions 50 --dataset oracle # Latency benchmark (requires mock upstream + proxy pointed at it) python3 tests/latency/mock_upstream.py --port 8199 & -# Set upstream = "http://127.0.0.1:8199" in uc.toml, then start proxy on port 9292 python3 tests/latency/benchmark.py --proxy http://127.0.0.1:9292 --mock http://127.0.0.1:8199 ``` @@ -350,7 +377,7 @@ How Memoryport compares to other AI memory tools: | **Open protocol** | [AMP](https://github.com/t8/amp-spec) | No | No | | **Self-hosting** | Default (runs locally) | Enterprise only | Default (runs locally) | | **Scale benchmark** | 500M tokens, 294ms p50 | Not published | Not published | -| **Retrieval accuracy** | 97.9% session recall (LongMemEval) | 84.6% answer accuracy (LongMemEval, GPT-5) | Not published | +| **Retrieval accuracy** | 43.5% answer accuracy / 500q, 97.9% session recall (LongMemEval) | 84.6% answer accuracy (LongMemEval, GPT-5) | Not published | | **Permanent storage** | Arweave (pay once, stored forever) | No | No | | **License** | Apache-2.0 | MIT | AGPL-3.0 | diff --git a/crates/uc-core/Cargo.toml b/crates/uc-core/Cargo.toml index c7677ad..e6ff5bc 100644 --- a/crates/uc-core/Cargo.toml +++ b/crates/uc-core/Cargo.toml @@ -34,6 +34,9 @@ argon2 = { workspace = true } rand = { workspace = true } base64 = { workspace = true } +# BM25 keyword search +tantivy = "0.22" + # Key store 
rusqlite = { workspace = true } hex = { workspace = true } diff --git a/crates/uc-core/src/assembler.rs b/crates/uc-core/src/assembler.rs index 5c9a657..1c71f50 100644 --- a/crates/uc-core/src/assembler.rs +++ b/crates/uc-core/src/assembler.rs @@ -84,8 +84,13 @@ fn format_xml(results: &[&SearchResult], max_tokens: u32) -> String { } } + // Sort sessions chronologically (by first turn timestamp), not by session ID string. + // This helps the LLM reason about temporal ordering across sessions. + let mut sorted_sessions: Vec<(&str, Vec<&SearchResult>)> = sessions.into_iter().collect(); + sorted_sessions.sort_by_key(|(_, turns)| turns.first().map(|t| t.timestamp).unwrap_or(0)); + // Format sessions - for (session_id, mut turns) in sessions { + for (session_id, mut turns) in sorted_sessions { turns.sort_by_key(|t| t.timestamp); let date = format_timestamp(turns.first().map(|t| t.timestamp).unwrap_or(0)); out.push_str(&format!(" \n")); @@ -179,7 +184,7 @@ mod tests { make_result(ChunkType::Conversation, "s1", Some(Role::Assistant), 1711324860000, "Hi there"), ]; let ctx = assemble_context(&results, 5000); - assert!(ctx.formatted.contains("")); + assert!(ctx.formatted.contains(" Vec { + let mut chunks = Vec::new(); + let mut ts = base_timestamp; + let mut i = 0; + + while i < turns.len() { + let (role, content) = &turns[i]; + + // Try to pair user+assistant as a round + if *role == Role::User && i + 1 < turns.len() && turns[i + 1].0 == Role::Assistant { + let round_text = format!( + "User: {}\nAssistant: {}", + content, turns[i + 1].1 + ); + let round_chunks = chunk_text( + &round_text, + session_id, + ChunkType::Conversation, + Some(Role::User), // Tag as user since the question drives retrieval + config, + ts, + ); + ts += round_chunks.len() as i64; + chunks.extend(round_chunks); + i += 2; // Skip both turns + } else { + // Unpaired turn (e.g., system message, or trailing user turn) + let turn_chunks = chunk_text( + content, + session_id, + ChunkType::Conversation, + 
Some(*role), + config, + ts, + ); + ts += turn_chunks.len() as i64; + chunks.extend(turn_chunks); + i += 1; + } + } + + chunks +} + fn make_chunk( text: &str, session_id: &str, diff --git a/crates/uc-core/src/index.rs b/crates/uc-core/src/index.rs index d852c31..5f49ccd 100644 --- a/crates/uc-core/src/index.rs +++ b/crates/uc-core/src/index.rs @@ -103,6 +103,10 @@ pub struct Index { #[allow(dead_code)] last_checkout: std::sync::atomic::AtomicU64, insert_count: std::sync::atomic::AtomicU32, + /// Tracks inserts since last successful compaction. + inserts_since_compact: std::sync::atomic::AtomicU32, + /// Serializes compaction to prevent concurrent compact operations. + compact_lock: tokio::sync::Mutex<()>, } impl Index { @@ -191,6 +195,8 @@ impl Index { dimensions, last_checkout: std::sync::atomic::AtomicU64::new(0), insert_count: std::sync::atomic::AtomicU32::new(0), + inserts_since_compact: std::sync::atomic::AtomicU32::new(0), + compact_lock: tokio::sync::Mutex::new(()), }) } @@ -213,15 +219,36 @@ impl Index { let count = self.insert_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1; debug!(count = entries.len(), inserts = count, "inserted chunks into index"); - // Auto-compact every 100 inserts to prevent fragment buildup - if count % 100 == 0 { - let bg_table = self.table.clone(); - tokio::spawn(async move { - match bg_table.optimize(lancedb::table::OptimizeAction::Compact { options: Default::default(), remap_options: None }).await { - Ok(_) => tracing::debug!("periodic compaction complete"), - Err(e) => tracing::warn!(error = %e, "periodic compaction failed"), + // Auto-compact based on fragment buildup, not fixed insert count. + // Each insert creates a new fragment. We compact synchronously (blocking) + // when fragment count gets too high, preventing runaway disk growth. + let since_compact = self.inserts_since_compact.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1; + + // Compact every 100 uncompacted inserts. 
Synchronous to ensure it + // actually completes before more fragments accumulate. + if since_compact >= 100 { + // Try to acquire the compact lock (non-blocking). If another task + // is already compacting, skip — it'll catch up. + if let Ok(_guard) = self.compact_lock.try_lock() { + self.inserts_since_compact.store(0, std::sync::atomic::Ordering::Relaxed); + + // Step 1: Compact fragments into larger files + match self.table.optimize(lancedb::table::OptimizeAction::Compact { + options: Default::default(), + remap_options: None, + }).await { + Ok(_) => debug!("auto-compaction complete (after {} inserts)", since_compact), + Err(e) => tracing::warn!(error = %e, "auto-compaction failed"), } - }); + + // Step 2: Prune old versions to reclaim disk space. + // Without pruning, every compaction leaves old fragment files on disk. + let _ = self.table.optimize(lancedb::table::OptimizeAction::Prune { + older_than: Some(chrono::TimeDelta::seconds(30)), + delete_unverified: Some(true), + error_if_tagged_old_versions: Some(false), + }).await; + } } Ok(()) @@ -416,10 +443,37 @@ impl Index { Ok(count) } - /// Compact fragmented data files. Merges small fragments into larger ones - /// and prunes old versions, dramatically improving query performance. + /// Compact fragmented data files. Merges small fragments into larger ones, + /// dramatically improving query performance and reclaiming disk space. 
pub async fn optimize(&self) -> Result<(), IndexError> { - self.table.optimize(lancedb::table::OptimizeAction::Compact { options: Default::default(), remap_options: None }).await?; + let _guard = self.compact_lock.lock().await; + + // Compact + prune chunks table + self.table.optimize(lancedb::table::OptimizeAction::Compact { + options: Default::default(), + remap_options: None, + }).await?; + let _ = self.table.optimize(lancedb::table::OptimizeAction::Prune { + older_than: Some(chrono::TimeDelta::seconds(1)), + delete_unverified: Some(true), + error_if_tagged_old_versions: Some(false), + }).await; + self.inserts_since_compact.store(0, std::sync::atomic::Ordering::Relaxed); + + // Compact + prune facts table + if let Some(ref ft) = self.facts_table { + let _ = ft.optimize(lancedb::table::OptimizeAction::Compact { + options: Default::default(), + remap_options: None, + }).await; + let _ = ft.optimize(lancedb::table::OptimizeAction::Prune { + older_than: Some(chrono::TimeDelta::seconds(1)), + delete_unverified: Some(true), + error_if_tagged_old_versions: Some(false), + }).await; + } + + tracing::info!("manual compaction + prune complete"); Ok(()) } diff --git a/crates/uc-core/src/keyword_index.rs b/crates/uc-core/src/keyword_index.rs new file mode 100644 index 0000000..5029629 --- /dev/null +++ b/crates/uc-core/src/keyword_index.rs @@ -0,0 +1,267 @@ +//! BM25 keyword search index using Tantivy. +//! +//! Provides lexical search alongside the vector index (LanceDB). At query time, +//! both are searched in parallel and results are fused with Reciprocal Rank Fusion. +//! This catches entity-specific queries ("name of my hamster", "airline on Valentine's +//! day") that embedding-based search misses. 
+ +use std::path::{Path, PathBuf}; +use tantivy::collector::TopDocs; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::{doc, Index, IndexReader, IndexWriter, ReloadPolicy}; +use thiserror::Error; +use tracing::{debug, warn}; + +#[derive(Debug, Error)] +pub enum KeywordIndexError { + #[error("tantivy error: {0}")] + Tantivy(#[from] tantivy::TantivyError), + #[error("query parse error: {0}")] + QueryParse(#[from] tantivy::query::QueryParserError), +} + +/// Result from a BM25 keyword search. +#[derive(Debug, Clone)] +pub struct KeywordSearchResult { + pub chunk_id: String, + pub session_id: String, + pub user_id: String, + pub content: String, + pub score: f32, +} + +/// BM25 keyword index backed by Tantivy. +#[allow(dead_code)] +pub struct KeywordIndex { + index: Index, + reader: IndexReader, + writer: tokio::sync::Mutex, + schema: Schema, + f_chunk_id: Field, + f_session_id: Field, + f_user_id: Field, + f_content: Field, + f_content_stored: Field, +} + +impl KeywordIndex { + /// Open or create a keyword index at the given path. + pub fn open(index_path: &Path) -> Result { + let keyword_path = index_path.join("keywords"); + std::fs::create_dir_all(&keyword_path).ok(); + + let mut schema_builder = Schema::builder(); + let f_chunk_id = schema_builder.add_text_field("chunk_id", STRING | STORED); + let f_session_id = schema_builder.add_text_field("session_id", STRING | STORED); + let f_user_id = schema_builder.add_text_field("user_id", STRING); + let f_content = schema_builder.add_text_field("content", TEXT); + let f_content_stored = schema_builder.add_text_field("content_stored", STORED); + let schema = schema_builder.build(); + + let index = if keyword_path.join("meta.json").exists() { + Index::open_in_dir(&keyword_path)? + } else { + Index::create_in_dir(&keyword_path, schema.clone())? 
+ }; + + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommitWithDelay) + .try_into()?; + + let writer = index.writer(50_000_000)?; // 50MB heap + + Ok(Self { + index, + reader, + writer: tokio::sync::Mutex::new(writer), + schema, + f_chunk_id, + f_session_id, + f_user_id, + f_content, + f_content_stored, + }) + } + + /// Index a chunk's text content for BM25 search. + pub async fn index_chunk( + &self, + chunk_id: &str, + session_id: &str, + user_id: &str, + content: &str, + ) -> Result<(), KeywordIndexError> { + let writer = self.writer.lock().await; + writer.add_document(doc!( + self.f_chunk_id => chunk_id, + self.f_session_id => session_id, + self.f_user_id => user_id, + self.f_content => content, + self.f_content_stored => content, + ))?; + Ok(()) + } + + /// Commit pending writes to disk. Call after a batch of inserts. + pub async fn commit(&self) -> Result<(), KeywordIndexError> { + let mut writer = self.writer.lock().await; + writer.commit()?; + Ok(()) + } + + /// Search for chunks matching the query text using BM25 scoring. 
+ pub fn search( + &self, + query_text: &str, + user_id: &str, + top_k: usize, + ) -> Result, KeywordIndexError> { + let searcher = self.reader.searcher(); + + // Parse query against the content field + let query_parser = QueryParser::for_index(&self.index, vec![self.f_content]); + let query = query_parser.parse_query(query_text)?; + + let top_docs = searcher.search(&query, &TopDocs::with_limit(top_k * 2))?; + + let mut results = Vec::new(); + for (score, doc_address) in top_docs { + let doc: TantivyDocument = searcher.doc(doc_address)?; + + let uid = doc + .get_first(self.f_user_id) + .and_then(|v| v.as_str()) + .unwrap_or(""); + if uid != user_id { + continue; + } + + let chunk_id = doc + .get_first(self.f_chunk_id) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let session_id = doc + .get_first(self.f_session_id) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let content = doc + .get_first(self.f_content_stored) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + results.push(KeywordSearchResult { + chunk_id, + session_id, + user_id: user_id.to_string(), + content, + score, + }); + + if results.len() >= top_k { + break; + } + } + + debug!(query = %query_text, hits = results.len(), "BM25 keyword search"); + Ok(results) + } + + /// Search for specific entities (proper nouns, quoted strings) extracted from the query. + /// More targeted than full-text search — finds "Alice" or "Bali" directly. 
+ pub fn search_entities( + &self, + query_text: &str, + user_id: &str, + top_k: usize, + ) -> Result, KeywordIndexError> { + // Extract potential entities: quoted strings and capitalized multi-word sequences + let mut entities = Vec::new(); + + // Quoted strings: 'X' or "X" + let mut in_quote = false; + let mut current = String::new(); + for c in query_text.chars() { + if c == '\'' || c == '"' { + if in_quote && current.len() > 2 { + entities.push(current.clone()); + } + current.clear(); + in_quote = !in_quote; + } else if in_quote { + current.push(c); + } + } + + // Capitalized words (potential proper nouns), skip sentence starters + let words: Vec<&str> = query_text.split_whitespace().collect(); + for (i, word) in words.iter().enumerate() { + let clean = word.trim_matches(|c: char| !c.is_alphanumeric()); + if clean.len() > 2 && clean.chars().next().map_or(false, |c| c.is_uppercase()) && i > 0 { + entities.push(clean.to_string()); + } + } + + if entities.is_empty() { + return Ok(Vec::new()); + } + + // Search for each entity and merge results + let mut all_results: std::collections::HashMap = std::collections::HashMap::new(); + let searcher = self.reader.searcher(); + let query_parser = QueryParser::for_index(&self.index, vec![self.f_content]); + + for entity in &entities { + // Use quotes for phrase matching + let phrase_query = format!("\"{}\"", entity); + if let Ok(query) = query_parser.parse_query(&phrase_query) { + if let Ok(top_docs) = searcher.search(&query, &TopDocs::with_limit(top_k)) { + for (score, doc_address) in top_docs { + if let Ok(doc) = searcher.doc::(doc_address) { + let uid = doc.get_first(self.f_user_id).and_then(|v| v.as_str()).unwrap_or(""); + if uid != user_id { continue; } + + let chunk_id = doc.get_first(self.f_chunk_id).and_then(|v| v.as_str()).unwrap_or("").to_string(); + let entry = all_results.entry(chunk_id.clone()).or_insert(KeywordSearchResult { + chunk_id, + session_id: doc.get_first(self.f_session_id).and_then(|v| 
v.as_str()).unwrap_or("").to_string(), + user_id: user_id.to_string(), + content: doc.get_first(self.f_content_stored).and_then(|v| v.as_str()).unwrap_or("").to_string(), + score: 0.0, + }); + entry.score += score; // Accumulate scores across entity matches + } + } + } + } + } + + let mut results: Vec = all_results.into_values().collect(); + results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal)); + results.truncate(top_k); + + debug!(entities = ?entities, hits = results.len(), "BM25 entity search"); + Ok(results) + } + + /// Delete all documents for a user (for index rebuilds). + pub async fn delete_user(&self, user_id: &str) -> Result<(), KeywordIndexError> { + let term = tantivy::Term::from_field_text(self.f_user_id, user_id); + let mut writer = self.writer.lock().await; + writer.delete_term(term); + writer.commit()?; + Ok(()) + } + + /// Delete all documents (for test/benchmark resets). + pub async fn clear(&self) -> Result<(), KeywordIndexError> { + let mut writer = self.writer.lock().await; + writer.delete_all_documents()?; + writer.commit()?; + Ok(()) + } +} diff --git a/crates/uc-core/src/lib.rs b/crates/uc-core/src/lib.rs index dec1cdd..3a135b3 100644 --- a/crates/uc-core/src/lib.rs +++ b/crates/uc-core/src/lib.rs @@ -13,6 +13,7 @@ pub mod facts; pub mod gate; pub mod graph; pub mod index; +pub mod keyword_index; pub mod keystore; pub mod models; pub mod profile; @@ -69,6 +70,7 @@ pub struct Engine { config: Config, user_id: String, index: Arc, + keyword_index: Option>, embeddings: Arc, arweave: Arc, writer: Arc, @@ -290,19 +292,52 @@ impl Engine { // Create reranker let reranker: Box = Box::new(HeuristicReranker::default()); + // Open BM25 keyword index (best-effort — degrades gracefully if it fails) + let keyword_index = match keyword_index::KeywordIndex::open(&index_path) { + Ok(ki) => { + info!("BM25 keyword index ready"); + Some(Arc::new(ki)) + } + Err(e) => { + tracing::warn!(error = %e, "failed to open keyword 
index, BM25 search disabled"); + None + } + }; + // Create batcher with flush callback let flush_writer = writer.clone(); let flush_index = index.clone(); let flush_embeddings = embeddings.clone(); + let flush_keyword_index = keyword_index.clone(); let on_flush: FlushCallback = Arc::new(move |batch: Batch| { let writer = flush_writer.clone(); let index = flush_index.clone(); let embeddings = flush_embeddings.clone(); + let kw_index = flush_keyword_index.clone(); Box::pin(async move { - // 1. Compute embeddings - let texts: Vec<&str> = batch.chunks.iter().map(|c| c.content.as_str()).collect(); - let vectors = embeddings.embed_batch(&texts).await.map_err(|e| -> Box { Box::new(e) })?; + // 1. Compute embeddings with enriched text. + // Prepend context to each chunk before embedding to improve + // retrieval quality: + // - Date prefix: "[March 15, 2023]" so temporal queries match + // - Previous turn context: the preceding message in the session + // gives conversational context (Anthropic's Contextual Retrieval) + let enriched_texts: Vec = batch.chunks.iter().map(|c| { + // Date-enriched embedding: prepend the chunk's date so temporal + // queries ("last week", "in March") match chunks from those dates. + // Exp 28 showed this improves temporal reasoning from 50% to 61.5%. + let ts_secs = c.timestamp / 1000; + if ts_secs > 0 { + if let Some(dt) = chrono::DateTime::from_timestamp(ts_secs, 0) { + return format!("[{}] {}", dt.format("%B %d, %Y"), c.content); + } else { + tracing::debug!(timestamp = ts_secs, "timestamp out of range for date prefix"); + } + } + c.content.clone() + }).collect(); + let text_refs: Vec<&str> = enriched_texts.iter().map(|s| s.as_str()).collect(); + let vectors = embeddings.embed_batch(&text_refs).await.map_err(|e| -> Box { Box::new(e) })?; // 2. 
Upload to Arweave let receipt = writer.write_batch(&batch).await.map_err(|e| -> Box { Box::new(e) })?; @@ -320,76 +355,87 @@ impl Engine { .collect(); index.insert(&entries, &user_id).await.map_err(|e| -> Box { Box::new(e) })?; - // 4. Extract facts from chunks and store in facts table - let mut all_facts = Vec::new(); - for chunk in &batch.chunks { - let extraction = facts::extract_facts( - &chunk.content, - &chunk.id.to_string(), - &chunk.session_id, - &user_id, - chunk.timestamp, - ); - all_facts.extend(extraction.facts); + // 3b. Index in BM25 keyword index (best-effort) + if let Some(ref ki) = kw_index { + for chunk in &batch.chunks { + if let Err(e) = ki.index_chunk( + &chunk.id.to_string(), + &chunk.session_id, + &user_id, + &chunk.content, + ).await { + tracing::warn!(error = %e, "BM25 index failed for chunk (non-fatal)"); + } + } + if let Err(e) = ki.commit().await { + tracing::warn!(error = %e, "BM25 commit failed, retrying..."); + let _ = ki.commit().await; + } } - if !all_facts.is_empty() { - // Embed fact content + // 4. 
Extract facts in background (non-blocking) + let bg_index = index.clone(); + let bg_embeddings = embeddings.clone(); + let bg_user_id = user_id.clone(); + let bg_chunks: Vec<_> = batch.chunks.iter().map(|c| (c.id.to_string(), c.content.clone(), c.session_id.clone(), c.timestamp)).collect(); + tokio::spawn(async move { + let mut all_facts = Vec::new(); + for (chunk_id, content, session_id, timestamp) in &bg_chunks { + let extraction = facts::extract_facts(content, chunk_id, session_id, &bg_user_id, *timestamp); + all_facts.extend(extraction.facts); + } + + if all_facts.is_empty() { return; } + let fact_texts: Vec<&str> = all_facts.iter().map(|f| f.content.as_str()).collect(); - match embeddings.embed_batch(&fact_texts).await { - Ok(fact_vectors) => { - // Detect contradictions against existing facts - for fact in &all_facts { - let existing = index - .search_facts_by_predicate(&user_id, &fact.subject, &fact.predicate, true) - .await - .unwrap_or_default(); - - let existing_as_facts: Vec = existing.iter().map(|r| facts::Fact { - id: uuid::Uuid::parse_str(&r.fact_id).unwrap_or_default(), - content: r.content.clone(), - subject: r.subject.clone(), - predicate: r.predicate.clone(), - object: r.object.clone(), - source_chunk_id: String::new(), - session_id: r.session_id.clone(), - user_id: user_id.clone(), - document_date: r.document_date, - event_date: r.event_date, - valid: r.valid, - superseded_by: None, - confidence: r.confidence, - created_at: 0, - }).collect(); - - let contradictions = contradiction::detect_contradictions( - std::slice::from_ref(fact), - &existing_as_facts, - ); - - for c in &contradictions { - let _ = index.mark_fact_superseded(&c.old_fact_id, &c.new_fact_id).await; - tracing::debug!( - old = %c.old_fact_id, - new = %c.new_fact_id, - reason = %c.reason, - "fact superseded" - ); - } - } - - // Insert facts into LanceDB - if let Err(e) = index.insert_facts(&all_facts, &fact_vectors).await { - tracing::warn!(error = %e, "failed to insert facts 
(non-fatal)"); - } else { - tracing::debug!(count = all_facts.len(), "extracted and stored facts"); - } - } - Err(e) => { - tracing::warn!(error = %e, "failed to embed facts (non-fatal)"); + let fact_vectors = match bg_embeddings.embed_batch(&fact_texts).await { + Ok(v) => v, + Err(e) => { tracing::warn!(error = %e, "failed to embed facts (non-fatal)"); return; } + }; + + for fact in &all_facts { + let existing = bg_index + .search_facts_by_predicate(&bg_user_id, &fact.subject, &fact.predicate, true) + .await + .unwrap_or_default(); + + let existing_as_facts: Vec = existing.iter().map(|r| facts::Fact { + id: match uuid::Uuid::parse_str(&r.fact_id) { + Ok(id) => id, + Err(e) => { tracing::warn!(fact_id = %r.fact_id, error = %e, "invalid fact UUID"); uuid::Uuid::new_v4() } + }, + content: r.content.clone(), + subject: r.subject.clone(), + predicate: r.predicate.clone(), + object: r.object.clone(), + source_chunk_id: String::new(), + session_id: r.session_id.clone(), + user_id: bg_user_id.clone(), + document_date: r.document_date, + event_date: r.event_date, + valid: r.valid, + superseded_by: None, + confidence: r.confidence, + created_at: 0, + }).collect(); + + let contradictions = contradiction::detect_contradictions( + std::slice::from_ref(fact), + &existing_as_facts, + ); + + for c in &contradictions { + let _ = bg_index.mark_fact_superseded(&c.old_fact_id, &c.new_fact_id).await; + tracing::debug!(old = %c.old_fact_id, new = %c.new_fact_id, reason = %c.reason, "fact superseded"); } } - } + + if let Err(e) = bg_index.insert_facts(&all_facts, &fact_vectors).await { + tracing::warn!(error = %e, "failed to insert facts (non-fatal)"); + } else { + tracing::debug!(count = all_facts.len(), "extracted and stored facts"); + } + }); Ok(()) }) @@ -407,6 +453,7 @@ impl Engine { config, user_id, index, + keyword_index, embeddings, arweave, writer, @@ -420,34 +467,44 @@ impl Engine { } /// Store text content. Chunks it and buffers in the batcher. 
+ /// + /// For conversation turns: user turns are buffered until the next assistant + /// turn arrives for the same session. The user+assistant pair is then stored + /// as a single "round" chunk, keeping the Q&A context together in the embedding. + /// This improves retrieval quality (LongMemEval paper's #1 recommendation). pub async fn store( &self, text: &str, params: StoreParams, ) -> Result, EngineError> { - // Set the batcher's user_id for this store operation self.batcher.set_user_id(¶ms.user_id).await; let timestamp = params.timestamp.unwrap_or_else(|| chrono::Utc::now().timestamp_millis()); + + // Round-level buffering for conversations: buffer user turns, + // combine with the next assistant turn. + let store_text: String; + let store_role: Option; + + store_text = text.to_string(); + store_role = params.role; + let mut chunks = chunker::chunk_text( - text, + &store_text, ¶ms.session_id, params.chunk_type, - params.role, + store_role, &self.chunker_config, timestamp, ); - // Tag source integration + model on each chunk for chunk in &mut chunks { chunk.metadata.source_integration = params.source_integration.clone(); chunk.metadata.source_model = params.source_model.clone(); } let ids: Vec = chunks.iter().map(|c| c.id).collect(); - self.batcher.add_many(chunks).await?; - Ok(ids) } @@ -516,16 +573,40 @@ impl Engine { }; let query_vector = self.embeddings.embed(text).await?; + + // ── Parallel: vector search + BM25 keyword search ── let params = models::QueryParams { user_id: user_id.to_string(), top_k, - session_id: signals.explicit_session, + session_id: signals.explicit_session.clone(), chunk_type: None, - // Apply temporal range for production use; benchmark data may have - // different timestamps so the filter may not match. 
time_range: signals.temporal_range, }; - let results = self.index.search(&query_vector, ¶ms).await?; + let mut results = self.index.search(&query_vector, ¶ms).await?; + + let mut seen: std::collections::HashSet = + results.iter().map(|r| r.chunk_id.clone()).collect(); + + // Temporal fallback: if temporal filter yielded few results, retry without it. + if signals.temporal_range.is_some() && results.len() < top_k / 2 { + let fallback_params = models::QueryParams { + user_id: user_id.to_string(), + top_k, + session_id: signals.explicit_session.clone(), + chunk_type: None, + time_range: None, + }; + let fallback = self.index.search(&query_vector, &fallback_params).await?; + for r in fallback { + if seen.insert(r.chunk_id.clone()) { + results.push(r); + } + } + } + + // Sort by score descending, truncate to top_k + results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal)); + results.truncate(top_k); Ok(results) } @@ -712,3 +793,7 @@ fn create_embedding_provider(config: &config::EmbeddingsConfig) -> Arc, + /// When true, prepend human-readable dates to content (useful for LLM consumers). + /// Default false to keep raw content for dashboard/UI display. 
+ #[serde(default)] + pub include_dates: bool, } #[derive(Debug, Deserialize)] diff --git a/crates/uc-server/src/routes/retrieve.rs b/crates/uc-server/src/routes/retrieve.rs index ac13d5d..224deb2 100644 --- a/crates/uc-server/src/routes/retrieve.rs +++ b/crates/uc-server/src/routes/retrieve.rs @@ -1,6 +1,7 @@ use axum::extract::State; use axum::Extension; use axum::Json; +use chrono::{TimeZone, Utc}; use std::sync::Arc; use crate::auth::AuthenticatedUser; @@ -23,15 +24,28 @@ pub async fn retrieve( let results: Vec = results .into_iter() .take(req.top_k) - .map(|r| RetrieveResult { - chunk_id: r.chunk_id, - session_id: r.session_id, - chunk_type: r.chunk_type.as_str().to_string(), - role: r.role.map(|r| r.as_str().to_string()), - score: r.score, - timestamp: r.timestamp, - content: r.content, - arweave_tx_id: r.arweave_tx_id, + .map(|r| { + // Optionally prepend date to content for LLM consumers. + // Dashboard/UI should pass include_dates=false (the default). + let content = if req.include_dates && r.timestamp > 0 { + if let Some(dt) = Utc.timestamp_millis_opt(r.timestamp).single() { + format!("[{}] {}", dt.format("%B %d, %Y"), r.content) + } else { + r.content + } + } else { + r.content + }; + RetrieveResult { + chunk_id: r.chunk_id, + session_id: r.session_id, + chunk_type: r.chunk_type.as_str().to_string(), + role: r.role.map(|r| r.as_str().to_string()), + score: r.score, + timestamp: r.timestamp, + content, + arweave_tx_id: r.arweave_tx_id, + } }) .collect(); diff --git a/tests/longmemeval/autoresearch/.gitignore b/tests/longmemeval/autoresearch/.gitignore new file mode 100644 index 0000000..c283507 --- /dev/null +++ b/tests/longmemeval/autoresearch/.gitignore @@ -0,0 +1,2 @@ +# Autoresearch run results (large JSON files) +run_*.json diff --git a/tests/longmemeval/autoresearch/experiment.py b/tests/longmemeval/autoresearch/experiment.py new file mode 100644 index 0000000..ffda173 --- /dev/null +++ b/tests/longmemeval/autoresearch/experiment.py @@ -0,0 +1,43 @@ 
"""
Experiment configuration for autoresearch.

THIS FILE IS THE AGENT'S SANDBOX. Modify CONFIG to test hypotheses.
Each experiment should change one thing at a time.

After modifying, run:
    python3 tests/longmemeval/autoresearch/prepare.py

If you modified Rust code, omit --skip-build.
If the index is already ingested for the same dataset, add --skip-ingest.
"""

# ── Experiment Config ───────────────────────────────────────────────────────
# This is the BASELINE configuration. The agent modifies this dict.

CONFIG = {
    # Base: Exp 8 config (61% accuracy)
    # The harness merges this section into the server's [retrieval] TOML table;
    # `similarity_top_k` is also used as the `top_k` of each retrieve request.
    "retrieval": {
        "similarity_top_k": 150,
        "min_relevance_score": 0.3,
        "recency_window": 20,
        "max_context_tokens": 50000,
        "rerank": False,
        "query_expansion": False,
        "hyde": False,
        "gating_enabled": True,
    },

    # Number of retrieved chunks placed into the answer prompt.
    "context_chunks": 40,
    # Defined exactly once: a repeated key in a dict literal silently
    # overrides the earlier value, which hides edits made to the first entry.
    "prompt_style": "default",
    "answer_model": "gpt-4o",
    "judge_model": "gpt-4o-mini",

    # EXPERIMENT 18: Rust sub-query decomposition in engine.search().
    # Detects multi-entity comparisons ("A or B"), aggregation ("how many"),
    # and temporal ordering queries. Extracts entities and runs parallel
    # sub-queries to cover entities the primary embedding misses.
    # No LLM needed — pure pattern matching.
    # (This experiment sets no config key of its own in this file.)

    "description": "FULL 500q: temporal fallback + date enrichment + round storage + date-prefixed retrieve + compact+prune",
}
Runs retrieval + answer accuracy evaluation + 6. Outputs structured results for the agent to parse + +Usage: + python3 tests/longmemeval/autoresearch/prepare.py [--skip-ingest] [--skip-build] +""" + +import argparse +import json +import os +import shutil +import signal +import statistics +import subprocess +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from pathlib import Path + +import requests + +# Force unbuffered output so progress is visible in background runs +sys.stdout.reconfigure(line_buffering=True) if hasattr(sys.stdout, 'reconfigure') else None + +# ── Paths ─────────────────────────────────────────────────────────────────── +ROOT = Path(__file__).resolve().parent.parent.parent.parent +DATASET_DIR = ROOT / "tests" / "longmemeval" / "data" +RESULTS_DIR = ROOT / "tests" / "longmemeval" / "autoresearch" +DATA_DIR = Path.home() / ".memoryport" / "autoresearch_data" +CONFIG_DIR = Path.home() / ".memoryport" +AUTORESEARCH_CONFIG = CONFIG_DIR / "uc_autoresearch.toml" +SERVER_BIN = ROOT / "target" / "debug" / "uc-server" + +# ── Constants ─────────────────────────────────────────────────────────────── +SERVER_PORT = 8091 # Separate from normal server (8090) +SERVER_URL = f"http://127.0.0.1:{SERVER_PORT}" +SAMPLE_SIZE = 100 # Questions per evaluation run +SAMPLE_SEED = 42 # Reproducible sampling + +# ── HTTP Session ──────────────────────────────────────────────────────────── +_http = requests.Session() + + +def load_experiment_config() -> dict: + """Load the experiment config from experiment.py.""" + config_path = RESULTS_DIR / "experiment.py" + if not config_path.exists(): + print("ERROR: experiment.py not found. 
def sample_questions(dataset_path: Path, n: int, seed: int) -> list:
    """Draw a reproducible, type-proportional sample of questions.

    The dataset (a JSON list of question dicts) is grouped by
    ``question_type``. Types are visited in sorted order; every type except
    the last receives ``max(1, round(n * share))`` questions, the last type
    receives whatever is left of the budget, and each allotment is capped by
    the remaining budget and the size of the group. The combined sample is
    shuffled before it is returned.

    Args:
        dataset_path: Path of the JSON dataset file.
        n: Total number of questions requested.
        seed: Seed for the private ``random.Random`` instance.

    Returns:
        The sampled question dicts in shuffled order.
    """
    import random

    with open(dataset_path) as handle:
        questions = json.load(handle)

    # Bucket the questions per type, preserving file order inside a bucket.
    groups = {}
    for item in questions:
        groups.setdefault(item["question_type"], []).append(item)

    population = len(questions)
    ordered_types = sorted(groups)
    picker = random.Random(seed)

    chosen = []
    budget = n
    for position, qtype in enumerate(ordered_types, start=1):
        pool = groups[qtype]
        if position == len(ordered_types):
            wanted = budget  # the last type absorbs the remainder
        else:
            wanted = max(1, round(n * len(pool) / population))
        take = min(wanted, budget, len(pool))
        budget -= take
        chosen += picker.sample(pool, take)

    picker.shuffle(chosen)
    return chosen
+ """ + try: + import tomllib # Python 3.11+ + except ModuleNotFoundError: + import tomli as tomllib # pip install tomli for 3.9/3.10 + + base_config_path = CONFIG_DIR / "uc.toml" + if base_config_path.exists(): + with open(base_config_path, "rb") as f: + base = tomllib.load(f) + else: + base = {} + + # Merge experiment retrieval overrides into base config + retrieval = experiment_config.get("retrieval", {}) + if "retrieval" not in base: + base["retrieval"] = {} + base["retrieval"].update(retrieval) + + # Inject OPENAI_API_KEY into embeddings config if available + openai_key = os.environ.get("OPENAI_API_KEY") + if openai_key and "embeddings" in base: + base["embeddings"]["api_key"] = openai_key + + # Override embeddings model/dimensions if experiment specifies them + emb_overrides = experiment_config.get("embeddings", {}) + if emb_overrides: + if "embeddings" not in base: + base["embeddings"] = {} + base["embeddings"].update(emb_overrides) + # Also update index embedding_dimensions to match + if "dimensions" in emb_overrides: + if "index" not in base: + base["index"] = {} + base["index"]["embedding_dimensions"] = emb_overrides["dimensions"] + + # Override index path to use the isolated autoresearch data directory + if "index" not in base: + base["index"] = {} + base["index"]["path"] = str(DATA_DIR / "index") + + # Serialize back to TOML manually (simple flat structure) + lines = ["# Autoresearch config (auto-generated, do not edit)"] + for section, values in base.items(): + if isinstance(values, dict): + lines.append(f"\n[{section}]") + for k, v in values.items(): + if isinstance(v, bool): + lines.append(f"{k} = {'true' if v else 'false'}") + elif isinstance(v, str): + lines.append(f'{k} = "{v}"') + elif isinstance(v, float): + lines.append(f"{k} = {v}") + else: + lines.append(f"{k} = {v}") + else: + # Top-level scalar + if isinstance(values, bool): + lines.append(f"{section} = {'true' if values else 'false'}") + elif isinstance(values, str): + 
def build_server() -> bool:
    """Build uc-server with cargo.

    Returns:
        True when ``cargo build -p uc-server`` exits with status 0, False on a
        non-zero exit, when the 300s timeout is exceeded, or when cargo cannot
        be launched.
    """
    print(" Building uc-server...")
    try:
        result = subprocess.run(
            ["cargo", "build", "-p", "uc-server"],
            cwd=ROOT,
            capture_output=True,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        # Keep the boolean contract instead of surfacing a traceback.
        print(" BUILD FAILED:\ncargo build did not finish within 300s")
        return False
    except FileNotFoundError:
        print(" BUILD FAILED:\ncargo executable not found")
        return False
    if result.returncode != 0:
        print(f" BUILD FAILED:\n{result.stderr[-1000:]}")
        return False
    print(" Build OK")
    return True


def start_server() -> subprocess.Popen:
    """Start uc-server on the autoresearch port and wait until it is healthy.

    Server output is written to a log file next to the generated config
    instead of to pipes: nothing in this harness drains the pipes, so a
    server that logs enough would eventually block on a full pipe buffer.

    Returns:
        The running server process.

    Exits the interpreter with status 1 when the server does not answer
    ``/health`` with 200 within 30 attempts, or terminates during startup.
    """
    env = os.environ.copy()
    env["UC_SERVER_LISTEN"] = f"127.0.0.1:{SERVER_PORT}"
    env["UC_SERVER_DATA_DIR"] = str(DATA_DIR)

    log_path = AUTORESEARCH_CONFIG.with_suffix(".log")
    log_path.parent.mkdir(parents=True, exist_ok=True)
    # The child inherits the descriptor, so our handle can be closed right away.
    with open(log_path, "wb") as log_file:
        proc = subprocess.Popen(
            [str(SERVER_BIN), "--config", str(AUTORESEARCH_CONFIG)],
            env=env,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    # Wait for server to be ready
    for _attempt in range(30):
        if proc.poll() is not None:
            break  # the server died; no point in polling /health any longer
        try:
            r = requests.get(f"{SERVER_URL}/health", timeout=2)
            if r.status_code == 200:
                print(f" Server ready on port {SERVER_PORT} (pid={proc.pid})")
                return proc
        except Exception:
            pass
        time.sleep(1)

    if proc.poll() is None:
        proc.kill()
        proc.wait()
        print(" ERROR: Server failed to start within 30s")
    else:
        print(f" ERROR: Server exited during startup (code {proc.returncode})")
    try:
        tail = log_path.read_text(errors="replace")[-500:]
    except OSError:
        tail = ""
    print(f" stderr: {tail}")
    sys.exit(1)


def stop_server(proc: subprocess.Popen):
    """Stop the server: SIGTERM first, SIGKILL if it has not exited after 10s."""
    proc.send_signal(signal.SIGTERM)
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()  # reap the killed process
    print(" Server stopped")


def clear_index():
    """Clear the autoresearch data directory (per-user indexes live inside)."""
    if DATA_DIR.exists():
        shutil.rmtree(DATA_DIR)
    DATA_DIR.mkdir(parents=True, exist_ok=True)


def parse_session_date(date_str: str):
    """Parse a LongMemEval date string to epoch milliseconds.

    A parenthesised part (for example a weekday, ``"2023/05/20 (Sat) 02:21"``)
    is removed before parsing. Accepted layouts are ``%Y/%m/%d %H:%M``,
    ``%Y/%m/%d``, ``%Y-%m-%d %H:%M`` and ``%Y-%m-%d``.

    The strings carry no timezone, so they are interpreted as UTC. Using the
    local timezone would make the stored timestamps depend on the machine the
    harness runs on, and the server renders timestamps in UTC, so the calendar
    date could shift by a day.

    Returns:
        Epoch milliseconds as an int, or None when the value cannot be parsed.
    """
    from datetime import timezone  # the file itself only imports `datetime`

    if not isinstance(date_str, str):
        return None

    clean = date_str
    if "(" in clean:
        close_idx = clean.find(")")
        if close_idx == -1:
            return None  # unbalanced parenthesis: treat as unparseable
        clean = clean[:clean.index("(")].strip() + " " + clean[close_idx + 1:].strip()
    clean = clean.strip()

    for fmt in ["%Y/%m/%d %H:%M", "%Y/%m/%d", "%Y-%m-%d %H:%M", "%Y-%m-%d"]:
        try:
            dt = datetime.strptime(clean, fmt)
        except ValueError:
            continue
        return int(dt.replace(tzinfo=timezone.utc).timestamp() * 1000)
    return None


def store_turn(text: str, session_id: str, role: str, timestamp: int = None) -> bool:
    """Store a single conversation turn via ``POST /v1/store``.

    Returns:
        True when the server answers 200; False on any other status or when
        the request raises.
    """
    try:
        body = {"text": text, "chunk_type": "conversation",
                "session_id": session_id, "role": role}
        if timestamp is not None:
            body["timestamp"] = timestamp
        r = _http.post(f"{SERVER_URL}/v1/store", json=body, timeout=30)
        return r.status_code == 200
    except Exception:
        return False


def ingest_question(question: dict, max_workers: int = 16) -> int:
    """Ingest all haystack sessions for one question. Returns stored count.

    Session ids are stored as ``"<question_id>_<session_id>"``. Each turn gets
    the session timestamp plus its index within the session, so turns of one
    session keep distinct, ordered timestamps.
    """
    futures = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for sid, sdate, sturns in zip(
            question["haystack_session_ids"],
            question["haystack_dates"],
            question["haystack_sessions"],
        ):
            ts = parse_session_date(sdate)
            full_sid = f"{question['question_id']}_{sid}"
            for idx, turn in enumerate(sturns):
                # `is not None`: an epoch of 0 is a valid timestamp.
                turn_ts = (ts + idx) if ts is not None else None
                futures.append(pool.submit(store_turn, turn["content"],
                                           full_sid, turn["role"], turn_ts))
    return sum(1 for f in as_completed(futures) if f.result())
Return ONLY the alternatives, one per line.\n\n" + f"Query: {query}" + ), + }], "gpt-4o-mini", max_tokens=150) + return [ + line.strip().lstrip("0123456789.-) ") + for line in response.strip().split("\n") + if line.strip() and len(line.strip()) > 5 + ][:3] + except Exception: + return [] + + +def _do_retrieve(query: str, top_k: int, reference_time: int = None): + """Single retrieve call to the server.""" + body = {"query": query, "top_k": top_k} + if reference_time: + body["reference_time"] = reference_time + r = _http.post(f"{SERVER_URL}/v1/retrieve", json=body, timeout=60) + if r.status_code != 200: + return [] + return r.json().get("results", []) + + +def _get_full_session(session_id: str): + """Retrieve all chunks for a session via /v1/sessions/{id}.""" + try: + r = _http.get(f"{SERVER_URL}/v1/sessions/{session_id}", timeout=30) + if r.status_code == 200: + return r.json().get("chunks", []) + except Exception: + pass + return [] + + +def _needs_decomposition(query: str) -> bool: + """Heuristic: does this query mention multiple entities or need aggregation?""" + q = query.lower() + # Multi-entity comparisons + if " or " in q and ("which" in q or "first" in q or "before" in q or "after" in q): + return True + # Aggregation / exhaustive + if any(w in q for w in ["how many", "how much total", "total money", "total time", + "all the", "list all", "every time"]): + return True + # Temporal ordering of multiple events + if any(w in q for w in ["what order", "in order", "chronological", "sequence"]): + return True + return False + + +def _decompose_query(query: str): + """Decompose a multi-entity question into sub-queries.""" + try: + response = call_llm([{ + "role": "user", + "content": ( + "This question requires finding information about multiple specific " + "topics/events/items in a conversation history. 
def _merge_unseen(results: list, seen_ids: set, candidates: list) -> None:
    """Append to ``results`` every candidate whose chunk_id is not in ``seen_ids``."""
    for item in candidates:
        cid = item.get("chunk_id")
        if cid not in seen_ids:
            seen_ids.add(cid)
            results.append(item)


def _session_recall(qid: str, results: list, answer_session_ids) -> tuple:
    """Compute session recall for one question.

    Stored session ids have the form ``"<question_id>_<session_id>"``. The
    question prefix is removed by its exact length; cutting at the first
    underscore is wrong whenever the question id itself contains an
    underscore, because the remainder then never matches an answer session.

    Returns:
        ``(recall, hit_count, answer_session_count)``.
    """
    prefix = f"{qid}_"
    retrieved = set()
    for res in results:
        sid = res.get("session_id") or ""
        if sid.startswith(prefix):
            retrieved.add(sid[len(prefix):])

    answer_sids = set(answer_session_ids)
    hits = answer_sids & retrieved
    recall = len(hits) / len(answer_sids) if answer_sids else 0.0
    return recall, len(hits), len(answer_sids)


def retrieve(question: dict, top_k: int = 50, expand_queries: bool = False,
             session_expansion: bool = False, query_decomposition: bool = False,
             max_expanded_sessions: int = 5) -> dict:
    """Retrieve context for one question, with optional enhancements.

    Args:
        question: LongMemEval question dict (``question_id``, ``question``,
            ``question_type``; optionally ``question_date`` and
            ``answer_session_ids``).
        top_k: Result count requested for the primary query. Sub-queries and
            expansions request a third of it (at least 1).
        expand_queries: Also retrieve for LLM-generated rephrasings.
        session_expansion: Add every chunk of the best-scoring sessions.
        query_decomposition: ``"always"`` decomposes every query,
            ``"adaptive"`` only those flagged by ``_needs_decomposition``.
            Any other value, including ``True``, disables decomposition.
        max_expanded_sessions: Number of sessions expanded by
            ``session_expansion``.

    Returns:
        On success a dict with ``qid``, ``question_type``, ``session_recall``,
        ``hits``, ``answer_sessions``, ``latency_ms``, ``num_results`` and
        ``context`` (chunk contents in result order). When retrieval raises:
        ``{"qid", "error", "latency_ms": 0}``.
    """
    qid = question["question_id"]
    query = question["question"]
    qdate = question.get("question_date")
    ref_ts = parse_session_date(qdate) if qdate else None
    # Never ask the server for zero results when top_k is below 3.
    sub_top_k = max(1, top_k // 3)

    start = time.time()
    try:
        # Primary retrieval
        results = _do_retrieve(query, top_k, ref_ts)
        seen_ids = {r.get("chunk_id") for r in results}

        # Optional: query decomposition for multi-entity questions.
        # "adaptive" mode only decomposes when the query looks multi-entity/aggregation.
        if query_decomposition == "always" or (
                query_decomposition == "adaptive" and _needs_decomposition(query)):
            sub_queries = _decompose_query(query)
            if len(sub_queries) > 1:  # Only if actually decomposed
                for sq in sub_queries:
                    _merge_unseen(results, seen_ids, _do_retrieve(sq, sub_top_k, ref_ts))

        # Optional: Python-side query expansion
        if expand_queries:
            for exp_query in _expand_query(query):
                _merge_unseen(results, seen_ids, _do_retrieve(exp_query, sub_top_k, ref_ts))

        # Optional: session expansion — for top-scoring sessions,
        # retrieve ALL turns from those sessions (not just matched chunks)
        if session_expansion and results:
            session_scores = {}
            for r in results:
                sid = r.get("session_id", "")
                score = r.get("score", 0)
                session_scores[sid] = max(session_scores.get(sid, 0), score)

            top_sessions = sorted(session_scores.items(), key=lambda x: -x[1])
            top_sessions = top_sessions[:max_expanded_sessions]

            for sid, _score in top_sessions:
                for chunk in _get_full_session(sid):
                    cid = chunk.get("chunk_id", "")
                    if cid and cid not in seen_ids:
                        seen_ids.add(cid)
                        results.append({
                            "chunk_id": cid,
                            "session_id": sid,
                            "content": chunk.get("content", ""),
                            "score": 0.0,  # No vector score for expanded chunks
                            "timestamp": chunk.get("timestamp", 0),
                            "role": chunk.get("role"),
                        })

        latency_ms = (time.time() - start) * 1000
    except Exception as e:
        return {"qid": qid, "error": str(e), "latency_ms": 0}

    recall, hit_count, answer_count = _session_recall(
        qid, results, question.get("answer_session_ids", []))

    return {
        "qid": qid,
        "question_type": question["question_type"],
        "session_recall": recall,
        "hits": hit_count,
        "answer_sessions": answer_count,
        "latency_ms": latency_ms,
        "num_results": len(results),
        "context": [res.get("content", "") for res in results],
    }
model.startswith("claude"): + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + raise ValueError("ANTHROPIC_API_KEY not set") + r = requests.post( + "https://api.anthropic.com/v1/messages", + headers={"x-api-key": api_key, "anthropic-version": "2023-06-01", + "content-type": "application/json"}, + json={"model": model, "max_tokens": max_tokens, "messages": messages}, + timeout=120, + ) + r.raise_for_status() + return r.json()["content"][0]["text"] + else: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY not set") + r = requests.post( + "https://api.openai.com/v1/chat/completions", + headers={"Authorization": f"Bearer {api_key}", + "Content-Type": "application/json"}, + json={"model": model, "max_tokens": max_tokens, "messages": messages}, + timeout=120, + ) + r.raise_for_status() + return r.json()["choices"][0]["message"]["content"] + + +def generate_answer(question: str, context: list, model: str, + question_date: str = None, context_chunks: int = 20, + prompt_style: str = "default") -> str: + """Generate answer from retrieved context.""" + ctx_text = "\n\n---\n\n".join(context[:context_chunks]) + date_line = f"The question was asked on: {question_date}\n\n" if question_date else "" + + if prompt_style == "knowledge-aware": + # Knowledge-update-aware prompt: explicitly tells LLM to prefer latest info + prompt = ( + f"You are answering a question based on your conversation history with the user.\n\n" + f"{date_line}" + f"Retrieved conversation history:\n{ctx_text}\n\n" + f"Question: {question}\n\n" + f"IMPORTANT: Information may have been updated over time. When you find " + f"multiple values for the same fact (e.g., a count, price, or status), " + f"ALWAYS use the most recent one based on conversation dates. Explicitly " + f"note if information was updated.\n\n" + f"For temporal/time questions, identify specific dates mentioned and " + f"compute differences step by step. 
Show your date arithmetic.\n\n" + f"For counting/aggregation questions, enumerate every distinct item or " + f"event found in the history before giving a total. Do not guess.\n\n" + f"Answer concisely." + ) + elif prompt_style == "extract-then-reason": + # LongMemEval paper's "con" strategy: extract relevant facts first, then reason + prompt = ( + f"You are answering a question based on your conversation history with the user.\n\n" + f"{date_line}" + f"Retrieved conversation history:\n{ctx_text}\n\n" + f"Question: {question}\n\n" + f"Follow these steps:\n" + f"1. EXTRACT: List all facts from the conversation history that are relevant " + f"to answering this question. Include dates, names, and specific details.\n" + f"2. REASON: Using only the extracted facts, reason step by step to arrive " + f"at the answer. For temporal questions, explicitly calculate time differences. " + f"For questions about order, explicitly compare dates.\n" + f"3. ANSWER: State your final answer concisely.\n" + ) + else: + prompt = ( + f"You are answering a question based on your conversation history with " + f"the user. Use the retrieved conversation excerpts below to answer.\n\n" + f"{date_line}" + f"Retrieved conversation history:\n{ctx_text}\n\n" + f"Question: {question}\n\n" + f"Answer the question concisely based on the conversation history above. " + f"Extract all relevant information and reason step by step if needed. " + f"Pay attention to dates and temporal ordering of events." + ) + + return call_llm([{"role": "user", "content": prompt}], model, max_tokens=768) + + +# Type-specific judge prompts (matching MemoryBench methodology) +JUDGE_BASE = ( + "I will give you a question, a correct answer, and a response from a model. " + "Please answer yes if the response contains the correct answer. Otherwise, " + "answer no. If the response is equivalent to the correct answer or contains " + "all the intermediate steps to get the correct answer, you should also answer " + "yes. 
JUDGE_TEMPORAL_EXTRA = (
    " In addition, do not penalize off-by-one errors for the number of days. If "
    "the question asks for the number of days/weeks/months, etc., and the model "
    "makes off-by-one errors (e.g., predicting 19 days when the answer is 18), "
    "the model's response is still correct."
)
JUDGE_KNOWLEDGE_UPDATE_EXTRA = (
    " If the response contains some previous information along with an updated "
    "answer, the response should be considered as correct as long as the updated "
    "answer is the required answer."
)


def _verdict_is_correct(response: str) -> bool:
    """Read the judge verdict from the first line of its response.

    The judge prompt asks for "yes"/"no" in its base instructions and for
    "correct"/"incorrect" in its closing line, so both affirmative forms are
    accepted. Leading non-letters (quotes, markdown emphasis) are ignored.
    """
    import re  # local import: `re` is not among the file's top-level imports

    first_line = response.strip().split("\n")[0]
    return re.match(r"[^A-Za-z]*(?:correct|yes\b)", first_line, re.IGNORECASE) is not None


def judge_answer(question: str, ground_truth: str, predicted: str,
                 model: str, question_type: str = None) -> dict:
    """LLM-as-judge evaluation.

    Args:
        question: The benchmark question.
        ground_truth: Reference answer.
        predicted: Answer produced by the answer model.
        model: Judge model name passed to ``call_llm``.
        question_type: Adds the temporal or knowledge-update addendum to the
            judge instructions for those two types.

    Returns:
        ``{"correct": bool, "judge_response": str}``.
    """
    instructions = JUDGE_BASE
    if question_type == "temporal-reasoning":
        instructions += JUDGE_TEMPORAL_EXTRA
    elif question_type == "knowledge-update":
        instructions += JUDGE_KNOWLEDGE_UPDATE_EXTRA

    response = call_llm([{
        "role": "user",
        "content": (
            f"{instructions}\n\n"
            f"Question: {question}\n\nCorrect Answer: {ground_truth}\n\n"
            f"Model Response: {predicted}\n\n"
            f"Respond with EXACTLY one word on the first line: 'correct' or 'incorrect'\n"
            f"Then on the next line, a brief explanation."
        ),
    }], model, max_tokens=256)

    return {"correct": _verdict_is_correct(response), "judge_response": response.strip()}


def run_evaluation(questions: list, experiment_config: dict) -> dict:
    """Run full retrieval + answer accuracy evaluation.

    Phase 1 retrieves context for every question. Phase 2 generates an answer
    and has it judged. Questions whose retrieval failed are recorded as
    ``skipped``; questions whose answer or judge call raised are recorded with
    ``error_answer``. Neither kind counts towards the aggregates.

    Returns:
        ``{"summary": {...}, "results": [...]}`` where the summary holds
        ``answer_accuracy``, ``session_recall``, ``latency_p50``,
        ``latency_p95``, ``evaluated``, ``correct`` and ``type_accuracy``.
    """
    top_k = experiment_config.get("retrieval", {}).get("similarity_top_k", 50)
    answer_model = experiment_config.get("answer_model", "gpt-4o-mini")
    judge_model = experiment_config.get("judge_model", "gpt-4o-mini")
    context_chunks = experiment_config.get("context_chunks", 20)
    prompt_style = experiment_config.get("prompt_style", "default")

    # Phase 1: Retrieve
    print("\n [2/3] Retrieving context...")
    retrievals = []
    for i, q in enumerate(questions):
        r = retrieve(
            q, top_k=top_k,
            expand_queries=experiment_config.get("expand_queries", False),
            session_expansion=experiment_config.get("session_expansion", False),
            query_decomposition=experiment_config.get("query_decomposition", False),
            max_expanded_sessions=experiment_config.get("max_expanded_sessions", 5),
        )
        retrievals.append(r)
        if (i + 1) % 20 == 0:
            recalls = [x["session_recall"] for x in retrievals if "session_recall" in x]
            # statistics.mean raises on an empty list (every retrieval failed so far).
            avg = f"{statistics.mean(recalls):.2%}" if recalls else "n/a"
            print(f" [{i+1}/{len(questions)}] Avg recall: {avg}")

    # Phase 2: Answer + Judge
    print("\n [3/3] Generating answers and judging...")
    results = []
    correct = 0
    evaluated = 0

    for i, (q, ret) in enumerate(zip(questions, retrievals)):
        if "error" in ret:
            results.append({**ret, "answer_correct": False, "skipped": True})
            continue

        try:
            answer = generate_answer(
                q["question"], ret.get("context", []), answer_model,
                question_date=q.get("question_date"),
                context_chunks=context_chunks,
                prompt_style=prompt_style,
            )
            judgment = judge_answer(
                q["question"], q["answer"], answer, judge_model,
                question_type=q.get("question_type"),
            )
            evaluated += 1
            if judgment["correct"]:
                correct += 1

            results.append({
                "qid": ret["qid"],
                "question_type": q["question_type"],
                "question": q["question"],
                "ground_truth": q["answer"],
                "llm_answer": answer,
                "answer_correct": judgment["correct"],
                "judge_response": judgment["judge_response"],
                "session_recall": ret["session_recall"],
                "latency_ms": ret["latency_ms"],
            })

            if (i + 1) % 10 == 0:
                acc = correct / evaluated if evaluated else 0
                print(f" [{i+1}/{len(questions)}] Accuracy: {acc:.2%} ({correct}/{evaluated})")

        except Exception as e:
            print(f" [{i+1}/{len(questions)}] ERROR: {e}")
            results.append({**ret, "answer_correct": False, "error_answer": str(e)})

    # Aggregate over questions that were fully evaluated
    valid = [r for r in results if not r.get("skipped") and "error_answer" not in r]
    by_type = {}
    for r in valid:
        by_type.setdefault(r["question_type"], []).append(r)

    type_accuracy = {}
    for t, rs in sorted(by_type.items()):
        type_accuracy[t] = sum(1 for r in rs if r["answer_correct"]) / len(rs) if rs else 0

    latencies = [r["latency_ms"] for r in valid]
    recalls = [r["session_recall"] for r in valid]

    summary = {
        "answer_accuracy": correct / evaluated if evaluated else 0,
        "session_recall": statistics.mean(recalls) if recalls else 0,
        "latency_p50": statistics.median(latencies) if latencies else 0,
        "latency_p95": sorted(latencies)[int(len(latencies) * 0.95)] if latencies else 0,
        "evaluated": evaluated,
        "correct": correct,
        "type_accuracy": type_accuracy,
    }

    return {"summary": summary, "results": results}
questions (default: {SAMPLE_SIZE})") + args = parser.parse_args() + + experiment_config = load_experiment_config() + + # Verify required env vars + if not os.environ.get("OPENAI_API_KEY"): + print("ERROR: OPENAI_API_KEY environment variable not set.") + print(" export OPENAI_API_KEY='sk-...'") + sys.exit(1) + + print(f"{'='*70}") + print(f"AUTORESEARCH BENCHMARK RUN") + print(f"{'='*70}") + print(f" Dataset: longmemeval_{args.dataset}") + print(f" Questions: {args.questions}") + print(f" Config: {json.dumps(experiment_config.get('retrieval', {}), indent=2)}") + + # Build + if not args.skip_build: + if not build_server(): + sys.exit(1) + + # Write config + write_toml_config(experiment_config) + + # Start server + proc = start_server() + + try: + # Sample questions + dataset_name = f"longmemeval_{args.dataset}_cleaned.json" if args.dataset == "s" else "longmemeval_oracle.json" + dataset_path = DATASET_DIR / dataset_name + questions = sample_questions(dataset_path, args.questions, SAMPLE_SEED) + print(f" Sampled {len(questions)} questions") + + types = {} + for q in questions: + types[q["question_type"]] = types.get(q["question_type"], 0) + 1 + for t, c in sorted(types.items()): + print(f" {t}: {c}") + + # Ingest + if not args.skip_ingest: + clear_index() + print("\n [1/3] Ingesting haystacks...") + total = 0 + for i, q in enumerate(questions): + stored = ingest_question(q) + total += stored + if (i + 1) % 10 == 0: + print(f" [{i+1}/{len(questions)}] Ingested {total} turns") + print(f" Total: {total} turns") + # Wait for indexing to settle + time.sleep(2) + else: + print("\n [1/3] Skipping ingestion") + + # Evaluate + eval_result = run_evaluation(questions, experiment_config) + summary = eval_result["summary"] + + # Print results + print(f"\n{'='*70}") + print(f"RESULTS") + print(f"{'='*70}") + print(f" Answer Accuracy: {summary['answer_accuracy']:.2%} ({summary['correct']}/{summary['evaluated']})") + print(f" Session Recall: {summary['session_recall']:.2%}") + 
print(f" Latency p50: {summary['latency_p50']:.0f}ms") + print(f" Latency p95: {summary['latency_p95']:.0f}ms") + print(f"\n By Type:") + for t, acc in sorted(summary["type_accuracy"].items()): + print(f" {t:<35s} {acc:.2%}") + + # Output parseable line for agent + print(f"\n{'='*70}") + print(f"PARSEABLE:") + type_str = " ".join(f"{t}={acc:.4f}" for t, acc in sorted(summary["type_accuracy"].items())) + print(f"overall_accuracy={summary['answer_accuracy']:.4f} " + f"session_recall={summary['session_recall']:.4f} " + f"latency_p50={summary['latency_p50']:.0f} " + f"latency_p95={summary['latency_p95']:.0f} " + f"{type_str}") + + # Save full results + timestamp = time.strftime("%Y%m%d_%H%M%S") + commit_hash = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, cwd=ROOT, + ).stdout.strip() + + output_path = RESULTS_DIR / f"run_{timestamp}_{commit_hash}.json" + with open(output_path, "w") as f: + json.dump({ + "config": experiment_config, + "summary": summary, + "results": eval_result["results"], + "timestamp": timestamp, + "commit": commit_hash, + }, f, indent=2) + print(f"\n Full results: {output_path}") + + finally: + stop_server(proc) + + +if __name__ == "__main__": + main() diff --git a/tests/longmemeval/autoresearch/program.md b/tests/longmemeval/autoresearch/program.md new file mode 100644 index 0000000..ed6c591 --- /dev/null +++ b/tests/longmemeval/autoresearch/program.md @@ -0,0 +1,104 @@ +# Memoryport LongMemEval Autoresearch Program + +## Objective + +Maximize LongMemEval answer accuracy on `longmemeval_s` (the standard difficulty split with ~115K token haystacks per question) while keeping query latency at 500M token scale under 500ms. 
+ +## Optimization Target + +**Primary metric:** Answer accuracy (%) on a 100-question balanced sample from `longmemeval_s` +**Secondary metric:** Session recall (%) — must not regress below baseline +**Constraint:** Query latency p50 at 500M tokens must stay under 500ms (test with scale benchmark if architectural changes are made) + +## What You Can Modify + +You are an AI research agent. You may modify ANY Rust source code in the `crates/uc-core/src/` directory AND the experiment config. The key files are: + +### Config Parameters (fast to test — no recompile needed if exposed via config) +- `similarity_top_k` (default: 50) — candidate pool size +- `min_relevance_score` (default: 0.3) — quality gate threshold +- `recency_window` (default: 20) — recent chunks to include +- `rerank` (default: false) — enable heuristic reranking +- `query_expansion` (default: false) — LLM-based query reformulation +- `hyde` (default: false) — Hypothetical Document Embeddings +- `max_context_tokens` (default: 50,000) — token budget for assembly + +### Retriever Constants (require `cargo build`) +- RRF k constant (default: 60.0) in `retriever.rs` +- Session diversity cap (default: 5 per session) in `retriever.rs` +- Expanded query top_k divisor (default: /3) in `retriever.rs` +- Explicit session top_k (default: 20) in `retriever.rs` + +### Reranker Parameters (require `cargo build`) +- `recency_half_life_ms` (default: 86,400,000 = 1 day) +- `session_affinity_boost` (default: 1.2) +- `diversity_lambda` (default: 0.7) — MMR tradeoff +- Recency weight split (default: 0.7 base + 0.3 recency) + +### Gate Parameters (require `cargo build`) +- Gate 2 `retrieve_bias` (default: 0.05) in `gate.rs` +- Gate 1 patterns in `analyzer.rs` +- Gate 2 exemplars (20 retrieve + 20 skip) in `gate.rs` + +### Chunker Parameters (require `cargo build` + re-ingest) +- `target_size` (default: 1,500 chars) +- `overlap` (default: 200 chars) + +### Enhancer Parameters (require `cargo build`) +- Expansion count 
(default: 5) +- HyDE prompt text +- Query expansion prompt text + +### Assembler Parameters (require `cargo build`) +- Context format / XML structure +- Dedup fingerprint length (default: 100 chars) +- Token budget allocation strategy + +## Experiment Rules + +1. **One change at a time.** Each experiment should test a single hypothesis. If you want to test a combination, first test each component individually. + +2. **Always build before running.** If you modified Rust code, run `cargo build -p uc-server` and verify it succeeds before running the benchmark. + +3. **Never modify `prepare.py`** — it is the immutable benchmark harness. + +4. **Never modify `program.md`** — these are your instructions. + +5. **Log every experiment** in `results.tsv` with: commit hash, overall accuracy, per-type accuracy breakdown, session recall, latency p50, description of change. + +6. **Revert failed experiments.** If accuracy drops, revert the change before trying the next experiment. Use `git checkout -- <path>` to revert. + +7. **Build time budget:** Each experiment cycle (build + ingest + evaluate) should complete within 30 minutes. If an experiment will take longer, skip it and note why. + +8. **The `/v1/retrieve` endpoint bypasses gating.** The benchmark calls `/v1/retrieve` directly, so Gate 1 and Gate 2 do NOT affect benchmark results. Focus on retrieval algorithm quality, not gating. + +9. **Temporal reasoning is the weakest category.** Prioritize experiments that improve temporal reasoning without hurting other categories. + +10. **The `reference_time` parameter is available.** The benchmark passes the question date as `reference_time` for temporal queries. Make sure temporal filtering logic uses this correctly. 
+ +## Research Directions (suggested priority order) + +### Phase 1: Low-hanging fruit (config-only) +- [ ] Enable reranking and measure impact +- [ ] Enable query expansion (with OpenAI) and measure impact +- [ ] Enable HyDE and measure impact +- [ ] Tune `similarity_top_k` (try 30, 75, 100) +- [ ] Tune `min_relevance_score` (try 0.1, 0.2, 0.5) + +### Phase 2: Retrieval algorithm improvements +- [ ] Improve temporal range detection for LongMemEval-style questions +- [ ] Add temporal boosting: boost results closer to `reference_time` in scoring +- [ ] Tune the RRF k constant (default 60; try k=20, k=40, k=80) +- [ ] Tune the session diversity cap (default 5 per session; try 3, 8, 10) +- [ ] Improve fact-based retrieval for knowledge-update questions + +### Phase 3: Deeper architectural changes +- [ ] Add BM25/keyword hybrid search alongside vector search +- [ ] Implement cross-encoder reranking (using OpenAI or local model) +- [ ] Improve chunk boundaries for multi-turn conversations +- [ ] Add session-level summarization as an additional retrieval key +- [ ] Implement query decomposition for multi-session questions + +## Baseline + +Run `prepare.py` with default config to establish baseline metrics before making any changes. 
diff --git a/tests/longmemeval/autoresearch/results.tsv b/tests/longmemeval/autoresearch/results.tsv new file mode 100644 index 0000000..995a0f4 --- /dev/null +++ b/tests/longmemeval/autoresearch/results.tsv @@ -0,0 +1,44 @@ +commit timestamp overall_accuracy session_recall latency_p50 latency_p95 knowledge_update multi_session single_session_assistant single_session_preference single_session_user temporal_reasoning description +f352f58 20260328_012151 0.5100 0.6008 321 558 0.5625 0.3704 0.8182 0.5000 0.7857 0.3462 Baseline: default config, no reranking, no expansion +f352f58 20260328_014932 0.5000 0.5983 370 574 0.6250 0.4074 0.9091 0.3333 0.7857 0.2308 Exp 1: enable heuristic reranking (REVERTED - temporal dropped to 23%) +f352f58 20260328_020000 0.5200 0.6008 351 492 0.6250 0.4074 0.8182 0.3333 0.8571 0.3077 Exp 2: min_relevance 0.1 (noise - config doesnt affect search() path) +f352f58 20260328_024836 0.3500 0.6008 2283 3287 0.4375 0.1852 0.7273 0.5000 0.4286 0.2308 Exp 3: full hybrid pipeline for /v1/retrieve (REVERTED - accuracy+latency regressed) +f352f58 20260328_031736 0.5200 0.6475 319 505 0.6875 0.3333 0.8182 0.3333 0.7857 0.3846 Exp 4: top_k=150 + temporal fallback (+4.7% recall, +12.5% knowledge-update) +f352f58 20260328_034615 0.5000 0.6608 701 996 0.6250 0.3333 0.8182 0.3333 0.9286 0.2692 Exp 5: content re-query (REVERTED - +recall but -accuracy, 2x latency, noise dilution) +f352f58 20260328_041719 0.5500 0.6475 370 655 0.5625 0.4074 0.8182 0.6667 0.7857 0.4231 Exp 6: 40 context chunks to LLM (NEW BEST +4% accuracy, all weak types improved) +f352f58 20260328_045020 0.5000 0.6608 348 509 0.6875 0.2222 0.7273 0.5000 0.8571 0.3846 Exp 7: top_k=200 + 60 chunks (REVERTED - too much context dilutes signal) +f352f58 20260328_051509 0.6100 0.6475 337 485 0.6250 0.4074 1.0000 1.0000 0.8571 0.4231 Exp 8: gpt-4o answer model (NEW BEST +10% accuracy, 2 types at 100%) +f352f58 20260328_054558 0.5500 0.6642 2738 3468 0.5625 0.3333 0.9091 0.6667 0.8571 0.4231 Exp 9: 
query expansion (REVERTED - +recall but -accuracy, 8x latency) +f352f58 20260328_061325 0.5900 0.6475 347 617 0.5000 0.3704 1.0000 1.0000 0.8571 0.4615 Exp 10: extract-then-reason prompt (+temporal but -knowledge-update) +f352f58 20260328_064259 0.5200 0.6658 516 718 0.5000 0.3333 1.0000 0.5000 0.7857 0.3846 Exp 11: embedding-3-large 3072d (+recall but -accuracy, score distribution change) +f352f58 20260328_071034 0.5600 0.6608 372 610 0.7500 0.3333 1.0000 0.8333 0.7857 0.3077 Exp 12: embedding-3-large@1536 Matryoshka (+knowledge but -temporal) +f352f58 20260328_073552 0.5800 0.6475 318 468 0.6875 0.3704 1.0000 0.6667 0.7857 0.4231 Exp 13: gpt-4o judge (stricter than mini — Exp 8 score is real) +4de8a32 20260328_100426 0.5200 0.6475 609 714 0.5625 0.2963 1.0000 0.5000 0.8571 0.3462 Exp 14: Python session expansion (REGRESSED — full sessions flood context) +4de8a32 20260328_101257 0.5100 0.6433 1420 1627 0.5625 0.3333 0.9091 0.6667 0.8571 0.2692 Exp 14a: Rust session expansion + facts (REGRESSED — expansion floods context + 4x latency) +4de8a32 20260328_102056 0.5600 0.6475 1265 1437 0.5625 0.3704 0.9091 0.8333 0.7857 0.4231 Exp 15: Rust fact search only (facts table near-empty, +latency for no gain) +4de8a32 20260328_103334 0.5341 0.6278 1299 1476 0.6154 0.2609 1.0000 0.3333 0.8462 0.4800 Exp 16: knowledge-aware prompt (temporal 48%, but multi-session crashed) +4de8a32 20260328_104624 0.5700 0.6558 4586 7547 0.6250 0.3704 0.9091 0.3333 0.8571 0.5000 Exp 17: Python query decomposition (temporal 50%, but hurts simple categories) +4de8a32 20260328_105722 0.5800 0.6475 1320 1563 0.5000 0.3704 1.0000 0.5000 0.8571 0.5385 Exp 18: Rust decomposition broad (temporal 54% RECORD, but 45/100 trigger too much) +4de8a32 20260328_110649 0.6100 0.6475 1276 1502 0.5625 0.4444 1.0000 0.6667 0.8571 0.5000 Exp 19: Rust decomposition tightened (61% stale-index, temporal 50%, multi 44%) +4de8a32 20260328_115042 0.5600 0.6475 351 1025 0.5000 0.3704 1.0000 0.6667 0.8571 0.4231 Exp 21: 
decomp + date-text expansion fresh ingest (date-text hurts, removed) +4de8a32 20260328_121812 0.5800 0.6475 341 957 0.5000 0.4074 1.0000 1.0000 0.7857 0.4231 Exp 22: decomp only fresh ingest (still -3% vs Exp 8, decomp reverted) +4de8a32 20260328_124544 0.6300 0.6475 342 525 0.6250 0.4444 1.0000 0.8333 0.8571 0.5000 Exp 23: temporal fallback only (BEST 63%, temporal 50%) +f0cbcee 20260328_135841 0.5657 0.6540 378 809 0.4375 0.4444 0.9091 0.6667 0.8571 0.4400 Exp 24: BM25 always-on hybrid (noise dilution, -6.4% vs Exp 23) +f0cbcee 20260328_140713 0.5800 0.6475 1317 1461 0.5625 0.4074 1.0000 0.5000 0.8571 0.4615 Exp 25: BM25 conditional fallback (still below Exp 23) +f0cbcee 20260328_151159 0.4500 0.6475 329 563 0.5625 0.2593 0.9091 0.1667 0.7857 0.2692 Exp 26: session-grouped ordering (catastrophic) +f0cbcee 20260328_162239 0.5800 0.6475 506 716 0.5625 0.4815 0.9091 0.5000 0.8571 0.4231 Exp 27: NDCG session retrieval (multi-session 48% best, but overall 58%) +f0cbcee 20260328_172842 0.6000 0.6517 343 483 0.5000 0.3704 1.0000 0.5000 0.8571 0.6154 Exp 28: date-enriched embeddings (temporal 61.5% RECORD) +f0cbcee 20260328_182602 0.5400 0.6517 329 521 0.4375 0.3333 1.0000 0.5000 0.8571 0.4615 Exp 29: full enrichment date+context+facts (context/facts hurt, reverted to date-only) +f0cbcee 20260328_192551 0.6100 0.6517 341 496 0.6250 0.4444 1.0000 1.0000 0.8571 0.3846 Exp 30: date-only enrichment validation (61%, temporal high-variance) +f1daa23 20260328_210231 0.5500 0.6733 439 611 0.5625 0.3333 0.9091 0.5000 0.8571 0.4615 Exp 31: LLM memory extraction (recall 67.3% BEST, but memories flood context -accuracy) +3578855 20260328_231831 0.6100 0.6617 640 1054 0.5625 0.4074 1.0000 0.6667 0.8571 0.5385 Exp 33: date+temporal+statement+BM25 entity (recall 66.2% best, but 640ms latency) +3578855 20260328_232614 0.6000 0.6517 1261 1450 0.7500 0.3704 1.0000 0.5000 0.8571 0.4615 Exp 34: no statement re-query (stale index inflated latency) +db644f3 20260329_013628 0.5500 0.6517 342 
542 0.5000 0.4074 1.0000 0.8333 0.7857 0.3462 Exp 36: chronological assembler (55%, LLM variance low run) +db644f3 20260329_014520 0.5900 0.6517 1328 1627 0.5625 0.2963 1.0000 0.8333 0.8571 0.5385 Exp 37: date-prefixed retrieve content (stale index latency) +8d7ed8e 20260329_023555 0.6200 0.6558 324 517 0.6250 0.4444 0.8182 0.6667 0.8571 0.5769 Exp 38: round-level storage (temporal 58%, multi 44%, but assistant dropped 82%) +8d7ed8e 20260329_035844 0.5900 0.6425 417 550 0.5625 0.3704 1.0000 0.8333 0.8571 0.4615 Exp 39: round+raw assistant 3x chunks (300GB bloat, reverted to round-only) +4073253 20260329_044815 0.5900 0.6558 298 447 0.5625 0.4074 0.7273 0.6667 0.8571 0.5769 Exp 40: round-only validation (298ms p50, temporal 58%, assistant 73%) +82de89e 20260329_055326 0.5800 0.6517 365 531 0.5625 0.2963 1.0000 0.8333 0.7857 0.5385 Exp 41: BM25 entity fallback score<0.4 (doesn't trigger, scores above threshold) +f0cbcee 20260328_151159 0.4500 0.6475 329 563 0.5625 0.2593 0.9091 0.1667 0.7857 0.2692 Exp 26: session-grouped ordering (CATASTROPHIC — top sessions monopolize) +4de8a32 20260328_124544 0.6300 0.6475 342 525 0.6250 0.4444 1.0000 0.8333 0.8571 0.5000 Exp 23: temporal fallback only clean baseline (NEW BEST 63%, temporal 50%) +4de8a32 20260328_104624 0.5700 0.6558 4586 7547 0.6250 0.3704 0.9091 0.3333 0.8571 0.5000 Exp 17: query decomposition (temporal 50% RECORD, but hurts simple categories)