From 4696bd7c76e79b1eaeac97a6ce0aa5bb0112a620 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Tue, 7 Apr 2026 23:27:11 +0700 Subject: [PATCH 1/9] feat: Implement LocalBackend --- packages/contexto/package.json | 3 + packages/contexto/src/index.ts | 38 ++++- packages/contexto/src/local/backend.ts | 125 +++++++++++++++ packages/contexto/src/local/index.ts | 3 + packages/contexto/src/local/summarizer.ts | 179 ++++++++++++++++++++++ packages/contexto/src/local/types.ts | 34 ++++ packages/contexto/src/types.ts | 2 + pnpm-lock.yaml | 10 ++ 8 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 packages/contexto/src/local/backend.ts create mode 100644 packages/contexto/src/local/index.ts create mode 100644 packages/contexto/src/local/summarizer.ts create mode 100644 packages/contexto/src/local/types.ts diff --git a/packages/contexto/package.json b/packages/contexto/package.json index 48b9682..6afe562 100644 --- a/packages/contexto/package.json +++ b/packages/contexto/package.json @@ -31,6 +31,9 @@ "scripts": { "build": "tsc --noEmit" }, + "dependencies": { + "@ekai/mindmap": "^0.1.8" + }, "peerDependencies": { "openclaw": "*" }, diff --git a/packages/contexto/src/index.ts b/packages/contexto/src/index.ts index 642c969..86c2417 100644 --- a/packages/contexto/src/index.ts +++ b/packages/contexto/src/index.ts @@ -1,10 +1,12 @@ import type { PluginConfig } from './types.js'; import { RemoteBackend } from './client.js'; +import { LocalBackend } from './local/index.js'; import { createContextEngine } from './engine/index.js'; -// Public API — use ContextoBackend to implement a custom (e.g. local) backend export type { ContextoBackend, SearchResult, WebhookPayload, Logger } from './types.js'; export { RemoteBackend } from './client.js'; +export { LocalBackend } from './local/index.js'; +export type { LocalBackendConfig, EpisodeSummary } from './local/index.js'; /** OpenClaw plugin definition. 
*/ export default { @@ -20,15 +22,21 @@ export default { maxContextChars: { type: 'number' }, compactThreshold: { type: 'number', default: 0.50 }, compactionStrategy: { type: 'string', default: 'default' }, + backend: { type: 'string', default: 'remote' }, + storagePath: { type: 'string' }, }, }, register(api: any) { const strategy = api.pluginConfig?.compactionStrategy ?? 'default'; + const backendMode = api.pluginConfig?.backend ?? 'remote'; + const base = { apiKey: api.pluginConfig?.apiKey, contextEnabled: api.pluginConfig?.contextEnabled ?? true, maxContextChars: api.pluginConfig?.maxContextChars, + backend: backendMode as 'remote' | 'local', + storagePath: api.pluginConfig?.storagePath, }; const config: PluginConfig = strategy === 'default' @@ -41,6 +49,34 @@ export default { const logger = api.logger; + if (backendMode === 'local') { + // Resolve provider and apiKey from OpenClaw runtime defaults + const defaults = api.runtime?.agent?.defaults; + const provider = defaults?.provider ?? 'openrouter'; + const apiKey = api.pluginConfig?.apiKey ?? 
defaults?.apiKey; + + if (!apiKey) { + // TODO: resolve from runtime modelAuth as a fallback; until then, warn and skip registration + logger.warn('[contexto] No apiKey available for local backend — provide apiKey in plugin config or ensure OpenClaw runtime has a configured provider'); + return; + } + + // Set apiKey to a truthy value so engine guards (if (!this.config.apiKey)) pass + config.apiKey = config.apiKey || 'local'; + + const backend = new LocalBackend({ + provider, + apiKey, + storagePath: config.storagePath, + }, logger); + + const engine = createContextEngine(config, backend, logger); + api.registerContextEngine('contexto', () => engine); + logger.info(`[contexto] Plugin registered with local backend (provider: ${provider}, contextEnabled: ${config.contextEnabled})`); + return; + } + + // Remote backend (default) if (!config.apiKey) { logger.warn('[contexto] Missing apiKey — ingestion and retrieval will be disabled'); return; diff --git a/packages/contexto/src/local/backend.ts b/packages/contexto/src/local/backend.ts new file mode 100644 index 0000000..367eea9 --- /dev/null +++ b/packages/contexto/src/local/backend.ts @@ -0,0 +1,125 @@ +import { Mindmap, jsonFileStorage, memoryStorage } from '@ekai/mindmap'; +import type { MindmapStorage } from '@ekai/mindmap'; +import type { ContextoBackend, Logger, SearchResult, WebhookPayload } from '../types.js'; +import type { LocalBackendConfig } from './types.js'; +import { extractEpisodeText, summarizeEpisode } from './summarizer.js'; + +const DEFAULT_STORAGE_PATH = '.contexto/mindmap.json'; + +/** ContextoBackend implementation that runs the full pipeline locally. */ +export class LocalBackend implements ContextoBackend { + private mindmap: Mindmap; + private config: LocalBackendConfig; + private logger: Logger; + + constructor(config: LocalBackendConfig, logger: Logger) { + this.config = config; + this.logger = logger; + + const storage: MindmapStorage = config.storage + ?? jsonFileStorage(config.storagePath ?? 
DEFAULT_STORAGE_PATH); + + this.mindmap = new Mindmap({ + provider: config.provider, + apiKey: config.apiKey, + embedModel: config.embedModel, + storage, + config: config.mindmapConfig, + }); + } + + async ingest(payload: WebhookPayload | WebhookPayload[]): Promise { + const payloads = Array.isArray(payload) ? payload : [payload]; + if (payloads.length === 0) return; + + // Filter to episode/combined events only + const episodes = payloads.filter( + (p) => p.event.type === 'episode' && p.event.action === 'combined', + ); + + if (episodes.length === 0) { + this.logger.debug('[contexto:local] No episode/combined events to ingest'); + return; + } + + try { + const items: Array<{ id: string; role: string; content: string; timestamp?: string; metadata?: Record }> = []; + + for (const ep of episodes) { + const text = extractEpisodeText(ep); + if (!text) { + this.logger.debug('[contexto:local] Empty episode text, skipping'); + continue; + } + + const traceRef = crypto.randomUUID(); + const summary = await summarizeEpisode(text, { + provider: this.config.provider, + apiKey: this.config.apiKey, + model: this.config.llmModel, + }, this.logger); + + // Compose content: summary + key findings as bullets (matches remote API format) + const contentParts = [summary.summary]; + if (summary.key_findings.length > 0) { + contentParts.push(`\nKey findings:\n${summary.key_findings.map((f) => `- ${f}`).join('\n')}`); + } + + const episodeData = ep.data as Record | undefined; + + items.push({ + id: crypto.randomUUID(), + role: 'assistant', + content: contentParts.join('\n'), + timestamp: ep.timestamp ?? new Date().toISOString(), + metadata: { + source: 'summary', + status: summary.status, + evidence_refs: summary.evidence_refs, + open_questions: summary.open_questions, + confidence: summary.confidence, + trace_ref: traceRef, + sessionKey: ep.sessionKey, + episode: { + userMessage: episodeData?.userMessage, + assistantMessages: episodeData?.assistantMessages ?? 
[], + toolMessages: episodeData?.toolMessages ?? [], + }, + }, + }); + } + + if (items.length > 0) { + await this.mindmap.add(items); + this.logger.info(`[contexto:local] Ingested ${items.length} episode(s) into mindmap`); + } + } catch (err) { + this.logger.warn(`[contexto:local] Ingest failed: ${err instanceof Error ? err.message : String(err)}`); + } + } + + async search( + query: string, + maxResults: number, + filter?: Record, + minScore?: number, + ): Promise { + try { + const result = await this.mindmap.search(query, { + maxResults, + filter, + minScore, + }); + + if (!result.items.length) return null; + + return { + items: result.items, + paths: result.paths, + }; + } catch (err) { + this.logger.warn(`[contexto:local] Search failed: ${err instanceof Error ? err.message : String(err)}`); + return null; + } + } +} diff --git a/packages/contexto/src/local/index.ts b/packages/contexto/src/local/index.ts new file mode 100644 index 0000000..a7948ce --- /dev/null +++ b/packages/contexto/src/local/index.ts @@ -0,0 +1,3 @@ +export { LocalBackend } from './backend.js'; +export type { LocalBackendConfig, EpisodeSummary, EvidenceRef, EvidenceRefType, LLMProviderConfig } from './types.js'; +export { extractEpisodeText, summarizeEpisode } from './summarizer.js'; diff --git a/packages/contexto/src/local/summarizer.ts b/packages/contexto/src/local/summarizer.ts new file mode 100644 index 0000000..b4a0346 --- /dev/null +++ b/packages/contexto/src/local/summarizer.ts @@ -0,0 +1,179 @@ +import type { WebhookPayload, ContentBlock, Logger } from '../types.js'; +import { stripMetadataEnvelope } from '../helpers.js'; +import type { EpisodeSummary, LLMProviderConfig } from './types.js'; + +const LLM_PROVIDERS: Record = { + openrouter: { + baseUrl: 'https://openrouter.ai/api/v1', + defaultModel: 'openai/gpt-4o-mini', + }, + openai: { + baseUrl: 'https://api.openai.com/v1', + defaultModel: 'gpt-4o-mini', + }, +}; + +const SUMMARIZE_SYSTEM_PROMPT = `You are a concise summarizer. 
Given a conversation episode (user question + assistant answer + tool outputs), produce a JSON object with exactly these fields: + +{ + "status": "complete" | "partial" | "blocked", + "summary": "", + "key_findings": ["", "", ...], + "evidence_refs": [{"type": "", "value": ""}], + "open_questions": [""], + "confidence": <0.0 to 1.0> +} + +Rules: +- Set status to "complete" if the episode fully resolved the user's request, "partial" if only partly, "blocked" if unable to proceed. +- summary should be 1-3 sentences capturing the essence. +- key_findings should have at least one entry. +- evidence_refs should reference relevant tools, files, or episodes mentioned. +- Respond ONLY with valid JSON, no markdown fences, no extra text.`; + +/** Extract text content from a message, handling both string and ContentBlock[] formats. */ +function extractMessageText(message: any): string { + if (!message) return ''; + const content = message.content; + if (typeof content === 'string') return content; + if (Array.isArray(content)) { + return (content as ContentBlock[]) + .filter((block) => block.type === 'text' && block.text) + .map((block) => block.text) + .join('\n'); + } + return ''; +} + +/** + * Extract combined text from an episode/combined WebhookPayload. + * Returns empty string for non-episode events. + */ +export function extractEpisodeText(payload: WebhookPayload): string { + if (payload.event.type !== 'episode' || payload.event.action !== 'combined') { + return ''; + } + + const data = payload.data as Record | undefined; + if (!data) return ''; + + const parts: string[] = []; + + // User message — strip OpenClaw metadata envelope + const userText = extractMessageText(data.userMessage); + if (userText) { + parts.push(`Q: ${stripMetadataEnvelope(userText)}`); + } + + // Assistant messages (drop api/usage/model metadata) + const assistantMessages = Array.isArray(data.assistantMessages) ? 
data.assistantMessages : []; + for (const msg of assistantMessages) { + const text = extractMessageText(msg); + if (text) parts.push(`A: ${text}`); + } + + return parts.join('\n'); +} + +/** + * Summarize episode text via an LLM call. + * Returns a graceful fallback on any failure. + */ +export async function summarizeEpisode( + text: string, + config: LLMProviderConfig, + logger: Logger, +): Promise { + const providerDef = LLM_PROVIDERS[config.provider]; + if (!providerDef) { + logger.warn(`[contexto:local] Unknown LLM provider: ${config.provider}, using fallback summary`); + return buildFallback(text); + } + + const model = config.model ?? providerDef.defaultModel; + const url = `${providerDef.baseUrl}/chat/completions`; + + try { + const response = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${config.apiKey}`, + }, + body: JSON.stringify({ + model, + temperature: 0.2, + response_format: { type: 'json_object' }, + messages: [ + { role: 'system', content: SUMMARIZE_SYSTEM_PROMPT }, + { role: 'user', content: text }, + ], + }), + }); + + if (!response.ok) { + const body = await response.text().catch(() => '(no body)'); + logger.warn(`[contexto:local] LLM summarize HTTP ${response.status}: ${body.slice(0, 200)}`); + return buildFallback(text); + } + + const json = await response.json() as any; + const raw = json.choices?.[0]?.message?.content; + if (!raw) { + logger.warn('[contexto:local] LLM returned no content, using fallback summary'); + return buildFallback(text); + } + + return parseSummary(raw, text, logger); + } catch (err) { + logger.warn(`[contexto:local] LLM summarize failed: ${err instanceof Error ? err.message : String(err)}`); + return buildFallback(text); + } +} + +/** Parse and validate LLM JSON response into EpisodeSummary, with graceful degradation. 
*/ +function parseSummary(raw: string, originalText: string, logger: Logger): EpisodeSummary { + try { + const parsed = JSON.parse(raw); + + const summary = typeof parsed.summary === 'string' && parsed.summary + ? parsed.summary + : originalText.slice(0, 200); + + const key_findings = Array.isArray(parsed.key_findings) && parsed.key_findings.length > 0 + ? parsed.key_findings.map(String) + : ['Episode processed']; + + const status = ['complete', 'partial', 'blocked'].includes(parsed.status) + ? parsed.status as EpisodeSummary['status'] + : 'partial'; + + const confidence = typeof parsed.confidence === 'number' && parsed.confidence >= 0 && parsed.confidence <= 1 + ? parsed.confidence + : 0.5; + + const evidence_refs = Array.isArray(parsed.evidence_refs) + ? parsed.evidence_refs.filter((r: any) => r && typeof r.type === 'string' && typeof r.value === 'string') + : []; + + const open_questions = Array.isArray(parsed.open_questions) + ? parsed.open_questions.filter((q: any) => typeof q === 'string') + : undefined; + + return { summary, key_findings, status, confidence, evidence_refs, open_questions }; + } catch (err) { + logger.warn(`[contexto:local] Failed to parse LLM summary JSON: ${err instanceof Error ? err.message : String(err)}`); + return buildFallback(originalText); + } +} + +/** Build a fallback EpisodeSummary from raw text when LLM call or parsing fails. */ +function buildFallback(text: string): EpisodeSummary { + return { + summary: text.slice(0, 200) + (text.length > 200 ? '...' 
: ''), + key_findings: ['Episode processed (fallback — LLM summarization unavailable)'], + status: 'partial', + confidence: 0.0, + evidence_refs: [], + }; +} diff --git a/packages/contexto/src/local/types.ts b/packages/contexto/src/local/types.ts new file mode 100644 index 0000000..08c3180 --- /dev/null +++ b/packages/contexto/src/local/types.ts @@ -0,0 +1,34 @@ +import type { MindmapStorage } from '@ekai/mindmap'; +import type { MindmapConfig } from '@ekai/mindmap'; + +export type EvidenceRefType = 'episode_ref' | 'tool_ref' | 'file_ref' | 'trace_ref'; + +export interface EvidenceRef { + type: EvidenceRefType; + value: string; +} + +export interface EpisodeSummary { + summary: string; + key_findings: string[]; + status: 'complete' | 'partial' | 'blocked'; + confidence: number; + evidence_refs: EvidenceRef[]; + open_questions?: string[]; +} + +export interface LocalBackendConfig { + provider: 'openrouter' | 'openai'; + apiKey: string; + embedModel?: string; + llmModel?: string; + storagePath?: string; + storage?: MindmapStorage; + mindmapConfig?: Partial; +} + +export interface LLMProviderConfig { + provider: 'openrouter' | 'openai'; + apiKey: string; + model?: string; +} diff --git a/packages/contexto/src/types.ts b/packages/contexto/src/types.ts index 57dfd98..cfa6704 100644 --- a/packages/contexto/src/types.ts +++ b/packages/contexto/src/types.ts @@ -4,6 +4,8 @@ export interface BaseConfig { maxContextChars?: number; minScore?: number; filter?: Record; + backend?: 'remote' | 'local'; + storagePath?: string; } export interface DefaultConfig extends BaseConfig { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 63a8c86..86c5e35 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -41,6 +41,9 @@ importers: packages/contexto: dependencies: + '@ekai/mindmap': + specifier: ^0.1.8 + version: 0.1.8 openclaw: specifier: '*' version: 2026.4.5(@napi-rs/canvas@0.1.97) @@ -401,6 +404,9 @@ packages: '@ekai/memory@0.0.1': resolution: {integrity: 
sha512-bMdR8X6UDhlLmwm1mUFuuZQv6ZMl9G1gB0fcH+iHA2q8lSeCRDDBibn7b+k2OXfP1rTtlY8OAFnWLV0mXvfLKw==} + '@ekai/mindmap@0.1.8': + resolution: {integrity: sha512-7Af9ShJ8c4d80HAw5MR0j2ypIUQwag6lgt/s5Mywm2EIZrl2uDVdM6VmjT+U2SjYBc/cbyjSwiHRfb3x93GWVA==} + '@emnapi/core@1.9.1': resolution: {integrity: sha512-mukuNALVsoix/w1BJwFzwXBN/dHeejQtuVzcDsfOEsdpCumXb/E9j8w11h5S54tT1xhifGfbbSm/ICrObRb3KA==} @@ -6552,6 +6558,10 @@ snapshots: transitivePeerDependencies: - supports-color + '@ekai/mindmap@0.1.8': + dependencies: + ml-hclust: 4.0.0 + '@emnapi/core@1.9.1': dependencies: '@emnapi/wasi-threads': 1.2.0 From 586aea76f7f6987e2d9c4a0d52f24646061c87e6 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 16:34:22 +0700 Subject: [PATCH 2/9] fix mindmap path --- packages/contexto/src/index.ts | 3 --- packages/contexto/src/local/backend.ts | 10 +++++----- packages/contexto/src/local/types.ts | 1 - packages/contexto/src/types.ts | 1 - 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/packages/contexto/src/index.ts b/packages/contexto/src/index.ts index 86c2417..14954dd 100644 --- a/packages/contexto/src/index.ts +++ b/packages/contexto/src/index.ts @@ -23,7 +23,6 @@ export default { compactThreshold: { type: 'number', default: 0.50 }, compactionStrategy: { type: 'string', default: 'default' }, backend: { type: 'string', default: 'remote' }, - storagePath: { type: 'string' }, }, }, @@ -36,7 +35,6 @@ export default { contextEnabled: api.pluginConfig?.contextEnabled ?? 
true, maxContextChars: api.pluginConfig?.maxContextChars, backend: backendMode as 'remote' | 'local', - storagePath: api.pluginConfig?.storagePath, }; const config: PluginConfig = strategy === 'default' @@ -67,7 +65,6 @@ export default { const backend = new LocalBackend({ provider, apiKey, - storagePath: config.storagePath, }, logger); const engine = createContextEngine(config, backend, logger); diff --git a/packages/contexto/src/local/backend.ts b/packages/contexto/src/local/backend.ts index 367eea9..698332e 100644 --- a/packages/contexto/src/local/backend.ts +++ b/packages/contexto/src/local/backend.ts @@ -1,10 +1,11 @@ -import { Mindmap, jsonFileStorage, memoryStorage } from '@ekai/mindmap'; -import type { MindmapStorage } from '@ekai/mindmap'; +import { homedir } from 'node:os'; +import { join } from 'node:path'; +import { Mindmap, jsonFileStorage } from '@ekai/mindmap'; import type { ContextoBackend, Logger, SearchResult, WebhookPayload } from '../types.js'; import type { LocalBackendConfig } from './types.js'; import { extractEpisodeText, summarizeEpisode } from './summarizer.js'; -const DEFAULT_STORAGE_PATH = '.contexto/mindmap.json'; +const STORAGE_PATH = join(homedir(), '.openclaw', 'data', 'contexto', 'mindmap.json'); /** ContextoBackend implementation that runs the full pipeline locally. */ export class LocalBackend implements ContextoBackend { @@ -16,8 +17,7 @@ export class LocalBackend implements ContextoBackend { this.config = config; this.logger = logger; - const storage: MindmapStorage = config.storage - ?? jsonFileStorage(config.storagePath ?? DEFAULT_STORAGE_PATH); + const storage = config.storage ?? 
jsonFileStorage(STORAGE_PATH); this.mindmap = new Mindmap({ provider: config.provider, diff --git a/packages/contexto/src/local/types.ts b/packages/contexto/src/local/types.ts index 08c3180..5bf5fdb 100644 --- a/packages/contexto/src/local/types.ts +++ b/packages/contexto/src/local/types.ts @@ -22,7 +22,6 @@ export interface LocalBackendConfig { apiKey: string; embedModel?: string; llmModel?: string; - storagePath?: string; storage?: MindmapStorage; mindmapConfig?: Partial; } diff --git a/packages/contexto/src/types.ts b/packages/contexto/src/types.ts index cfa6704..ab127ee 100644 --- a/packages/contexto/src/types.ts +++ b/packages/contexto/src/types.ts @@ -5,7 +5,6 @@ export interface BaseConfig { minScore?: number; filter?: Record; backend?: 'remote' | 'local'; - storagePath?: string; } export interface DefaultConfig extends BaseConfig { From d61740033d3d820efed9d1243e378f72ed9cdf4c Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 16:40:19 +0700 Subject: [PATCH 3/9] Update release config --- release.config.cjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/release.config.cjs b/release.config.cjs index eb486f7..ad0b767 100644 --- a/release.config.cjs +++ b/release.config.cjs @@ -4,7 +4,8 @@ module.exports = { ['@semantic-release/commit-analyzer', { preset: 'conventionalcommits', releaseRules: [ - { type: 'feat', release: 'minor' }, + { breaking: true, release: 'minor' }, + { type: 'feat', release: 'patch' }, { type: 'fix', release: 'patch' }, { type: 'perf', release: 'patch' }, { type: 'refactor', release: 'patch' }, From 6330c55caa38ca72bb56eb2e6f22d7135cf01310 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 16:45:03 +0700 Subject: [PATCH 4/9] Update openclaw.plugin.json --- docs/why-agnes.md | 397 +++++++++++++++++++++++++ packages/contexto/openclaw.plugin.json | 5 + 2 files changed, 402 insertions(+) create mode 100644 docs/why-agnes.md diff --git a/docs/why-agnes.md b/docs/why-agnes.md new file mode 100644 
index 0000000..22f324f --- /dev/null +++ b/docs/why-agnes.md @@ -0,0 +1,397 @@ +# Why AGNES: Hierarchical Clustering for Agent Context + +## The Problem + +Every AI memory system faces the same fundamental question: **how do you organize memories so retrieval is fast, relevant, and context-aware?** + +Most systems default to flat vector search — embed everything, store it in a vector DB, and retrieve the top-k nearest neighbors. This works for simple lookups, but breaks down for agent context management where you need: + +- **Multi-resolution retrieval** — sometimes you need a broad topic summary, sometimes a specific detail +- **Token-budget-aware retrieval** — you can't just return top-k; you need to fill a fixed context window efficiently +- **Semantic organization** — related memories should be grouped, not scattered across an unstructured index +- **No pre-defined categories** — agent conversations span unpredictable topics; you can't hardcode a taxonomy + +## What Others Do + +We surveyed the major AI memory systems to understand the landscape: + +### Mem0 + +Uses **pluggable vector stores** (Qdrant, pgvector, Pinecone, etc.) with HNSW or DiskANN indexing. Memories are stored as **flat atomic facts** with an optional Neo4j/Kuzu knowledge graph layer. Consolidation is LLM-driven — on each insert, an LLM decides whether to ADD, UPDATE, DELETE, or NOOP against existing memories. + +**Structure**: Flat facts + optional entity graph. No clustering. + +### Supermemory + +Built on **pgvector** with hybrid search (semantic + fact retrieval). Memories are flat atomic facts scoped by projects. Uses Anthropic's Contextual Retrieval technique for chunking. Supports contradiction resolution and stale information expiration. + +**Structure**: Flat facts + document chunks. No clustering. + +### Zep (Graphiti) + +The most sophisticated graph-based approach. 
Uses a **temporal knowledge graph** backed by Neo4j with triple parallel retrieval: cosine similarity, BM25 full-text search, and breadth-first graph traversal. Entities are clustered into communities via **dynamic label propagation** — the closest thing to clustering in this space. + +**Structure**: Temporal knowledge graph with flat community clustering. Not hierarchical. + +### LangChain / LangGraph Memory + +Stores memories as **JSON documents** in developer-defined namespace hierarchies (e.g., `("user", "123", "memories")`). Backed by pgvector (HNSW/IVFFlat) or SQLite with sqlite-vec. Consolidation uses a memory enrichment process that balances creation vs. update. + +**Structure**: Flat key-value store with static namespace organization. No dynamic clustering. + +### Letta (formerly MemGPT) + +Models memory after **OS memory management** with three tiers: core memory (in-context, agent-editable), recall memory (conversation log), and archival memory (vector DB — ChromaDB or pgvector). The agent self-manages what goes where via tool calls. Context window overflow triggers conversation summarization. + +**Structure**: Fixed 3-tier architecture. No clustering within tiers. 
+ +### Summary + +| System | Index | Memory Structure | Clustering | Consolidation | +|--------|-------|-----------------|------------|---------------| +| **Mem0** | HNSW / DiskANN | Flat facts + knowledge graph | None | LLM-driven ADD/UPDATE/DELETE | +| **Supermemory** | pgvector | Flat facts + doc chunks | None | Contradiction resolution + expiry | +| **Zep** | Neo4j vectors + BM25 + BFS | Temporal knowledge graph | Label propagation (flat) | Temporal invalidation + LLM dedup | +| **LangGraph** | HNSW / IVFFlat | JSON docs in static namespaces | None | Memory enrichment + TTL | +| **Letta** | ChromaDB (HNSW) / pgvector | 3-tier: core / recall / archival | None | Summarization + agent self-editing | +| **Contexto** | **AGNES dendrogram** | **Dynamic semantic hierarchy** | **Hierarchical (agglomerative)** | Algorithmic + incremental centroid | + +**Key finding**: No major AI memory system uses hierarchical clustering. All rely on flat similarity search, with Zep being the only one adding flat community detection via label propagation. Memory consolidation across the board is LLM-driven rather than algorithmic. + +--- + +## Why Hierarchical Clustering? + +### Over flat ANN (HNSW, IVFFlat) + +Approximate Nearest Neighbor indices like HNSW are excellent for raw retrieval speed. They're what most vector databases use internally. 
But they solve a different problem: + +| | Flat ANN (HNSW) | Hierarchical Clustering | +|---|---|---| +| **Query type** | "Find the k most similar items" | "Find all items in the most relevant semantic branch" | +| **Structure** | Flat index, no organization | Semantic tree with labeled clusters | +| **Token budgeting** | Return top-k, hope it fits | Beam search fills budget by exploring branches | +| **Multi-resolution** | No — always item-level | Yes — can retrieve at cluster or item level | +| **Interpretability** | Opaque index | Labeled tree (travel → Japan trip → visa documents) | +| **Pruning** | Must score candidates | Prune entire branches by centroid similarity | + +HNSW answers "what's similar?" — hierarchical clustering answers "what's this about, and what details matter?" + +For agent context, the second question is more useful. You don't just want the 7 most similar messages; you want the most relevant *topic branches* packed into your token budget. + +### Over K-Means + +K-means is the default clustering algorithm most engineers reach for. If you're going to cluster, why not the simpler algorithm? Because k-means produces flat partitions; hierarchical clustering produces a dendrogram — a tree you can cut at any level. + +#### 1. You must specify k upfront + +K-means requires choosing the number of clusters before running. Agent conversations are open-ended — a session might touch 3 topics or 30. Choosing k=5 when there are 15 natural topics merges unrelated episodes; choosing k=15 when there are only 3 natural topics breaks coherent topics apart into noise. + +Hierarchical clustering discovers the natural cluster count by cutting the dendrogram at a similarity threshold. No k needed. + +#### 2. Spherical cluster assumption + +K-means assumes clusters are roughly spherical and equally sized in embedding space. Conversational topics are neither — a 50-message thread and a 3-message question are both valid clusters but wildly different in density. 
+ +Hierarchical clustering with average linkage handles variable-density clusters naturally because it merges based on average pairwise distance, not distance to a centroid. + +#### 3. No hierarchy + +K-means produces a flat partition. You get k buckets with no relationship between them. This means: + +- No multi-resolution retrieval (can't zoom in/out) +- No branch pruning during search (must check all k centroids) +- No interpretable structure (which clusters are subtopics of which?) + +Hierarchical clustering produces a dendrogram — a full hierarchy from individual items up to a single root. We cut it at our similarity threshold to get the right granularity, but the hierarchy is preserved for navigation. + +#### 4. Instability + +K-means results depend on random initialization. Run it twice on the same data and you may get different clusters. This is unacceptable for a context system where users expect consistent retrieval behavior. + +Hierarchical clustering is deterministic — same data, same tree, every time. + +#### 5. No incremental updates + +Standard k-means requires re-running on the full dataset when new items arrive. Mini-batch k-means exists but still shifts centroids unpredictably. + +Our implementation supports **greedy incremental insertion** — new items walk the existing tree and slot into the best-matching branch in O(log N) time, with O(d) centroid updates. + +--- + +## Why AGNES Specifically? + +Among hierarchical clustering algorithms, we chose [AGNES (Agglomerative Nesting)](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470316801) (Kaufman & Rousseeuw, 1990) with [average linkage (UPGMA)](https://www.semanticscholar.org/paper/A-statistical-method-for-evaluating-systematic-Sokal-Michener/0db093335bc3b9445fa5a1a5526d634921d7b59a) (Sokal & Michener, 1958) for specific reasons: + +### Agglomerative vs. Divisive + +- **AGNES (bottom-up)**: Start with individual items, merge the closest pairs upward. 
Produces fine-grained leaf clusters that accurately reflect local similarity. +- **DIANA (top-down)**: Start with everything in one cluster, split recursively. Better for finding large-scale structure but can make poor early splits that propagate. + +For agent context, **local accuracy matters more than global structure**. A user asking about "OAuth token refresh" needs the system to find the tight cluster of auth-related messages, not a broad "security" supercluster. AGNES gets this right because it builds from the leaves up. + +### Why Average Linkage (UPGMA)? + +The linkage method determines how inter-cluster distance is measured during merges: + +| Linkage | Distance Between Clusters | Behavior | +|---------|--------------------------|----------| +| **Single** | Minimum pairwise distance | Chaining — long, stringy clusters. One tangentially related message pulls two unrelated topics together | +| **Complete** | Maximum pairwise distance | Compact but tends to split natural clusters. A single outlier message forces a premature split | +| **Average (UPGMA)** | Mean of all pairwise distances | Balanced — cohesive clusters that tolerate some variance without chaining | +| **Ward's** | Minimizes total within-cluster variance | Prefers equal-sized clusters — not ideal when one topic has 50 messages and another has 3 | + +Average linkage is the right trade-off for conversational data where topic sizes vary naturally and you want cohesive-but-not-rigid groupings. 
+ +--- + +## How It Works in Contexto + +### Building the Tree + +``` +Conversation items (with embeddings) + ↓ +Pairwise cosine distance matrix + ↓ +AGNES with average linkage (via ml-hclust) + ↓ +Dendrogram + ↓ +Cut at similarity threshold (0.65) + ↓ +ClusterNode tree (max depth 4) +``` + +Each `ClusterNode` stores: +- **Centroid** — average embedding of all items in the subtree +- **Label** — auto-generated from most representative keywords +- **Items** — leaf-level conversation items +- **Children** — sub-clusters + +The result is a navigable semantic hierarchy. Each leaf is an **episode** — a full conversation turn containing the user message, assistant response, and any tool outputs bundled together. Consider an AI assistant used across weeks of varied conversations — the tree self-organizes from these episodes, regardless of domain: + +``` +root (Knowledge) +├── [travel planning & logistics] ← 19 episodes, depth 1 +│ ├── [Japan trip itinerary] ← 8 episodes, depth 2 +│ │ ├── Episode: User asked about 3-week Japan route +│ │ │ → Assistant planned Tokyo → Kyoto → Osaka → Hiroshima +│ │ │ with transit times and suggested stays per city +│ │ ├── Episode: User compared JR Pass options +│ │ │ → Assistant broke down 21-day JR Pass vs individual +│ │ │ shinkansen tickets with cost analysis +│ │ ├── Episode: User asked for ryokan recommendations +│ │ │ → Assistant suggested 3 ryokans near Arashiyama +│ │ │ with prices, booking links, and onsen details +│ │ └── Episode: User asked about vegetarian dining in Shibuya +│ │ → Assistant listed 5 restaurants with menus, price +│ │ ranges, and reservation requirements +│ ├── [visa & travel documents] ← 6 episodes, depth 2 +│ │ ├── Episode: User asked about passport renewal timeline +│ │ │ → Assistant calculated 6-week processing, deadline March 10 +│ │ ├── Episode: User checked Japan visa requirements +│ │ │ → Assistant confirmed visa waiver for US citizens, +│ │ │ 90-day max, entry requirements and customs forms +│ │ └── 
Episode: User compared travel insurance providers +│ │ → Assistant compared World Nomads vs SafetyWing +│ │ coverage, deductibles, and adventure sports policy +│ └── [packing & budget] ← 5 episodes, depth 2 +│ ├── Episode: User asked for full trip budget estimate +│ │ → Assistant built spreadsheet: flights $1200, rail $450, +│ │ accommodation $1800, food $500, activities $250 = $4200 +│ └── Episode: User asked about connectivity options +│ → Assistant compared portable WiFi rental vs eSIM +│ providers with coverage maps and daily costs +│ +├── [health & fitness] ← 15 episodes, depth 1 +│ ├── [running training plan] ← 7 episodes, depth 2 +│ │ ├── Episode: User set goal for half marathon in 10 weeks +│ │ │ → Assistant built progressive plan from 15 mi/week base +│ │ │ with tempo, interval, and long run schedule +│ │ ├── Episode: User asked about tempo run pacing +│ │ │ → Assistant calculated 8:30/mile threshold pace from +│ │ │ recent 5K time, explained RPE and heart rate zones +│ │ ├── Episode: User asked about race-day nutrition +│ │ │ → Assistant recommended gel every 45 min after mile 6, +│ │ │ electrolyte strategy, and pre-race meal timing +│ │ └── Episode: User reported hip flexor tightness +│ │ → Assistant suggested 4 stretches, foam rolling routine, +│ │ and when to reduce training load vs push through +│ ├── [meal planning & nutrition] ← 5 episodes, depth 2 +│ │ ├── Episode: User asked for high-protein vegetarian meals +│ │ │ → Assistant created 5-day meal plan with macros: +│ │ │ lentil soup, quinoa bowls, tofu stir-fry, tempeh tacos +│ │ └── Episode: User concerned about iron on plant-based diet +│ │ → Assistant explained iron + B12 supplementation, +│ │ food pairing for absorption, and blood test schedule +│ └── [sleep & recovery] ← 3 episodes, depth 2 +│ ├── Episode: User reported sleep dropping to 6hrs +│ │ → Assistant suggested magnesium glycinate dosage, +│ │ sleep hygiene adjustments, and training load reduction +│ └── Episode: User asked about 
recovery on rest days +│ → Assistant outlined active recovery protocol: +│ light walking, stretching, hydration targets +│ +├── [career & job search] ← 14 episodes, depth 1 +│ ├── [resume & applications] ← 6 episodes, depth 2 +│ │ ├── Episode: User asked to rewrite resume for PM roles +│ │ │ → Assistant rewrote 4 bullet points with quantified impact: +│ │ │ "grew DAU 40% in 6 months" instead of "managed growth" +│ │ └── Episode: User drafted cover letter for Stripe +│ │ → Assistant tailored letter emphasizing payments experience, +│ │ API design background, and growth metrics +│ ├── [interview preparation] ← 5 episodes, depth 2 +│ │ ├── Episode: User practiced behavioral questions +│ │ │ → Assistant walked through STAR format with 3 example +│ │ │ answers tailored to user's payments experience +│ │ └── Episode: User asked for system design practice +│ │ → Assistant ran mock interview: "design a notification +│ │ service at scale" with follow-up questions and feedback +│ └── [salary & negotiation] ← 3 episodes, depth 2 +│ ├── Episode: User asked about SF PM compensation +│ │ → Assistant provided market data ($180-220k base + equity), +│ │ negotiation scripts, and counter-offer strategy +│ └── Episode: User received initial offer +│ → Assistant analyzed offer breakdown, suggested countering +│ at 15% above base with equity acceleration ask +│ +├── [home improvement & DIY] ← 11 episodes, depth 1 +│ ├── [kitchen renovation] ← 6 episodes, depth 2 +│ │ ├── Episode: User compared countertop materials +│ │ │ → Assistant analyzed quartz vs granite on cost, porosity, +│ │ │ heat resistance, and long-term maintenance +│ │ ├── Episode: User shared contractor quote of $28k +│ │ │ → Assistant benchmarked against 120 sqft averages, +│ │ │ flagged missing line items, suggested counter questions +│ │ └── Episode: User asked about backsplash tile layout +│ │ → Assistant explained herringbone pattern needs 15% more +│ │ tiles than straight lay, provided calculation and diagram +│ ├── 
[smart home setup] ← 3 episodes, depth 2 +│ │ ├── Episode: User compared smart switch protocols +│ │ │ → Assistant compared Zigbee vs Z-Wave vs WiFi on range, +│ │ │ power draw, mesh reliability, and Home Assistant support +│ │ └── Episode: User asked about Home Assistant hardware +│ │ → Assistant compared Raspberry Pi vs mini PC on cost, +│ │ reliability, addon support, and power consumption +│ └── [garden & outdoor] ← 2 episodes, depth 2 +│ └── Episode: User planned raised bed vegetable garden +│ → Assistant recommended soil mix 1:1:1 topsoil:compost:perlite, +│ bed dimensions, and seasonal planting schedule +│ +├── [personal finance] ← 10 episodes, depth 1 +│ ├── [investment strategy] ← 5 episodes, depth 2 +│ │ ├── Episode: User asked about portfolio allocation +│ │ │ → Assistant explained three-fund approach (US total market, +│ │ │ international, bonds) with age-based allocation ratios +│ │ └── Episode: User asked about tax optimization +│ │ → Assistant explained Roth IRA $7000 limit, tax-loss +│ │ harvesting strategy, and wash sale rule timing +│ └── [budgeting & expenses] ← 5 episodes, depth 2 +│ ├── Episode: User asked for monthly budget breakdown +│ │ → Assistant categorized: rent $2400, food $600, transit $120, +│ │ subscriptions $85, discretionary $400 — identified savings gaps +│ └── Episode: User asked for emergency fund target +│ → Assistant calculated 6 months of expenses ($21k), +│ suggested high-yield savings accounts, and built timeline +│ +└── [learning & side projects] ← 8 episodes, depth 1 + ├── [Rust programming] ← 5 episodes, depth 2 + │ ├── Episode: User learning ownership and borrowing + │ │ → Assistant explained with examples, reframed compiler + │ │ errors as learning tool, built mental model from C++ analogues + │ ├── Episode: User building CLI for markdown-to-epub + │ │ → Assistant helped with clap argument parsing, file I/O + │ │ with std::fs, and epub chapter structure + │ └── Episode: User asked about async in Rust + │ → Assistant 
compared Tokio runtime vs std::thread for + │ file I/O workload, recommended std::thread for this case + └── [photography] ← 3 episodes, depth 2 + ├── Episode: User asked about street photography settings + │ → Assistant explained aperture priority mode, ISO auto + │ range, and zone focusing technique for candid shots + └── Episode: User wanted to create a film look preset + → Assistant walked through Lightroom adjustments: lift + blacks, warm highlights, fade curve, grain amount +``` + +Notice how the tree reflects the *natural topology of conversations* — not a pre-defined taxonomy. The agent never asked "what categories do you want?" — it discovered that Japan trip planning, visa logistics, and budgeting are related because the episode embeddings cluster in semantic space. A two-episode tangent about garden soil doesn't get its own top-level branch; it nests under home improvement where it belongs. The running training plan and meal planning land together under health, even though those conversations happened weeks apart. + +The structure also reveals cross-domain connections that flat retrieval would miss. When the user asks "what should I eat the week before my race while staying in budget?", beam search descends into `health → meal planning`, `health → running training`, and `personal finance → budgeting` simultaneously — surfacing full episodes from three branches in a single retrieval pass. The agent gets back not just isolated facts, but the complete conversational context: the meal plan with macros, the race nutrition strategy, and the monthly food budget. + +### Hybrid Rebuild Strategy + +Full AGNES is O(n^2) — fine for small trees, expensive at scale. 
We use a hybrid approach: + +| Condition | Strategy | Why | +|-----------|----------|-----| +| Total items < 100 | Full rebuild | Cheap enough, optimal structure | +| Inserts since rebuild >= 50 | Full rebuild | Accumulated drift from incremental inserts | +| Otherwise | Incremental insertion | O(log N) per item, keeps latency low | + +Incremental insertion walks the tree top-down, following the highest-similarity child at each level. If no child exceeds the threshold, a new sibling cluster is created. Centroids update in O(d): + +``` +newCentroid[i] = (oldCentroid[i] * oldCount + newVector[i]) / (oldCount + 1) +``` + +### Retrieval: Multi-Branch Beam Search + +The hierarchy enables [**beam search**](https://www.semanticscholar.org/paper/The-HARPY-speech-recognition-system-Lowerre/bdb3f20fe41bb95f6bc9d162e827de8db3f952d7) (Lowerre, 1976) — exploring multiple promising branches simultaneously: + +``` +Level 0: root +Level 1: [auth: 0.82] [deploy: 0.71] [testing: 0.58] ← keep top 3 +Level 2: [oauth: 0.85] [k8s: 0.68] [ci-cd: 0.61] ← expand & re-rank +Level 3: ... collect terminal items +``` + +This is fundamentally different from flat top-k retrieval: +- **Branch pruning** — skip irrelevant subtrees entirely +- **Token budgeting** — fill the context window by expanding branches until budget is exhausted +- **Path tracing** — know *why* an item was retrieved (e.g., `auth → oauth → token refresh`) + +--- + +## The Case for Hierarchical Clustering in Agent Context + +Agent context management has unique requirements that align with hierarchical clustering: + +**1. Conversations are inherently hierarchical.** A debugging session has subtopics (error diagnosis, fix attempts, verification). A planning session has phases (requirements, design, implementation). Flat storage discards this structure; a tree preserves it. + +**2. 
Token budgets demand intelligent packing.** You can't just return top-k similar items — you need to fill a fixed token window with the most relevant *coverage*. Beam search over a hierarchy naturally provides this by expanding the most promising branches until the budget is full. + +**3. Topics emerge dynamically.** Unlike document retrieval where categories are known upfront, agent conversations create new topics in real-time. AGNES discovers these topics algorithmically from the embedding space — no pre-defined taxonomy needed. + +**4. Deduplication is structural.** Items within the same cluster are semantically related by definition. This makes deduplication and consolidation natural — you don't need an LLM to decide if two memories overlap; the tree already groups them together. + +**5. Retrieval should be explainable.** When an agent injects recalled context, it helps to know the retrieval path (`deploy → k8s → helm config`). Hierarchical clustering provides this for free; flat vector search cannot. + +--- + +## Trade-offs and Limitations + +We're not claiming AGNES is universally superior. The trade-offs are real: + +| Aspect | AGNES | Flat Vector Search | +|--------|-------|--------------------| +| **Build cost** | O(n^2) full rebuild (periodic) | O(n log n) index build | +| **Between rebuilds** | O(log N) incremental insert | O(log n) insert | +| **Query latency** | O(beam * depth) | O(log n) with HNSW | +| **Memory overhead** | Tree + centroids + items | Index + items | +| **Scale** | Thousands to tens of thousands (incremental beyond) | Millions of items | +| **Best for** | Structured, budget-aware context retrieval | Raw similarity search at scale | + +Full AGNES rebuilds are O(n^2) — at 10k items that's ~100M distance computations (seconds), but at 100k items the distance matrix alone would need ~80GB of memory. In practice, full rebuilds are for smaller trees (thousands of items). 
Beyond that, the system relies on incremental insertion (O(log N) per item), which accumulates some structural drift but keeps latency low. The rebuild interval is configurable to balance structure quality vs. cost. + +AGNES is the right tool for **agent context windows** — token-budget-constrained, structure-aware retrieval at conversational scale. It is not trying to replace HNSW for million-scale document retrieval. + +--- + +## References + +- Kaufman, L. & Rousseeuw, P.J. (1990). [*Finding Groups in Data: An Introduction to Cluster Analysis*](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470316801), Chapter 5 (AGNES). Wiley Series in Probability and Statistics. +- Sokal, R.R. & Michener, C.D. (1958). [A statistical method for evaluating systematic relationships](https://www.semanticscholar.org/paper/A-statistical-method-for-evaluating-systematic-Sokal-Michener/0db093335bc3b9445fa5a1a5526d634921d7b59a). *University of Kansas Science Bulletin*, 38, 1409–1438. (UPGMA / average linkage) +- Lowerre, B.T. (1976). [*The HARPY Speech Recognition System*](https://www.semanticscholar.org/paper/The-HARPY-speech-recognition-system-Lowerre/bdb3f20fe41bb95f6bc9d162e827de8db3f952d7). PhD thesis, Carnegie Mellon University. (Origin of beam search) +- Salton, G., Wong, A. & Yang, C.S. (1975). [A vector space model for automatic indexing](https://dl.acm.org/doi/10.1145/361219.361220). *Communications of the ACM*, 18(11), 613–620. 
(Cosine similarity) +- [`ml-hclust`](https://github.com/mljs/hclust) — JavaScript hierarchical clustering library (MIT) diff --git a/packages/contexto/openclaw.plugin.json b/packages/contexto/openclaw.plugin.json index a740537..9be84a9 100644 --- a/packages/contexto/openclaw.plugin.json +++ b/packages/contexto/openclaw.plugin.json @@ -15,6 +15,11 @@ "maxContextChars": { "type": "number", "description": "Maximum characters of context to inject (default: 2000)" + }, + "backend": { + "type": "string", + "default": "remote", + "description": "Backend mode: 'remote' (hosted API) or 'local' (local pipeline with LLM summarization + mindmap)" } } } From 7f3878e7a1f8870d3dbd90476fbd2ad139d08280 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 16:49:17 +0700 Subject: [PATCH 5/9] Rename backend to mode in config --- packages/contexto/README.md | 19 ++++++++++++++++++- packages/contexto/openclaw.plugin.json | 4 ++-- packages/contexto/src/index.ts | 6 +++--- packages/contexto/src/types.ts | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/packages/contexto/README.md b/packages/contexto/README.md index a2762af..483e7c8 100644 --- a/packages/contexto/README.md +++ b/packages/contexto/README.md @@ -120,7 +120,24 @@ For the deeper technical reasoning: | Property | Type | Required | Description | | --- | --- | --- | --- | -| `apiKey` | string | Yes | Your Contexto API key | +| `apiKey` | string | Yes (remote) | Your Contexto API key | +| `mode` | string | No | `remote` (default) or `local` | + +### Remote mode (default) + +Uses the hosted Contexto API. Get an API key at [getcontexto.com](https://getcontexto.com/). + +```bash +openclaw config set plugins.entries.contexto.config.apiKey YOUR_KEY +``` + +### Local mode + +Runs the full pipeline locally: summarize via LLM, embed, cluster (AGNES), and persist to `~/.openclaw/data/contexto/mindmap.json`. Uses the OpenRouter or OpenAI API key configured in OpenClaw (OpenRouter is tried first) — no extra config needed.
+ +```bash +openclaw config set plugins.entries.contexto.config.mode local +``` ## Community diff --git a/packages/contexto/openclaw.plugin.json b/packages/contexto/openclaw.plugin.json index 9be84a9..e4af558 100644 --- a/packages/contexto/openclaw.plugin.json +++ b/packages/contexto/openclaw.plugin.json @@ -16,10 +16,10 @@ "type": "number", "description": "Maximum characters of context to inject (default: 2000)" }, - "backend": { + "mode": { "type": "string", "default": "remote", - "description": "Backend mode: 'remote' (hosted API) or 'local' (local pipeline with LLM summarization + mindmap)" + "description": "'remote' (hosted API) or 'local' (local pipeline with LLM summarization + mindmap)" } } } diff --git a/packages/contexto/src/index.ts b/packages/contexto/src/index.ts index 14954dd..dfda604 100644 --- a/packages/contexto/src/index.ts +++ b/packages/contexto/src/index.ts @@ -22,19 +22,19 @@ export default { maxContextChars: { type: 'number' }, compactThreshold: { type: 'number', default: 0.50 }, compactionStrategy: { type: 'string', default: 'default' }, - backend: { type: 'string', default: 'remote' }, + mode: { type: 'string', default: 'remote' }, }, }, register(api: any) { const strategy = api.pluginConfig?.compactionStrategy ?? 'default'; - const backendMode = api.pluginConfig?.backend ?? 'remote'; + const backendMode = api.pluginConfig?.mode ?? 'remote'; const base = { apiKey: api.pluginConfig?.apiKey, contextEnabled: api.pluginConfig?.contextEnabled ?? 
true, maxContextChars: api.pluginConfig?.maxContextChars, - backend: backendMode as 'remote' | 'local', + mode: backendMode as 'remote' | 'local', }; const config: PluginConfig = strategy === 'default' diff --git a/packages/contexto/src/types.ts b/packages/contexto/src/types.ts index ab127ee..5343b8e 100644 --- a/packages/contexto/src/types.ts +++ b/packages/contexto/src/types.ts @@ -4,7 +4,7 @@ export interface BaseConfig { maxContextChars?: number; minScore?: number; filter?: Record; - backend?: 'remote' | 'local'; + mode?: 'remote' | 'local'; } export interface DefaultConfig extends BaseConfig { From f910633d6f42e013f3f6546678bfe21aeff811bc Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 16:59:40 +0700 Subject: [PATCH 6/9] Debug --- packages/contexto/src/index.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/contexto/src/index.ts b/packages/contexto/src/index.ts index dfda604..5fa2e14 100644 --- a/packages/contexto/src/index.ts +++ b/packages/contexto/src/index.ts @@ -47,10 +47,15 @@ export default { const logger = api.logger; + logger.debug(`[contexto] pluginConfig: ${JSON.stringify(api.pluginConfig)}`); + if (backendMode === 'local') { // Resolve provider and apiKey from OpenClaw runtime defaults const defaults = api.runtime?.agent?.defaults; - const provider = defaults?.provider ?? 'openrouter'; + logger.info(`[contexto] runtime defaults: ${JSON.stringify(defaults)}`); + const SUPPORTED_PROVIDERS = new Set(['openrouter', 'openai']); + const rawProvider = defaults?.provider; + const provider = SUPPORTED_PROVIDERS.has(rawProvider) ? rawProvider : 'openrouter'; const apiKey = api.pluginConfig?.apiKey ?? 
defaults?.apiKey; if (!apiKey) { From 187e15b8f2a4831a82ef6f28ba99349580fc8a36 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 19:34:31 +0700 Subject: [PATCH 7/9] fix use resolveApiKeyForProvider --- packages/contexto/src/engine/base.ts | 6 +-- packages/contexto/src/index.ts | 63 +++++++++++++++------------- packages/contexto/src/types.ts | 2 +- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/packages/contexto/src/engine/base.ts b/packages/contexto/src/engine/base.ts index 2cf1925..6773f08 100644 --- a/packages/contexto/src/engine/base.ts +++ b/packages/contexto/src/engine/base.ts @@ -65,12 +65,12 @@ export abstract class AbstractContextEngine implements ContextEngine { if (tokenBudget != null) this.state.cachedTokenBudget = tokenBudget; const lastMsg = messages?.[messages.length - 1]; - this.logger.info(`[contexto] assemble() called — ${messages?.length} messages, tokenBudget: ${tokenBudget}, contextEnabled: ${this.config.contextEnabled}, hasApiKey: ${!!this.config.apiKey}`); + this.logger.info(`[contexto] assemble() called — ${messages?.length} messages, tokenBudget: ${tokenBudget}, hasApiKey: ${!!this.config.apiKey}`); const lastMsgContent = lastMsg && 'content' in lastMsg ? 
lastMsg.content : undefined; this.logger.debug(`[contexto] last message — role: ${lastMsg?.role}, content type: ${typeof lastMsgContent}, isArray: ${Array.isArray(lastMsgContent)}, sample: ${JSON.stringify(lastMsgContent)?.slice(0, 200)}`); - if (!this.config.apiKey || !this.config.contextEnabled) { - this.logger.info(`[contexto] assemble() skipping — apiKey: ${!!this.config.apiKey}, contextEnabled: ${this.config.contextEnabled}`); + if (!this.config.apiKey) { + this.logger.info(`[contexto] assemble() skipping — no apiKey`); return { messages, estimatedTokens: 0 }; } diff --git a/packages/contexto/src/index.ts b/packages/contexto/src/index.ts index 5fa2e14..d188719 100644 --- a/packages/contexto/src/index.ts +++ b/packages/contexto/src/index.ts @@ -18,7 +18,7 @@ export default { type: 'object', properties: { apiKey: { type: 'string' }, - contextEnabled: { type: 'boolean', default: true }, + maxContextChars: { type: 'number' }, compactThreshold: { type: 'number', default: 0.50 }, compactionStrategy: { type: 'string', default: 'default' }, @@ -32,7 +32,6 @@ export default { const base = { apiKey: api.pluginConfig?.apiKey, - contextEnabled: api.pluginConfig?.contextEnabled ?? true, maxContextChars: api.pluginConfig?.maxContextChars, mode: backendMode as 'remote' | 'local', }; @@ -47,34 +46,41 @@ export default { const logger = api.logger; - logger.debug(`[contexto] pluginConfig: ${JSON.stringify(api.pluginConfig)}`); - if (backendMode === 'local') { - // Resolve provider and apiKey from OpenClaw runtime defaults - const defaults = api.runtime?.agent?.defaults; - logger.info(`[contexto] runtime defaults: ${JSON.stringify(defaults)}`); - const SUPPORTED_PROVIDERS = new Set(['openrouter', 'openai']); - const rawProvider = defaults?.provider; - const provider = SUPPORTED_PROVIDERS.has(rawProvider) ? rawProvider : 'openrouter'; - const apiKey = api.pluginConfig?.apiKey ?? 
defaults?.apiKey; - - if (!apiKey) { - // Try resolving from runtime modelAuth as fallback - logger.warn('[contexto] No apiKey available for local backend — provide apiKey in plugin config or ensure OpenClaw runtime has a configured provider'); + const modelAuth = api.runtime?.modelAuth; + if (!modelAuth?.resolveApiKeyForProvider) { + logger.warn('[contexto] Local mode requires modelAuth — not available'); return; } - // Set apiKey to a truthy value so engine guards (if (!this.config.apiKey)) pass - config.apiKey = config.apiKey || 'local'; - - const backend = new LocalBackend({ - provider, - apiKey, - }, logger); - - const engine = createContextEngine(config, backend, logger); - api.registerContextEngine('contexto', () => engine); - logger.info(`[contexto] Plugin registered with local backend (provider: ${provider}, contextEnabled: ${config.contextEnabled})`); + // Resolve API key via .then() since register() must be synchronous + modelAuth.resolveApiKeyForProvider({ provider: 'openrouter', cfg: api.config }) + .then((openrouterAuth: any) => { + if (openrouterAuth?.apiKey) { + return { provider: 'openrouter' as const, apiKey: openrouterAuth.apiKey }; + } + return modelAuth.resolveApiKeyForProvider({ provider: 'openai', cfg: api.config }) + .then((openaiAuth: any) => { + if (openaiAuth?.apiKey) { + return { provider: 'openai' as const, apiKey: openaiAuth.apiKey }; + } + return null; + }); + }) + .then((result: { provider: 'openrouter' | 'openai'; apiKey: string } | null) => { + if (!result) { + logger.warn('[contexto] Local mode requires an OpenRouter or OpenAI API key configured in OpenClaw'); + return; + } + config.apiKey = 'local'; + const backend = new LocalBackend({ provider: result.provider, apiKey: result.apiKey }, logger); + const engine = createContextEngine(config, backend, logger); + api.registerContextEngine('contexto', () => engine); + logger.info(`[contexto] Plugin registered with local backend (provider: ${result.provider})`); + }) + .catch((err: 
any) => { + logger.warn(`[contexto] Failed to resolve API key: ${err?.message ?? err}`); + }); return; } @@ -85,11 +91,8 @@ export default { } const backend = new RemoteBackend(config, logger); - const engine = createContextEngine(config, backend, logger); - api.registerContextEngine('contexto', () => engine); - - logger.info(`[contexto] Plugin registered (contextEnabled: ${config.contextEnabled})`); + logger.info('[contexto] Plugin registered'); }, }; diff --git a/packages/contexto/src/types.ts b/packages/contexto/src/types.ts index 5343b8e..80986bf 100644 --- a/packages/contexto/src/types.ts +++ b/packages/contexto/src/types.ts @@ -1,6 +1,6 @@ export interface BaseConfig { apiKey: string; - contextEnabled: boolean; + maxContextChars?: number; minScore?: number; filter?: Record; From ce3520744c7a651a5fc7f383d190adbb9ade72d7 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 20:28:24 +0700 Subject: [PATCH 8/9] Remove debug logs --- packages/contexto/src/engine/base.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/contexto/src/engine/base.ts b/packages/contexto/src/engine/base.ts index 6773f08..47cfeb0 100644 --- a/packages/contexto/src/engine/base.ts +++ b/packages/contexto/src/engine/base.ts @@ -65,12 +65,12 @@ export abstract class AbstractContextEngine implements ContextEngine { if (tokenBudget != null) this.state.cachedTokenBudget = tokenBudget; const lastMsg = messages?.[messages.length - 1]; - this.logger.info(`[contexto] assemble() called — ${messages?.length} messages, tokenBudget: ${tokenBudget}, hasApiKey: ${!!this.config.apiKey}`); + this.logger.info(`[contexto] assemble() called — ${messages?.length} messages, tokenBudget: ${tokenBudget}`); const lastMsgContent = lastMsg && 'content' in lastMsg ? 
lastMsg.content : undefined; this.logger.debug(`[contexto] last message — role: ${lastMsg?.role}, content type: ${typeof lastMsgContent}, isArray: ${Array.isArray(lastMsgContent)}, sample: ${JSON.stringify(lastMsgContent)?.slice(0, 200)}`); if (!this.config.apiKey) { - this.logger.info(`[contexto] assemble() skipping — no apiKey`); + this.logger.info(`[contexto] assemble() skipping — not configured`); return { messages, estimatedTokens: 0 }; } From d0316fa28be162d76f9bf6e7766c7328b9427fb5 Mon Sep 17 00:00:00 2001 From: DaevMithran Date: Wed, 8 Apr 2026 21:53:25 +0700 Subject: [PATCH 9/9] docs: Remove unused doc --- docs/why-agnes.md | 397 ---------------------------------------------- 1 file changed, 397 deletions(-) delete mode 100644 docs/why-agnes.md diff --git a/docs/why-agnes.md b/docs/why-agnes.md deleted file mode 100644 index 22f324f..0000000 --- a/docs/why-agnes.md +++ /dev/null @@ -1,397 +0,0 @@ -# Why AGNES: Hierarchical Clustering for Agent Context - -## The Problem - -Every AI memory system faces the same fundamental question: **how do you organize memories so retrieval is fast, relevant, and context-aware?** - -Most systems default to flat vector search — embed everything, store it in a vector DB, and retrieve the top-k nearest neighbors. This works for simple lookups, but breaks down for agent context management where you need: - -- **Multi-resolution retrieval** — sometimes you need a broad topic summary, sometimes a specific detail -- **Token-budget-aware retrieval** — you can't just return top-k; you need to fill a fixed context window efficiently -- **Semantic organization** — related memories should be grouped, not scattered across an unstructured index -- **No pre-defined categories** — agent conversations span unpredictable topics; you can't hardcode a taxonomy - -## What Others Do - -We surveyed the major AI memory systems to understand the landscape: - -### Mem0 - -Uses **pluggable vector stores** (Qdrant, pgvector, Pinecone, etc.) 
with HNSW or DiskANN indexing. Memories are stored as **flat atomic facts** with an optional Neo4j/Kuzu knowledge graph layer. Consolidation is LLM-driven — on each insert, an LLM decides whether to ADD, UPDATE, DELETE, or NOOP against existing memories. - -**Structure**: Flat facts + optional entity graph. No clustering. - -### Supermemory - -Built on **pgvector** with hybrid search (semantic + fact retrieval). Memories are flat atomic facts scoped by projects. Uses Anthropic's Contextual Retrieval technique for chunking. Supports contradiction resolution and stale information expiration. - -**Structure**: Flat facts + document chunks. No clustering. - -### Zep (Graphiti) - -The most sophisticated graph-based approach. Uses a **temporal knowledge graph** backed by Neo4j with triple parallel retrieval: cosine similarity, BM25 full-text search, and breadth-first graph traversal. Entities are clustered into communities via **dynamic label propagation** — the closest thing to clustering in this space. - -**Structure**: Temporal knowledge graph with flat community clustering. Not hierarchical. - -### LangChain / LangGraph Memory - -Stores memories as **JSON documents** in developer-defined namespace hierarchies (e.g., `("user", "123", "memories")`). Backed by pgvector (HNSW/IVFFlat) or SQLite with sqlite-vec. Consolidation uses a memory enrichment process that balances creation vs. update. - -**Structure**: Flat key-value store with static namespace organization. No dynamic clustering. - -### Letta (formerly MemGPT) - -Models memory after **OS memory management** with three tiers: core memory (in-context, agent-editable), recall memory (conversation log), and archival memory (vector DB — ChromaDB or pgvector). The agent self-manages what goes where via tool calls. Context window overflow triggers conversation summarization. - -**Structure**: Fixed 3-tier architecture. No clustering within tiers. 
- -### Summary - -| System | Index | Memory Structure | Clustering | Consolidation | -|--------|-------|-----------------|------------|---------------| -| **Mem0** | HNSW / DiskANN | Flat facts + knowledge graph | None | LLM-driven ADD/UPDATE/DELETE | -| **Supermemory** | pgvector | Flat facts + doc chunks | None | Contradiction resolution + expiry | -| **Zep** | Neo4j vectors + BM25 + BFS | Temporal knowledge graph | Label propagation (flat) | Temporal invalidation + LLM dedup | -| **LangGraph** | HNSW / IVFFlat | JSON docs in static namespaces | None | Memory enrichment + TTL | -| **Letta** | ChromaDB (HNSW) / pgvector | 3-tier: core / recall / archival | None | Summarization + agent self-editing | -| **Contexto** | **AGNES dendrogram** | **Dynamic semantic hierarchy** | **Hierarchical (agglomerative)** | Algorithmic + incremental centroid | - -**Key finding**: No major AI memory system uses hierarchical clustering. All rely on flat similarity search, with Zep being the only one adding flat community detection via label propagation. Memory consolidation across the board is LLM-driven rather than algorithmic. - ---- - -## Why Hierarchical Clustering? - -### Over flat ANN (HNSW, IVFFlat) - -Approximate Nearest Neighbor indices like HNSW are excellent for raw retrieval speed. They're what most vector databases use internally. 
But they solve a different problem: - -| | Flat ANN (HNSW) | Hierarchical Clustering | -|---|---|---| -| **Query type** | "Find the k most similar items" | "Find all items in the most relevant semantic branch" | -| **Structure** | Flat index, no organization | Semantic tree with labeled clusters | -| **Token budgeting** | Return top-k, hope it fits | Beam search fills budget by exploring branches | -| **Multi-resolution** | No — always item-level | Yes — can retrieve at cluster or item level | -| **Interpretability** | Opaque index | Labeled tree (travel → Japan trip → visa documents) | -| **Pruning** | Must score candidates | Prune entire branches by centroid similarity | - -HNSW answers "what's similar?" — hierarchical clustering answers "what's this about, and what details matter?" - -For agent context, the second question is more useful. You don't just want the 7 most similar messages; you want the most relevant *topic branches* packed into your token budget. - -### Over K-Means - -K-means is the default clustering algorithm most engineers reach for. If you're going to cluster, why not the simpler algorithm? Because K-Means produces flat partitions; hierarchical clustering produces a dendrogram — a tree you can cut at any level. - -#### 1. You must specify k upfront - -K-means requires choosing the number of clusters before running. Agent conversations are open-ended — a session might touch 3 topics or 30. Choosing k=5 when there are 15 natural topics merges unrelated episodes; choosing k=15 when there are 3 fragments coherent topics into noise. - -Hierarchical clustering discovers the natural cluster count by cutting the dendrogram at a similarity threshold. No k needed. - -#### 2. Spherical cluster assumption - -K-means assumes clusters are roughly spherical and equally sized in embedding space. Conversational topics are neither — a 50-message thread and a 3-message question are both valid clusters but wildly different in density. 
- -Hierarchical clustering with average linkage handles variable-density clusters naturally because it merges based on average pairwise distance, not distance to a centroid. - -#### 3. No hierarchy - -K-means produces a flat partition. You get k buckets with no relationship between them. This means: - -- No multi-resolution retrieval (can't zoom in/out) -- No branch pruning during search (must check all k centroids) -- No interpretable structure (which clusters are subtopics of which?) - -Hierarchical clustering produces a dendrogram — a full hierarchy from individual items up to a single root. We cut it at our similarity threshold to get the right granularity, but the hierarchy is preserved for navigation. - -#### 4. Instability - -K-means results depend on random initialization. Run it twice on the same data and you may get different clusters. This is unacceptable for a context system where users expect consistent retrieval behavior. - -Hierarchical clustering is deterministic — same data, same tree, every time. - -#### 5. No incremental updates - -Standard k-means requires re-running on the full dataset when new items arrive. Mini-batch k-means exists but still shifts centroids unpredictably. - -Our implementation supports **greedy incremental insertion** — new items walk the existing tree and slot into the best-matching branch in O(log N) time, with O(d) centroid updates. - ---- - -## Why AGNES Specifically? - -Among hierarchical clustering algorithms, we chose [AGNES (Agglomerative Nesting)](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470316801) (Kaufman & Rousseeuw, 1990) with [average linkage (UPGMA)](https://www.semanticscholar.org/paper/A-statistical-method-for-evaluating-systematic-Sokal-Michener/0db093335bc3b9445fa5a1a5526d634921d7b59a) (Sokal & Michener, 1958) for specific reasons: - -### Agglomerative vs. Divisive - -- **AGNES (bottom-up)**: Start with individual items, merge the closest pairs upward. 
Produces fine-grained leaf clusters that accurately reflect local similarity. -- **DIANA (top-down)**: Start with everything in one cluster, split recursively. Better for finding large-scale structure but can make poor early splits that propagate. - -For agent context, **local accuracy matters more than global structure**. A user asking about "OAuth token refresh" needs the system to find the tight cluster of auth-related messages, not a broad "security" supercluster. AGNES gets this right because it builds from the leaves up. - -### Why Average Linkage (UPGMA)? - -The linkage method determines how inter-cluster distance is measured during merges: - -| Linkage | Distance Between Clusters | Behavior | -|---------|--------------------------|----------| -| **Single** | Minimum pairwise distance | Chaining — long, stringy clusters. One tangentially related message pulls two unrelated topics together | -| **Complete** | Maximum pairwise distance | Compact but tends to split natural clusters. A single outlier message forces a premature split | -| **Average (UPGMA)** | Mean of all pairwise distances | Balanced — cohesive clusters that tolerate some variance without chaining | -| **Ward's** | Minimizes total within-cluster variance | Prefers equal-sized clusters — not ideal when one topic has 50 messages and another has 3 | - -Average linkage is the right trade-off for conversational data where topic sizes vary naturally and you want cohesive-but-not-rigid groupings. 
- ---- - -## How It Works in Contexto - -### Building the Tree - -``` -Conversation items (with embeddings) - ↓ -Pairwise cosine distance matrix - ↓ -AGNES with average linkage (via ml-hclust) - ↓ -Dendrogram - ↓ -Cut at similarity threshold (0.65) - ↓ -ClusterNode tree (max depth 4) -``` - -Each `ClusterNode` stores: -- **Centroid** — average embedding of all items in the subtree -- **Label** — auto-generated from most representative keywords -- **Items** — leaf-level conversation items -- **Children** — sub-clusters - -The result is a navigable semantic hierarchy. Each leaf is an **episode** — a full conversation turn containing the user message, assistant response, and any tool outputs bundled together. Consider an AI assistant used across weeks of varied conversations — the tree self-organizes from these episodes, regardless of domain: - -``` -root (Knowledge) -├── [travel planning & logistics] ← 19 episodes, depth 1 -│ ├── [Japan trip itinerary] ← 8 episodes, depth 2 -│ │ ├── Episode: User asked about 3-week Japan route -│ │ │ → Assistant planned Tokyo → Kyoto → Osaka → Hiroshima -│ │ │ with transit times and suggested stays per city -│ │ ├── Episode: User compared JR Pass options -│ │ │ → Assistant broke down 21-day JR Pass vs individual -│ │ │ shinkansen tickets with cost analysis -│ │ ├── Episode: User asked for ryokan recommendations -│ │ │ → Assistant suggested 3 ryokans near Arashiyama -│ │ │ with prices, booking links, and onsen details -│ │ └── Episode: User asked about vegetarian dining in Shibuya -│ │ → Assistant listed 5 restaurants with menus, price -│ │ ranges, and reservation requirements -│ ├── [visa & travel documents] ← 6 episodes, depth 2 -│ │ ├── Episode: User asked about passport renewal timeline -│ │ │ → Assistant calculated 6-week processing, deadline March 10 -│ │ ├── Episode: User checked Japan visa requirements -│ │ │ → Assistant confirmed visa waiver for US citizens, -│ │ │ 90-day max, entry requirements and customs forms -│ │ └── 
Episode: User compared travel insurance providers -│ │ → Assistant compared World Nomads vs SafetyWing -│ │ coverage, deductibles, and adventure sports policy -│ └── [packing & budget] ← 5 episodes, depth 2 -│ ├── Episode: User asked for full trip budget estimate -│ │ → Assistant built spreadsheet: flights $1200, rail $450, -│ │ accommodation $1800, food $500, activities $250 = $4200 -│ └── Episode: User asked about connectivity options -│ → Assistant compared portable WiFi rental vs eSIM -│ providers with coverage maps and daily costs -│ -├── [health & fitness] ← 15 episodes, depth 1 -│ ├── [running training plan] ← 7 episodes, depth 2 -│ │ ├── Episode: User set goal for half marathon in 10 weeks -│ │ │ → Assistant built progressive plan from 15 mi/week base -│ │ │ with tempo, interval, and long run schedule -│ │ ├── Episode: User asked about tempo run pacing -│ │ │ → Assistant calculated 8:30/mile threshold pace from -│ │ │ recent 5K time, explained RPE and heart rate zones -│ │ ├── Episode: User asked about race-day nutrition -│ │ │ → Assistant recommended gel every 45 min after mile 6, -│ │ │ electrolyte strategy, and pre-race meal timing -│ │ └── Episode: User reported hip flexor tightness -│ │ → Assistant suggested 4 stretches, foam rolling routine, -│ │ and when to reduce training load vs push through -│ ├── [meal planning & nutrition] ← 5 episodes, depth 2 -│ │ ├── Episode: User asked for high-protein vegetarian meals -│ │ │ → Assistant created 5-day meal plan with macros: -│ │ │ lentil soup, quinoa bowls, tofu stir-fry, tempeh tacos -│ │ └── Episode: User concerned about iron on plant-based diet -│ │ → Assistant explained iron + B12 supplementation, -│ │ food pairing for absorption, and blood test schedule -│ └── [sleep & recovery] ← 3 episodes, depth 2 -│ ├── Episode: User reported sleep dropping to 6hrs -│ │ → Assistant suggested magnesium glycinate dosage, -│ │ sleep hygiene adjustments, and training load reduction -│ └── Episode: User asked about 
recovery on rest days -│ → Assistant outlined active recovery protocol: -│ light walking, stretching, hydration targets -│ -├── [career & job search] ← 14 episodes, depth 1 -│ ├── [resume & applications] ← 6 episodes, depth 2 -│ │ ├── Episode: User asked to rewrite resume for PM roles -│ │ │ → Assistant rewrote 4 bullet points with quantified impact: -│ │ │ "grew DAU 40% in 6 months" instead of "managed growth" -│ │ └── Episode: User drafted cover letter for Stripe -│ │ → Assistant tailored letter emphasizing payments experience, -│ │ API design background, and growth metrics -│ ├── [interview preparation] ← 5 episodes, depth 2 -│ │ ├── Episode: User practiced behavioral questions -│ │ │ → Assistant walked through STAR format with 3 example -│ │ │ answers tailored to user's payments experience -│ │ └── Episode: User asked for system design practice -│ │ → Assistant ran mock interview: "design a notification -│ │ service at scale" with follow-up questions and feedback -│ └── [salary & negotiation] ← 3 episodes, depth 2 -│ ├── Episode: User asked about SF PM compensation -│ │ → Assistant provided market data ($180-220k base + equity), -│ │ negotiation scripts, and counter-offer strategy -│ └── Episode: User received initial offer -│ → Assistant analyzed offer breakdown, suggested countering -│ at 15% above base with equity acceleration ask -│ -├── [home improvement & DIY] ← 11 episodes, depth 1 -│ ├── [kitchen renovation] ← 6 episodes, depth 2 -│ │ ├── Episode: User compared countertop materials -│ │ │ → Assistant analyzed quartz vs granite on cost, porosity, -│ │ │ heat resistance, and long-term maintenance -│ │ ├── Episode: User shared contractor quote of $28k -│ │ │ → Assistant benchmarked against 120 sqft averages, -│ │ │ flagged missing line items, suggested counter questions -│ │ └── Episode: User asked about backsplash tile layout -│ │ → Assistant explained herringbone pattern needs 15% more -│ │ tiles than straight lay, provided calculation and diagram -│ ├── 
[smart home setup] ← 3 episodes, depth 2 -│ │ ├── Episode: User compared smart switch protocols -│ │ │ → Assistant compared Zigbee vs Z-Wave vs WiFi on range, -│ │ │ power draw, mesh reliability, and Home Assistant support -│ │ └── Episode: User asked about Home Assistant hardware -│ │ → Assistant compared Raspberry Pi vs mini PC on cost, -│ │ reliability, addon support, and power consumption -│ └── [garden & outdoor] ← 2 episodes, depth 2 -│ └── Episode: User planned raised bed vegetable garden -│ → Assistant recommended soil mix 1:1:1 topsoil:compost:perlite, -│ bed dimensions, and seasonal planting schedule -│ -├── [personal finance] ← 10 episodes, depth 1 -│ ├── [investment strategy] ← 5 episodes, depth 2 -│ │ ├── Episode: User asked about portfolio allocation -│ │ │ → Assistant explained three-fund approach (US total market, -│ │ │ international, bonds) with age-based allocation ratios -│ │ └── Episode: User asked about tax optimization -│ │ → Assistant explained Roth IRA $7000 limit, tax-loss -│ │ harvesting strategy, and wash sale rule timing -│ └── [budgeting & expenses] ← 5 episodes, depth 2 -│ ├── Episode: User asked for monthly budget breakdown -│ │ → Assistant categorized: rent $2400, food $600, transit $120, -│ │ subscriptions $85, discretionary $400 — identified savings gaps -│ └── Episode: User asked for emergency fund target -│ → Assistant calculated 6 months of expenses ($21k), -│ suggested high-yield savings accounts, and built timeline -│ -└── [learning & side projects] ← 8 episodes, depth 1 - ├── [Rust programming] ← 5 episodes, depth 2 - │ ├── Episode: User learning ownership and borrowing - │ │ → Assistant explained with examples, reframed compiler - │ │ errors as learning tool, built mental model from C++ analogues - │ ├── Episode: User building CLI for markdown-to-epub - │ │ → Assistant helped with clap argument parsing, file I/O - │ │ with std::fs, and epub chapter structure - │ └── Episode: User asked about async in Rust - │ → Assistant 
compared Tokio runtime vs std::thread for - │ file I/O workload, recommended std::thread for this case - └── [photography] ← 3 episodes, depth 2 - ├── Episode: User asked about street photography settings - │ → Assistant explained aperture priority mode, ISO auto - │ range, and zone focusing technique for candid shots - └── Episode: User wanted to create a film look preset - → Assistant walked through Lightroom adjustments: lift - blacks, warm highlights, fade curve, grain amount -``` - -Notice how the tree reflects the *natural topology of conversations* — not a pre-defined taxonomy. The agent never asked "what categories do you want?" — it discovered that Japan trip planning, visa logistics, and budgeting are related because the episode embeddings cluster in semantic space. A two-episode tangent about garden soil doesn't get its own top-level branch; it nests under home improvement where it belongs. The running training plan and meal planning land together under health, even though those conversations happened weeks apart. - -The structure also reveals cross-domain connections that flat retrieval would miss. When the user asks "what should I eat the week before my race while staying in budget?", beam search descends into `health → meal planning`, `health → running training`, and `personal finance → budgeting` simultaneously — surfacing full episodes from three branches in a single retrieval pass. The agent gets back not just isolated facts, but the complete conversational context: the meal plan with macros, the race nutrition strategy, and the monthly food budget. - -### Hybrid Rebuild Strategy - -Full AGNES is O(n^2) — fine for small trees, expensive at scale. 
We use a hybrid approach: - -| Condition | Strategy | Why | -|-----------|----------|-----| -| Total items < 100 | Full rebuild | Cheap enough, optimal structure | -| Inserts since rebuild >= 50 | Full rebuild | Accumulated drift from incremental inserts | -| Otherwise | Incremental insertion | O(log N) per item, keeps latency low | - -Incremental insertion walks the tree top-down, following the highest-similarity child at each level. If no child exceeds the threshold, a new sibling cluster is created. Centroids update in O(d): - -``` -newCentroid[i] = (oldCentroid[i] * oldCount + newVector[i]) / (oldCount + 1) -``` - -### Retrieval: Multi-Branch Beam Search - -The hierarchy enables [**beam search**](https://www.semanticscholar.org/paper/The-HARPY-speech-recognition-system-Lowerre/bdb3f20fe41bb95f6bc9d162e827de8db3f952d7) (Lowerre, 1976) — exploring multiple promising branches simultaneously: - -``` -Level 0: root -Level 1: [auth: 0.82] [deploy: 0.71] [testing: 0.58] ← keep top 3 -Level 2: [oauth: 0.85] [k8s: 0.68] [ci-cd: 0.61] ← expand & re-rank -Level 3: ... collect terminal items -``` - -This is fundamentally different from flat top-k retrieval: -- **Branch pruning** — skip irrelevant subtrees entirely -- **Token budgeting** — fill the context window by expanding branches until budget is exhausted -- **Path tracing** — know *why* an item was retrieved (e.g., `auth → oauth → token refresh`) - ---- - -## The Case for Hierarchical Clustering in Agent Context - -Agent context management has unique requirements that align with hierarchical clustering: - -**1. Conversations are inherently hierarchical.** A debugging session has subtopics (error diagnosis, fix attempts, verification). A planning session has phases (requirements, design, implementation). Flat storage discards this structure; a tree preserves it. - -**2. 
Token budgets demand intelligent packing.** You can't just return top-k similar items — you need to fill a fixed token window with the most relevant *coverage*. Beam search over a hierarchy naturally provides this by expanding the most promising branches until the budget is full. - -**3. Topics emerge dynamically.** Unlike document retrieval where categories are known upfront, agent conversations create new topics in real-time. AGNES discovers these topics algorithmically from the embedding space — no pre-defined taxonomy needed. - -**4. Deduplication is structural.** Items within the same cluster are semantically related by definition. This makes deduplication and consolidation natural — you don't need an LLM to decide if two memories overlap; the tree already groups them together. - -**5. Retrieval should be explainable.** When an agent injects recalled context, it helps to know the retrieval path (`deploy → k8s → helm config`). Hierarchical clustering provides this for free; flat vector search cannot. - ---- - -## Trade-offs and Limitations - -We're not claiming AGNES is universally superior. The trade-offs are real: - -| Aspect | AGNES | Flat Vector Search | -|--------|-------|--------------------| -| **Build cost** | O(n^2) full rebuild (periodic) | O(n log n) index build | -| **Between rebuilds** | O(log N) incremental insert | O(log n) insert | -| **Query latency** | O(beam * depth) | O(log n) with HNSW | -| **Memory overhead** | Tree + centroids + items | Index + items | -| **Scale** | Thousands to tens of thousands (incremental beyond) | Millions of items | -| **Best for** | Structured, budget-aware context retrieval | Raw similarity search at scale | - -Full AGNES rebuilds are O(n^2) — at 10k items that's ~100M distance computations (seconds), but at 100k items the distance matrix alone would need ~80GB of memory. In practice, full rebuilds are for smaller trees (thousands of items). 
Beyond that, the system relies on incremental insertion (O(log N) per item), which accumulates some structural drift but keeps latency low. The rebuild interval is configurable to balance structure quality vs. cost. - -AGNES is the right tool for **agent context windows** — token-budget-constrained, structure-aware retrieval at conversational scale. It is not trying to replace HNSW for million-scale document retrieval. - ---- - -## References - -- Kaufman, L. & Rousseeuw, P.J. (1990). [*Finding Groups in Data: An Introduction to Cluster Analysis*](https://onlinelibrary.wiley.com/doi/book/10.1002/9780470316801), Chapter 5 (AGNES). Wiley Series in Probability and Statistics. -- Sokal, R.R. & Michener, C.D. (1958). [A statistical method for evaluating systematic relationships](https://www.semanticscholar.org/paper/A-statistical-method-for-evaluating-systematic-Sokal-Michener/0db093335bc3b9445fa5a1a5526d634921d7b59a). *University of Kansas Science Bulletin*, 38, 1409–1438. (UPGMA / average linkage) -- Lowerre, B.T. (1976). [*The HARPY Speech Recognition System*](https://www.semanticscholar.org/paper/The-HARPY-speech-recognition-system-Lowerre/bdb3f20fe41bb95f6bc9d162e827de8db3f952d7). PhD thesis, Carnegie Mellon University. (Origin of beam search) -- Salton, G., Wong, A. & Yang, C.S. (1975). [A vector space model for automatic indexing](https://dl.acm.org/doi/10.1145/361219.361220). *Communications of the ACM*, 18(11), 613–620. (Cosine similarity) -- [`ml-hclust`](https://github.com/mljs/hclust) — JavaScript hierarchical clustering library (MIT)